In [3]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and print the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [4]:
# Read the competition's train and test CSVs, in that order, into two frames.
data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/tabular-playground-series-aug-2021/train.csv',
        '../input/tabular-playground-series-aug-2021/test.csv',
    )
)

In [5]:
# Per-column memory footprint of the training frame, converted from bytes to MiB.
# deep=True asks pandas to measure the actual contents of object columns too.
bytes_per_mib = 1024 ** 2
memory_usage = data.memory_usage(deep=True).div(bytes_per_mib)
memory_usage.head(7)

In [6]:
# Total footprint of the training frame in MiB: the sum of the per-entry figures
# computed in the previous cell.
memory_usage.sum()

In [7]:
def reduce_memory_usage(df, verbose=True, allow_float16=True):
    """Downcast numeric columns of ``df`` to the narrowest dtype that fits their range.

    The frame is modified **in place** (each processed column is reassigned on
    ``df``) and the same object is returned, so the caller's original frame and
    the return value are one and the same.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. Only columns whose dtype is one of int8/16/32/64 or
        float16/32/64 are considered; every other column is left untouched.
    verbose : bool, default True
        When true, print the final memory usage and the percentage saved.
    allow_float16 : bool, default True
        When true (the historical behaviour), float columns whose range fits in
        float16 are stored as float16. float16 keeps only about three
        significant decimal digits, so pass ``False`` to make float32 the
        narrowest float dtype when precision matters.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with downcast columns.
    """
    numerics = ["int8", "int16", "int32", "int64", "float16", "float32", "float64"]
    # Candidate dtypes, narrowest first.
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32) if allow_float16 else (np.float32,)

    start_mem = df.memory_usage().sum() / 1024 ** 2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if str(col_type)[:3] == "int":
            # Bounds are inclusive: a column whose extreme value sits exactly on
            # a dtype limit (e.g. 127 for int8) still fits that dtype.
            target = None
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    target = candidate
                    break
        else:
            # Anything that does not fit a narrower float (including NaN/inf
            # extremes, for which the comparisons are False) stays float64.
            target = np.float64
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    target = candidate
                    break

        # Skip the reassignment when the column already has the chosen dtype.
        if target is not None and col_type != target:
            df[col] = df[col].astype(target)

    end_mem = df.memory_usage().sum() / 1024 ** 2
    if verbose:
        # Guard the ratio so a zero-byte frame reports 0% instead of nan.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print(
            "Mem. usage decreased to {:.2f} Mb ({:.1f}% reduction)".format(
                end_mem, reduction
            )
        )
    return df

# Downcast the training frame. The helper reassigns columns on `data` itself and
# returns that same object, so `reduced_df` and `data` refer to one frame.
reduced_df = reduce_memory_usage(df=data, verbose=True)

In [8]:
# Summary statistics for the numeric columns of the downcast frame.
# NOTE(review): columns stored as float16 may lose precision or overflow in
# aggregate statistics such as the mean -- confirm the displayed values look sane.
reduced_df.describe()

In [9]:
# Work on a 20% subsample of the downcast frame for the plots and model tuning below.
# The fixed seed makes the subsample, and every result derived from it,
# reproducible across kernel restarts.
sample_df = reduced_df.sample(n=int(len(reduced_df) * 0.2), random_state=42)

# Drop `id` (presumably a row identifier rather than a feature -- verify against the data).
sample_df = sample_df.drop(columns=['id'])

# Last expression of the cell so the resulting shape is actually displayed.
sample_df.shape

In [10]:
# Check that the subsample keeps the distribution of feature `f6`.
# The sample is five times smaller than the full frame, so raw counts would differ
# in height by construction; normalising both histograms to densities puts them on
# a common scale and makes the shapes directly comparable.

fig, ax = plt.subplots(figsize=(6, 4))

sns.histplot(
    data=reduced_df, x="f6", label="Original data", color="red", alpha=0.3, bins=15,
    stat="density", ax=ax,
)
sns.histplot(
    data=sample_df, x="f6", label="Sample data", color="green", alpha=0.3, bins=15,
    stat="density", ax=ax,
)

ax.set_title("f6: original vs. sample")
ax.legend()
plt.show()

In [11]:
# Correlation heatmap over the first 20 columns of the downcast frame.
f, ax = plt.subplots(figsize=(8, 6))
corr = reduced_df.iloc[:, :20].corr()
sns.heatmap(
    corr,
    # The `np.bool` alias was removed from NumPy; the builtin `bool` is the
    # supported spelling. The mask is all False, so every cell is drawn.
    mask=np.zeros_like(corr, dtype=bool),
    cmap=sns.diverging_palette(220, 10, as_cmap=True),
    square=True,
    ax=ax,
)
plt.show()

In [12]:
# KDE grid: one small density plot for each of (up to) the first 100 columns of the
# sample, laid out on a 20 x 5 grid.

# Set the style once, before any axes exist, so every panel (including the first) uses it.
sns.set_style("white")
# Build the column list once instead of re-slicing it several times per iteration.
kde_columns = sample_df.columns.tolist()[:100]

fig = plt.figure(figsize = (15, 50))
for i, column in enumerate(kde_columns):
    plt.subplot(20,5,i+1)
    plt.title(column, size = 12, fontname = 'monospace')
    # `fill` is the current name of the deprecated `shade` argument of `kdeplot`.
    a = sns.kdeplot(sample_df[column], color = '#1a5d57', fill = True, alpha = 0.9, linewidth = 1.5, edgecolor = 'black')
    plt.ylabel('')
    plt.xlabel('')
    plt.xticks(fontname = 'monospace')
    plt.yticks([])
    # Keep only the bottom spine visible, slightly thickened.
    for j in ['right', 'left', 'top']:
        a.spines[j].set_visible(False)
    a.spines['bottom'].set_linewidth(1.2)

fig.tight_layout(h_pad = 3)

plt.show()

In [13]:
# import statsmodels.api as sm
# from statsmodels.formula.api import ols

# all_columns = "+".join(sample_df.columns[:-1])
# my_formula = "loss~" + all_columns

# mod = ols(formula=my_formula,
#                 data=sample_df, family=sm.families.Gaussian()).fit()
                
# aov_table = sm.stats.anova_lm(mod, typ=2)
# print(aov_table)

In [14]:
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold

# Separate the `loss` target from the predictors, then hold out 33% of the rows
# for evaluation with a fixed seed.
y = sample_df["loss"]
x = sample_df.drop(columns=["loss"])

x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.33,
    random_state=42,
)

In [15]:
from sklearn.preprocessing import MinMaxScaler

# Fit the min-max scaler on the training features only, then rescale those same features.
scaler = MinMaxScaler().fit(x_train)
x_scaled = scaler.transform(x_train)

In [22]:
from sklearn.metrics import mean_squared_error
import xgboost as xgb 


# Best params for xgboost, found with the wider grid search below (kept for reference):
# params = {
#                        "learning_rate":[0.003, 0.008],
#                        "subsample":[0.84],
#                        'booster': ['gbtree'],
#                        'tree_method': ['gpu_hist'],
#                        'colsample_bytree':[0.70],
#     'max_depth': [7],
#     'n_estimators': [500, 1000, 2500],
# }


# {'booster': 'gbtree', 'colsample_bytree': 0.7, 'learning_rate': 0.003, 
# 'max_depth': 7, 'n_estimators': 2500, 'subsample': 0.84, 'tree_method': 'gpu_hist'} ------ -7.915772914886475



# Search space for the XGBoost regressor: only the learning rate has more than
# one candidate value; every other entry is pinned to a single value.
params = dict(
    learning_rate=[0.003, 0.008],
    subsample=[0.84],
    booster=['gbtree'],
    tree_method=['gpu_hist'],
    colsample_bytree=[0.70],
    max_depth=[7],
    n_estimators=[2500],
)

# 5-fold cross-validated grid search scored by negated RMSE (closer to zero is better).
xgb_estimator = xgb.XGBRegressor(random_state=42)
grid = GridSearchCV(
    estimator=xgb_estimator,
    param_grid=params,
    scoring='neg_root_mean_squared_error',
    cv=5,
    verbose=100,
)
# The search is run on the min-max-scaled training features.
xgb_model = grid.fit(x_scaled, y_train)

print(xgb_model.best_params_, xgb_model.best_score_)


In [None]:
# Refit XGBoost with a fixed parameter set and score it on the hold-out split.
# Training and prediction both use the unscaled features, so they are consistent.
xgb_estimator = xgb.XGBRegressor(
    random_state=42,
    booster='gbtree',
    colsample_bytree=0.7,
    learning_rate=0.003,
    max_depth=7,
    n_estimators=2500,
    subsample=0.84,
    tree_method='gpu_hist',
)
xgb_estimator.fit(x_train, y_train)
oof_pred1 = xgb_estimator.predict(x_test)
# Clip to the target range seen in training only; `y` also contains the hold-out
# targets, and using it would leak evaluation information into the predictions.
oof_pred1 = np.clip(oof_pred1, y_train.min(), y_train.max())

# The value printed is a root-mean-squared error, so it is labelled as such.
print(f'RMSE: {np.sqrt(mean_squared_error(y_test, oof_pred1))}')

In [23]:
# Best params for LightGBM, found with the wider grid search below (kept for reference):
# params = {
#     'num_leaves': [28, 31, 50, 75],
#     'learning_rate': [0.003],
#     'max_depth': [-1, 3, 5],
#     'n_estimators': [500, 1000],
# }

# 'learning_rate': 0.003, 'max_depth': -1, 'n_estimators': 1000, 'num_leaves': 50 ----- -7.934798408422064



# Search space for the LightGBM regressor; every entry is pinned to a single
# value, so the "search" cross-validates exactly one configuration.
params = dict(
    num_leaves=[50],
    learning_rate=[0.003],
    max_depth=[-1],
    n_estimators=[2500],
)

# 5-fold cross-validation scored by negated RMSE (closer to zero is better).
lgb_estimator = lgb.LGBMRegressor(random_state=42)
grid = GridSearchCV(
    estimator=lgb_estimator,
    param_grid=params,
    scoring='neg_root_mean_squared_error',
    cv=5,
    verbose=100,
)
# The search is run on the min-max-scaled training features.
lgb_model = grid.fit(x_scaled, y_train)

print(lgb_model.best_params_, lgb_model.best_score_)


In [24]:
# Score the tuned LightGBM model on the hold-out split.
# The grid search was fitted on min-max-scaled features (`x_scaled`), so the
# hold-out features must pass through the same fitted scaler before predicting;
# feeding raw `x_test` would apply the learned splits to a different value range.
x_test_scaled = scaler.transform(x_test)
oof_pred1 = lgb_model.predict(x_test_scaled)
# Clip to the target range seen in training only; `y` also contains the hold-out targets.
oof_pred1 = np.clip(oof_pred1, y_train.min(), y_train.max())

# The value printed is a root-mean-squared error, so it is labelled as such.
print(f'RMSE: {np.sqrt(mean_squared_error(y_test, oof_pred1))}')