# Kaggle Feb 2021 Tabular Playground

In [27]:
import math
import os

import altair as alt
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import model_selection
from sklearn import preprocessing
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

In [2]:
# Let pandas render up to 1000 rows and 1000 columns before truncating displayed frames.
pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_columns', 1000)

In [3]:
def split_vals(a, n):
    """Cut `a` at position `n` and return independent copies of both parts.

    Returns a tuple `(a[:n], a[n:])`; each part is produced with `.copy()`,
    so modifying a part does not touch `a`.
    """
    head = a[:n].copy()
    tail = a[n:].copy()
    return head, tail

In [4]:
def rmse(x, y):
    """Return the root-mean-squared difference between `x` and `y`.

    The element-wise difference is squared, averaged with `.mean()`, and the
    square root of that average is returned as a Python float.
    """
    squared_error = (x - y) ** 2
    return math.sqrt(squared_error.mean())

In [5]:
def print_score(m):
    """Print RMSE and `m.score(...)` for the training and validation sets.

    `m` must provide `predict` and `score`. The data comes from the
    notebook-level names `X_train`, `y_train`, `X_valid` and `y_valid`,
    which have to exist before this function is called.
    """
    # RMSE of the model's predictions on each split.
    train_preds = m.predict(X_train)
    rmse_train = rmse(train_preds, y_train)
    valid_preds = m.predict(X_valid)
    rmse_valid = rmse(valid_preds, y_valid)

    # Whatever the model's own `score` method reports on each split.
    rsquared_train = m.score(X_train, y_train)
    rsquared_valid = m.score(X_valid, y_valid)

    print(f'rmse_train: {rmse_train}, rmse_valid: {rmse_valid}, rsquared_train: {rsquared_train}, rsquared_valid: {rsquared_valid}')

In [6]:
# Directory that holds the competition files (train.csv, test.csv).
# Set the KAGGLE_FEB21_INPUT_DIR environment variable to run the notebook on another
# machine; when it is unset or empty, the original location is used.
path = os.environ.get('KAGGLE_FEB21_INPUT_DIR') or '/home/cho/kaggle_feb21_tabular/input'

In [7]:
# Load train.csv from the input directory; later cells copy from df_raw rather than modifying it.
df_raw = pd.read_csv(f'{path}/train.csv')

In [8]:
# Load test.csv from the same input directory.
df_test = pd.read_csv(f'{path}/test.csv')

In [9]:
# Summarise the training frame: column names, non-null counts and dtypes.
df_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300000 entries, 0 to 299999
Data columns (total 26 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   id      300000 non-null  int64  
 1   cat0    300000 non-null  object 
 2   cat1    300000 non-null  object 
 3   cat2    300000 non-null  object 
 4   cat3    300000 non-null  object 
 5   cat4    300000 non-null  object 
 6   cat5    300000 non-null  object 
 7   cat6    300000 non-null  object 
 8   cat7    300000 non-null  object 
 9   cat8    300000 non-null  object 
 10  cat9    300000 non-null  object 
 11  cont0   300000 non-null  float64
 12  cont1   300000 non-null  float64
 13  cont2   300000 non-null  float64
 14  cont3   300000 non-null  float64
 15  cont4   300000 non-null  float64
 16  cont5   300000 non-null  float64
 17  cont6   300000 non-null  float64
 18  cont7   300000 non-null  float64
 19  cont8   300000 non-null  float64
 20  cont9   300000 non-null  float64
 21  cont10  30

In [10]:
# (rows, columns) of the training frame.
df_raw.shape

(300000, 26)

In [20]:
# Preview the first rows of the training frame before any encoding.
df_raw.head()

Unnamed: 0,id,cat0,cat1,cat2,cat3,cat4,cat5,cat6,cat7,cat8,cat9,cont0,cont1,cont2,cont3,cont4,cont5,cont6,cont7,cont8,cont9,cont10,cont11,cont12,cont13,target
0,1,A,B,A,A,B,D,A,E,C,I,0.923191,0.684968,0.124454,0.217886,0.281421,0.881122,0.42165,0.741413,0.895799,0.802461,0.724417,0.701915,0.877618,0.719903,6.994023
1,2,B,A,A,A,B,B,A,E,A,F,0.437627,0.014213,0.357438,0.846127,0.282354,0.440011,0.34623,0.278495,0.593413,0.546056,0.613252,0.741289,0.326679,0.808464,8.071256
2,3,A,A,A,C,B,D,A,B,C,N,0.732209,0.760122,0.454644,0.81299,0.293756,0.914155,0.369602,0.832564,0.86562,0.825251,0.264104,0.695561,0.869133,0.828352,5.760456
3,4,A,A,A,C,B,D,A,E,G,K,0.705142,0.771678,0.153735,0.732893,0.769785,0.934138,0.57893,0.407313,0.868099,0.794402,0.494269,0.698125,0.809799,0.614766,7.806457
4,6,A,B,A,A,B,B,A,E,C,F,0.486063,0.639349,0.496212,0.354186,0.279105,0.3826,0.70594,0.325193,0.440967,0.462146,0.724447,0.683073,0.343457,0.297743,6.868974


## Giving categorical variables numerical values

In [10]:
# Work on a copy so that encoding the categorical columns leaves df_raw untouched.
df_proc = df_raw.copy()

In [11]:
# One shared LabelEncoder; the encoding loop calls fit_transform on it once per column,
# so after the loop it only reflects the last column it was fitted on.
le = preprocessing.LabelEncoder()

In [12]:
# Names of the object-dtype columns (cat0..cat9 according to the info() output above).
obj_cols = df_proc.select_dtypes('object').columns

Replace each categorical column with its label-encoded integer codes, looping over the object-dtype columns.

In [13]:
# Encode every categorical column as integer codes. The strings are read from df_raw,
# so re-running this cell always encodes the original values.
# Assign with df_proc[col] = ... (which replaces the column) instead of
# df_proc.loc[:, col] = ...: from pandas 2.0 on, the .loc form writes into the existing
# object-dtype column in place, so the codes would be stored as Python objects and the
# column dtype would stay `object` instead of becoming integer.
# NOTE: `le` is re-fitted on each iteration; afterwards it only holds the last column's classes.
for col in obj_cols:
    df_proc[col] = le.fit_transform(df_raw[col].values)

In [14]:
# Preview the encoded frame: the cat* columns now hold integer codes.
df_proc.head()

Unnamed: 0,id,cat0,cat1,cat2,cat3,cat4,cat5,cat6,cat7,cat8,cat9,cont0,cont1,cont2,cont3,cont4,cont5,cont6,cont7,cont8,cont9,cont10,cont11,cont12,cont13,target
0,1,0,1,0,0,1,3,0,4,2,8,0.923191,0.684968,0.124454,0.217886,0.281421,0.881122,0.42165,0.741413,0.895799,0.802461,0.724417,0.701915,0.877618,0.719903,6.994023
1,2,1,0,0,0,1,1,0,4,0,5,0.437627,0.014213,0.357438,0.846127,0.282354,0.440011,0.34623,0.278495,0.593413,0.546056,0.613252,0.741289,0.326679,0.808464,8.071256
2,3,0,0,0,2,1,3,0,1,2,13,0.732209,0.760122,0.454644,0.81299,0.293756,0.914155,0.369602,0.832564,0.86562,0.825251,0.264104,0.695561,0.869133,0.828352,5.760456
3,4,0,0,0,2,1,3,0,4,6,10,0.705142,0.771678,0.153735,0.732893,0.769785,0.934138,0.57893,0.407313,0.868099,0.794402,0.494269,0.698125,0.809799,0.614766,7.806457
4,6,0,1,0,0,1,1,0,4,2,5,0.486063,0.639349,0.496212,0.354186,0.279105,0.3826,0.70594,0.325193,0.440967,0.462146,0.724447,0.683073,0.343457,0.297743,6.868974


## Cross-validation 

In [40]:
# Assign every row of the encoded training data to one of 5 folds and save the result
# as train_folds.csv in the input directory.
if __name__ == "__main__":
    # Shuffle once with a fixed seed so the fold assignment is the same on every run.
    df_shuf = df_proc.sample(frac=1, random_state=42).reset_index(drop=True)

    # Create the column up front as an integer. Letting .loc create it on first
    # assignment would fill the not-yet-assigned rows with NaN and make it float.
    df_shuf['kfold'] = -1

    kf = model_selection.KFold(n_splits=5)
    # Only the validation indices of each split are needed to label the rows.
    for fold, (_, val_) in enumerate(kf.split(X=df_shuf)):
        df_shuf.loc[val_, 'kfold'] = fold

    df_shuf.to_csv(f'{path}/train_folds.csv', index=False)

In [54]:
# Reload the fold-annotated training data written to train_folds.csv.
df_train_folds = pd.read_csv(f'{path}/train_folds.csv')

## Random Forest Regressor

We will split the training dataset into 80% for training and 20% for validation.

In [47]:
# Hold out 20% of the rows (rounded down) for validation; the remainder is used for training.
n_valid = int(len(df_raw) * 0.2)
n_trn = len(df_raw) - n_valid

We randomly shuffle the rows of the entire label-encoded dataset (`df_proc`).

In [48]:
# Shuffle all rows. The fixed random_state keeps the train/validation split (and therefore
# the reported scores) reproducible across kernel restarts.
df_shuf = df_proc.sample(frac=1, random_state=42)

In [49]:
# Separate the target vector from the feature matrix (all remaining columns).
# NOTE(review): `id` is still among the features here — confirm that this is intended.
y_trn = df_shuf['target']
df_trn = df_shuf.drop(columns=['target'])

In [50]:
# Cut features, target and the full shuffled frame at the same row position (n_trn),
# so the three train/validation pairs stay row-aligned.
(X_train, X_valid), (y_train, y_valid), (raw_train, raw_valid) = (
    split_vals(frame, n_trn) for frame in (df_trn, y_trn, df_shuf)
)

In [20]:
# Baseline random forest: 100 trees on a single worker. random_state fixes the forest's
# internal randomness so refitting on the same data yields the same model and scores.
rfr = RandomForestRegressor(n_estimators=100, n_jobs=1, random_state=42)
rfr.fit(X_train, y_train)
print_score(rfr)
# Scores recorded from an earlier run without a fixed seed (train fit is far better than validation):
# rmse_train: 0.3218218384322269, rmse_valid: 0.8588008298230758, rsquared_train: 0.8684371107468682, rsquared_valid: 0.06349986164922261

RandomForestRegressor(n_jobs=1)