# This notebook is for the 30 Days of Machine Learning contest on Kaggle.

#### This notebook was created by following Abhishek Thakur's YouTube video tutorial: 
https://www.youtube.com/watch?v=2Yx2Y545yBk&list=PL98nY_tJQXZnP-k3qCDd1hljVSciDV9_N&index=21




In [1]:
# Imports
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor

In [11]:
# Load the fold-annotated training data, the test set and the sample submission.
df = pd.read_csv("data/train_folds.csv")
df_test = pd.read_csv("data/test.csv")
sample_submission = pd.read_csv("data/sample_submission.csv")

n_folds = 5  # folds are labelled 0..4 in the `kfold` column

useful_features = [c for c in df.columns if c not in ("id", "target", "kfold")]
# startswith (rather than a substring test) so that the derived
# "tar_enc_cat*" columns added below are never mistaken for raw categoricals.
object_cols = [col for col in useful_features if col.startswith("cat")]
# .copy() so that adding columns below writes to an independent frame,
# not to a slice of the frame returned by read_csv.
df_test = df_test[useful_features].copy()

# Out-of-fold mean target encoding of every categorical column.
#   * train rows: each fold is encoded with category means computed on the
#     OTHER folds only, so a row never sees its own target.
#   * test rows: the per-fold mappings are averaged.
for col in object_cols:
    enc_col = f"tar_enc_{col}"  # same name in train and test
    encoded_folds = []
    test_enc_sum = None
    for fold in range(n_folds):
        xtrain = df[df.kfold != fold].reset_index(drop=True)
        xvalid = df[df.kfold == fold].reset_index(drop=True)
        # category -> mean target, learned on the training folds only
        category_means = xtrain.groupby(col)["target"].mean().to_dict()
        xvalid[enc_col] = xvalid[col].map(category_means)
        encoded_folds.append(xvalid)
        fold_test_enc = df_test[col].map(category_means)
        test_enc_sum = fold_test_enc if test_enc_sum is None else test_enc_sum + fold_test_enc

    df_test[enc_col] = test_enc_sum / n_folds
    # Re-assemble the training frame (rows end up grouped by fold);
    # ignore_index avoids duplicate index labels from the per-fold resets.
    df = pd.concat(encoded_folds, ignore_index=True)

# Refresh the feature list so the new tar_enc_* columns are actually used for
# training, and align the test frame to the same columns in the same order.
useful_features = [c for c in df.columns if c not in ("id", "target", "kfold")]
df_test = df_test[useful_features]


In [13]:
# Display the test frame to inspect the columns added by the target-encoding cell.
df_test

Unnamed: 0,cat0,cat1,cat2,cat3,cat4,cat5,cat6,cat7,cat8,cat9,...,tar_enccat0,tar_enccat1,tar_enccat2,tar_enccat3,tar_enccat4,tar_enccat5,tar_enccat6,tar_enccat7,tar_enccat8,tar_enccat9
0,B,B,B,C,B,B,A,E,E,I,...,8.246840,8.204026,8.224860,8.237026,8.240946,8.230315,8.241024,8.240630,8.193335,8.223861
1,A,B,A,C,B,C,A,E,C,H,...,8.239289,8.204026,8.245088,8.237026,8.240946,8.268626,8.241024,8.240630,8.282179,8.240706
2,B,A,A,A,B,B,A,E,D,K,...,8.246840,8.277566,8.245088,8.276705,8.240946,8.230315,8.241024,8.240630,8.242734,8.269896
3,B,B,A,C,B,D,A,E,A,N,...,8.246840,8.204026,8.245088,8.237026,8.240946,8.251199,8.241024,8.240630,8.230478,8.249964
4,B,B,A,C,B,C,A,E,C,F,...,8.246840,8.204026,8.245088,8.237026,8.240946,8.268626,8.241024,8.240630,8.282179,8.259044
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
199995,B,A,A,C,B,D,A,E,E,I,...,8.246840,8.277566,8.245088,8.237026,8.240946,8.251199,8.241024,8.240630,8.193335,8.223861
199996,B,A,A,C,B,B,A,E,C,F,...,8.246840,8.277566,8.245088,8.237026,8.240946,8.230315,8.241024,8.240630,8.282179,8.259044
199997,A,B,B,C,B,B,A,E,C,I,...,8.239289,8.204026,8.224860,8.237026,8.240946,8.230315,8.241024,8.240630,8.282179,8.223861
199998,A,A,A,C,B,D,A,D,A,F,...,8.239289,8.277566,8.245088,8.237026,8.240946,8.251199,8.241024,8.254308,8.230478,8.259044


In [None]:
# 5-fold cross-validated XGBoost training.
# For every fold: train on the other four folds, score RMSE on the held-out
# fold, and predict the test set. `final_predictions` collects one test
# prediction array per fold; `scores` collects the per-fold validation RMSE.
final_predictions = []
scores = []
for fold in range(5):
    xtrain = df[df.kfold != fold].reset_index(drop=True)
    xvalid = df[df.kfold == fold].reset_index(drop=True)

    ytrain = xtrain.target
    yvalid = xvalid.target

    # .copy() so the in-place ordinal encoding below writes to independent
    # frames instead of slices (avoids SettingWithCopyWarning).
    xtrain = xtrain[useful_features].copy()
    xvalid = xvalid[useful_features].copy()
    # Select exactly the training columns, in the same order, so the model
    # sees the same feature set at predict time as at fit time.
    xtest = df_test[useful_features].copy()

    # Encoder is fitted on the training folds only, then applied unchanged
    # to the validation fold and the test set.
    ordinal_encoder = preprocessing.OrdinalEncoder()
    xtrain[object_cols] = ordinal_encoder.fit_transform(xtrain[object_cols])
    xvalid[object_cols] = ordinal_encoder.transform(xvalid[object_cols])
    xtest[object_cols] = ordinal_encoder.transform(xtest[object_cols])

    # NOTE(review): `gpu_hist` / `gpu_id` / `predictor` are the pre-2.0 XGBoost
    # spelling of GPU training; newer releases use tree_method="hist" with
    # device="cuda" — confirm against the installed xgboost version.
    model = XGBRegressor(random_state=fold, tree_method='gpu_hist', gpu_id=0, predictor="gpu_predictor")
    model.fit(xtrain, ytrain)
    preds_valid = model.predict(xvalid)
    test_preds = model.predict(xtest)
    final_predictions.append(test_preds)
    # RMSE computed as sqrt(MSE): the `squared=False` keyword is deprecated
    # in recent scikit-learn releases, while this form works on all versions.
    rmse = np.sqrt(mean_squared_error(yvalid, preds_valid))
    print(fold, rmse)
    scores.append(rmse)

print(np.mean(scores), np.std(scores))