In [1]:
import numpy as np
import pandas as pd
import scipy as sp

import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split
from mlxtend.preprocessing import minmax_scaling

from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.svm import SVR, NuSVR, LinearSVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import *

from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor

import warnings

# Display options: never truncate rows or columns when a frame is rendered.
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)
# Install a catch-all "ignore" filter, silencing every warning category.
warnings.simplefilter("ignore")

In [2]:
# Read the fold-annotated training set, the raw test set and the submission
# template, in that order.
train, test, sample_submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../data/process/train_folds.csv",
        "../data/original/test.csv",
        "../data/original/sample_submission.csv",
    )
)

In [47]:
# Model inputs are every column of `train` except the row id, the label and
# the fold assignment.
non_feature_columns = ("id", "target", "kfold")
useful_features = [col for col in train.columns if col not in non_feature_columns]
# Column names cat0..cat9 and cont0..cont13.
cat_features = ["cat" + str(idx) for idx in range(10)]
num_features = ["cont" + str(idx) for idx in range(14)]
# Restrict the test frame to the model inputs (every other column is dropped).
test = test.loc[:, useful_features]

In [5]:
# Fresh accumulators for the next model: per-fold test predictions,
# id -> validation prediction, and per-fold RMSE.
final_test_predictions, final_valid_predictions, scores = [], {}, []

In [8]:
# Model 1: XGBoost, trained once per value of the `kfold` column.
# Each iteration appends the model's test predictions to
# `final_test_predictions`, records the held-out predictions by row id in
# `final_valid_predictions`, and appends the validation RMSE to `scores`.

# The hyperparameters do not depend on the fold, so they are built once.
params = {
    'random_state': 1,
    'n_jobs': 4,
    'booster': 'gbtree',
    'n_estimators': 10000,
    'learning_rate': 0.03628302216953097,
    'reg_lambda': 0.0008746338866473539,
    'reg_alpha': 23.13181079976304,
    'subsample': 0.7875490025178415,
    'colsample_bytree': 0.11807135201147481,
    'max_depth': 3,
}

for fold in range(5):
    # Rows tagged with the current fold are held out; all others are trained on.
    fold_train = train[train.kfold != fold].reset_index(drop=True)
    fold_valid = train[train.kfold == fold].reset_index(drop=True)
    xtest = test.copy()

    valid_ids = fold_valid.id.values.tolist()

    ytrain = fold_train.target
    yvalid = fold_valid.target

    # Explicit copies: the categorical columns are overwritten below, and
    # assigning into the result of a column selection is what pandas flags
    # with SettingWithCopyWarning.
    xtrain = fold_train[useful_features].copy()
    xvalid = fold_valid[useful_features].copy()

    # Learn the category -> integer mapping from the training rows only and
    # reuse it for the validation and test rows.
    ordinal_encoder = OrdinalEncoder()
    xtrain[cat_features] = ordinal_encoder.fit_transform(xtrain[cat_features])
    xvalid[cat_features] = ordinal_encoder.transform(xvalid[cat_features])
    xtest[cat_features] = ordinal_encoder.transform(xtest[cat_features])

    model = XGBRegressor(**params)
    # NOTE(review): whether fit() accepts early_stopping_rounds depends on the
    # installed xgboost version -- confirm before upgrading.
    model.fit(xtrain, ytrain, early_stopping_rounds=300, eval_set=[(xvalid, yvalid)], verbose=1000)
    preds_valid = model.predict(xvalid)
    test_preds = model.predict(xtest)
    final_test_predictions.append(test_preds)
    final_valid_predictions.update(dict(zip(valid_ids, preds_valid)))
    # sqrt(MSE) equals the `squared=False` result without depending on that
    # keyword, which recent scikit-learn releases no longer accept.
    rmse = np.sqrt(mean_squared_error(yvalid, preds_valid))
    print(fold, rmse)
    scores.append(rmse)

[0]	validation_0-rmse:7.50025
[1000]	validation_0-rmse:0.72365
[2000]	validation_0-rmse:0.71926
[3000]	validation_0-rmse:0.71740
[4000]	validation_0-rmse:0.71647
[5000]	validation_0-rmse:0.71602
[6000]	validation_0-rmse:0.71583
[7000]	validation_0-rmse:0.71568
[8000]	validation_0-rmse:0.71563
[8109]	validation_0-rmse:0.71564
0 0.7156123450764549
[0]	validation_0-rmse:7.49701
[1000]	validation_0-rmse:0.72328
[2000]	validation_0-rmse:0.71893
[3000]	validation_0-rmse:0.71723
[4000]	validation_0-rmse:0.71645
[5000]	validation_0-rmse:0.71609
[6000]	validation_0-rmse:0.71588
[7000]	validation_0-rmse:0.71581
[7089]	validation_0-rmse:0.71581
1 0.715790955040373
[0]	validation_0-rmse:7.49484
[1000]	validation_0-rmse:0.72505
[2000]	validation_0-rmse:0.72076
[3000]	validation_0-rmse:0.71905
[4000]	validation_0-rmse:0.71824
[5000]	validation_0-rmse:0.71788
[6000]	validation_0-rmse:0.71769
[6830]	validation_0-rmse:0.71767
2 0.7176511340540268
[0]	validation_0-rmse:7.49708
[1000]	validation_0-rmse:0

In [9]:
# Mean and standard deviation of the per-fold RMSE values.
cv_mean, cv_std = np.mean(scores), np.std(scores)
print(cv_mean, cv_std)

0.7164658378693716 0.0009016909104135961


In [10]:
# Turn the id -> prediction mapping into a two-column frame: the ids become
# the index and are then moved back into an ordinary column.
oof_by_id = pd.DataFrame.from_dict(final_valid_predictions, orient="index")
final_valid_predictions = oof_by_id.reset_index()

In [12]:
# Label the two columns and store model 1's held-out predictions.
oof_column_names = ["id", "pred_1"]
final_valid_predictions.columns = oof_column_names
final_valid_predictions.to_csv(
    "../data/process/train_pred_1.csv",
    index=False,
)

In [14]:
# One row per test example, one column per fold model (the per-fold
# prediction vectors are placed side by side).
np.stack(final_test_predictions, axis=1)

array([[8.095299 , 8.078904 , 8.078672 , 8.071487 , 8.073331 ],
       [8.393495 , 8.419998 , 8.353354 , 8.403889 , 8.411974 ],
       [8.42839  , 8.41259  , 8.448543 , 8.435491 , 8.402265 ],
       ...,
       [8.524944 , 8.4925   , 8.551234 , 8.53426  , 8.509216 ],
       [8.20727  , 8.168365 , 8.14348  , 8.136907 , 8.193927 ],
       [7.9834924, 7.9846087, 7.9937625, 7.98233  , 7.9678774]],
      dtype=float32)

In [16]:
# Average model 1's test predictions over the folds and store them next to
# the ids as column `pred_1`.
# A new frame is built instead of relabelling `sample_submission` in place:
# once that frame has no `target` column, a later
# `sample_submission.target = ...` only sets a Python attribute and the
# values that reach the CSV stay unchanged.
test_pred_1 = pd.DataFrame(
    {
        # The first column of the template holds the row ids.
        "id": sample_submission.iloc[:, 0].to_numpy(),
        "pred_1": np.mean(np.column_stack(final_test_predictions), axis=1),
    }
)
test_pred_1.to_csv("../data/process/test_pred_1.csv", index=False)

In [28]:
# Reset the per-model accumulators before training the next model.
scores = list()
final_valid_predictions = dict()
final_test_predictions = list()

In [29]:
# Model 2: Ridge regression, trained once per value of the `kfold` column.
# Each iteration appends the model's test predictions to
# `final_test_predictions`, records the held-out predictions by row id in
# `final_valid_predictions`, and appends the validation RMSE to `scores`.
for fold in range(5):
    # Rows tagged with the current fold are held out; all others are trained on.
    fold_train = train[train.kfold != fold].reset_index(drop=True)
    fold_valid = train[train.kfold == fold].reset_index(drop=True)
    xtest = test.copy()

    valid_ids = fold_valid.id.values.tolist()

    ytrain = fold_train.target
    yvalid = fold_valid.target

    # Explicit copies: the categorical columns are overwritten below, and
    # assigning into the result of a column selection is what pandas flags
    # with SettingWithCopyWarning.
    xtrain = fold_train[useful_features].copy()
    xvalid = fold_valid[useful_features].copy()

    # Learn the category -> integer mapping from the training rows only and
    # reuse it for the validation and test rows.
    ordinal_encoder = OrdinalEncoder()
    xtrain[cat_features] = ordinal_encoder.fit_transform(xtrain[cat_features])
    xvalid[cat_features] = ordinal_encoder.transform(xvalid[cat_features])
    xtest[cat_features] = ordinal_encoder.transform(xtest[cat_features])

    rg = Ridge(alpha=0.1, random_state=fold)
    rg.fit(xtrain, ytrain)
    # Predictions are cast to Python-float dtype before being stored.
    preds_valid = rg.predict(xvalid).astype(float)
    test_preds = rg.predict(xtest).astype(float)
    final_test_predictions.append(test_preds)
    final_valid_predictions.update(dict(zip(valid_ids, preds_valid)))
    # sqrt(MSE) equals the `squared=False` result without depending on that
    # keyword, which recent scikit-learn releases no longer accept.
    rmse = np.sqrt(mean_squared_error(yvalid, preds_valid))
    print(fold, rmse)
    scores.append(rmse)

0 0.7384533951464144
1 0.7383790351755226
2 0.7398428330399516
3 0.7394394194738504
4 0.7399615707462769


In [30]:
# Summarise the fold RMSEs as (mean, standard deviation).
fold_rmse = np.asarray(scores)
print(np.mean(fold_rmse), np.std(fold_rmse))

0.7392152507164031 0.0006753939786479647


In [31]:
# Convert the id -> prediction dict to a frame (ids as index), then expose
# the ids as a regular column.
final_valid_predictions = (
    pd.DataFrame
    .from_dict(final_valid_predictions, orient="index")
    .reset_index()
)

In [32]:
# Label the two columns and store model 2's held-out predictions.
final_valid_predictions.columns = ["id", "pred_2"]
final_valid_predictions.to_csv(
    path_or_buf="../data/process/train_pred_2.csv",
    index=False,
)

In [33]:
# Average model 2's test predictions over the folds and store them next to
# the ids as column `pred_2`.
# A new frame is built on purpose. By this point `sample_submission` may have
# had its prediction column renamed (the model 1 export relabels it), and
# without a `target` column `sample_submission.target = ...` merely sets a
# Python attribute, so the file would receive the previous model's values
# under the new column name.
test_pred_2 = pd.DataFrame(
    {
        # The first column of the template holds the row ids.
        "id": sample_submission.iloc[:, 0].to_numpy(),
        "pred_2": np.mean(np.column_stack(final_test_predictions), axis=1),
    }
)
test_pred_2.to_csv("../data/process/test_pred_2.csv", index=False)

In [34]:
# Held-out predictions of model 1 and model 2, keyed by `id`.
df1, df2 = (
    pd.read_csv(pred_path)
    for pred_path in (
        "../data/process/train_pred_1.csv",
        "../data/process/train_pred_2.csv",
    )
)

In [35]:
# Fold-averaged test predictions of model 1 and model 2, keyed by `id`.
df_test1, df_test2 = (
    pd.read_csv(pred_path)
    for pred_path in (
        "../data/process/test_pred_1.csv",
        "../data/process/test_pred_2.csv",
    )
)

In [37]:
# Re-read the inputs from disk for the stacking stage; this also restores
# `sample_submission` to the template stored in the CSV.
df, df_test, sample_submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../data/process/train_folds.csv",
        "../data/original/test.csv",
        "../data/original/sample_submission.csv",
    )
)

In [38]:
# Attach both models' held-out prediction columns to the training frame,
# matching rows on `id`; left joins keep every row of `df`.
df = df.merge(df1, on="id", how="left").merge(df2, on="id", how="left")

In [57]:
# Rebuild the test frame from disk and attach both models' test prediction
# columns, matching rows on `id`.
df_test = (
    pd.read_csv("../data/original/test.csv")
    .merge(df_test1, on="id", how="left")
    .merge(df_test2, on="id", how="left")
)

In [41]:

# Preview the first five rows, including the merged prediction columns.
df.head(5)

Unnamed: 0,id,cat0,cat1,cat2,cat3,cat4,cat5,cat6,cat7,cat8,cat9,cont0,cont1,cont2,cont3,cont4,cont5,cont6,cont7,cont8,cont9,cont10,cont11,cont12,cont13,target,kfold,pred_1,pred_2
0,1,B,B,B,C,B,B,A,E,C,N,0.20147,-0.014822,0.669699,0.136278,0.610706,0.400361,0.160266,0.310921,0.38947,0.267559,0.237281,0.377873,0.322401,0.86985,8.113634,0,8.447721,8.183844
1,2,B,B,A,A,B,D,A,F,A,O,0.743068,0.367411,1.021605,0.365798,0.276853,0.533087,0.558922,0.516294,0.594928,0.341439,0.906013,0.921701,0.261975,0.465083,8.481233,2,8.43457,8.479641
2,3,A,A,A,C,B,D,A,D,A,F,0.742708,0.310383,-0.012673,0.576957,0.285074,0.650609,0.375348,0.902567,0.555205,0.843531,0.748809,0.620126,0.541474,0.763846,8.364351,4,8.219918,8.180162
3,4,B,B,A,C,B,D,A,E,C,K,0.429551,0.620998,0.577942,0.28061,0.284667,0.66898,0.239061,0.732948,0.679618,0.574844,0.34601,0.71461,0.54015,0.280682,8.049253,3,8.353198,8.272754
4,6,A,A,A,C,B,D,A,E,A,N,1.058291,0.367492,-0.052389,0.232407,0.287595,0.686964,0.420667,0.648182,0.684501,0.956692,1.000773,0.776742,0.625849,0.250823,7.97226,1,8.258547,8.318773


In [43]:
# Level-1 blend: fit a LinearSVR on the two base models' prediction columns,
# score it per fold, and collect its test predictions.
#
# The column list and the test matrix get their own names. Rebinding
# `useful_features` and `df_test` here would leave the later base-model cell
# without its raw feature list and the later `df_test.merge(..., on="id")`
# without an `id` column when the notebook is run top to bottom.
meta_features = ["pred_1", "pred_2"]
meta_test = df_test[meta_features]

final_predictions = []
scores = []
for fold in range(5):
    # Rows tagged with the current fold are held out; all others are trained on.
    fold_train = df[df.kfold != fold].reset_index(drop=True)
    fold_valid = df[df.kfold == fold].reset_index(drop=True)
    xtest = meta_test.copy()

    ytrain = fold_train.target
    yvalid = fold_valid.target

    xtrain = fold_train[meta_features]
    xvalid = fold_valid[meta_features]

    # Seeded so that repeated runs give the same fit.
    model = LinearSVR(random_state=fold)
    model.fit(xtrain, ytrain)

    preds_valid = model.predict(xvalid)
    test_preds = model.predict(xtest)
    final_predictions.append(test_preds)
    # sqrt(MSE) equals the `squared=False` result without depending on that
    # keyword, which recent scikit-learn releases no longer accept.
    rmse = np.sqrt(mean_squared_error(yvalid, preds_valid))
    print(fold, rmse)
    scores.append(rmse)

print(np.mean(scores), np.std(scores))

0 0.7290179346727984
1 0.7175876948162978
2 0.7357820739193233
3 0.7265694654255356
4 0.7182825654418986
0.7254479468551708 0.006839927660611691


In [44]:
# Average the blender's test predictions over the folds and write the
# submission file.
# The column is assigned with [] rather than as an attribute: attribute
# assignment only updates a column that already exists, and otherwise stores
# a plain Python attribute that to_csv never writes.
sample_submission["target"] = np.mean(np.column_stack(final_predictions), axis=1)
sample_submission.to_csv("../data/submit/011_submission.csv", index=False)

score:0.71897

In [45]:
# Start the next model with empty accumulators.
final_test_predictions, scores = [], []
final_valid_predictions = {}

In [48]:
# Model 3: random forest, trained once per value of the `kfold` column.
# Each iteration appends the model's test predictions to
# `final_test_predictions`, records the held-out predictions by row id in
# `final_valid_predictions`, and appends the validation RMSE to `scores`.
for fold in range(5):
    # Rows tagged with the current fold are held out; all others are trained on.
    fold_train = train[train.kfold != fold].reset_index(drop=True)
    fold_valid = train[train.kfold == fold].reset_index(drop=True)
    xtest = test.copy()

    valid_ids = fold_valid.id.values.tolist()

    ytrain = fold_train.target
    yvalid = fold_valid.target

    # Explicit copies: the categorical columns are overwritten below, and
    # assigning into the result of a column selection is what pandas flags
    # with SettingWithCopyWarning.
    xtrain = fold_train[useful_features].copy()
    xvalid = fold_valid[useful_features].copy()

    # Learn the category -> integer mapping from the training rows only and
    # reuse it for the validation and test rows.
    ordinal_encoder = OrdinalEncoder()
    xtrain[cat_features] = ordinal_encoder.fit_transform(xtrain[cat_features])
    xvalid[cat_features] = ordinal_encoder.transform(xvalid[cat_features])
    xtest[cat_features] = ordinal_encoder.transform(xtest[cat_features])

    # `criterion='mse'` and `min_impurity_split=None` are intentionally not
    # passed: both only restated the library defaults, and current
    # scikit-learn releases reject them. The split criterion stays the
    # default mean-squared-error one.
    # random_state=None leaves the forest unseeded, so fold scores vary
    # slightly from run to run.
    model_rf = RandomForestRegressor(
        n_estimators=200,
        max_depth=30,
        max_features='sqrt',
        max_leaf_nodes=None,
        min_samples_leaf=2,
        min_samples_split=5,
        min_impurity_decrease=0.0,
        min_weight_fraction_leaf=0.0,
        bootstrap=True,
        oob_score=False,
        n_jobs=1,
        random_state=None,
        verbose=0,
        warm_start=False,
    )
    model_rf.fit(xtrain, ytrain)
    preds_valid = model_rf.predict(xvalid)
    test_preds = model_rf.predict(xtest)
    final_test_predictions.append(test_preds)
    final_valid_predictions.update(dict(zip(valid_ids, preds_valid)))
    # sqrt(MSE) equals the `squared=False` result without depending on that
    # keyword, which recent scikit-learn releases no longer accept.
    rmse = np.sqrt(mean_squared_error(yvalid, preds_valid))
    print(fold, rmse)
    scores.append(rmse)

0 0.7317808745473757
1 0.7317983688303781
2 0.7335682382443652
3 0.7332092605042874
4 0.7325796284821621


In [49]:
# Report the average fold RMSE and its spread.
rmse_mean = np.mean(scores)
rmse_std = np.std(scores)
print(rmse_mean, rmse_std)

0.7325872741217136 0.0007241364454342191


In [50]:
# Build a frame indexed by row id from the id -> prediction dict, then move
# the ids into a regular column.
oof_indexed = pd.DataFrame.from_dict(final_valid_predictions, orient="index")
final_valid_predictions = oof_indexed.reset_index()

In [51]:
# Label the two columns and store model 3's held-out predictions.
pred_3_columns = ["id", "pred_3"]
final_valid_predictions.columns = pred_3_columns
final_valid_predictions.to_csv("../data/process/train_pred_3.csv", index=False)

In [52]:
# Average model 3's test predictions over the folds and store them next to
# the ids as column `pred_3`.
# A new frame is built instead of relabelling `sample_submission` in place:
# once that frame has no `target` column, a later
# `sample_submission.target = ...` only sets a Python attribute, and the
# submission written afterwards would still contain these model 3 values.
test_pred_3 = pd.DataFrame(
    {
        # The first column of the template holds the row ids.
        "id": sample_submission.iloc[:, 0].to_numpy(),
        "pred_3": np.mean(np.column_stack(final_test_predictions), axis=1),
    }
)
test_pred_3.to_csv("../data/process/test_pred_3.csv", index=False)

In [53]:
# Held-out predictions of model 3, keyed by `id`.
df3 = pd.read_csv(filepath_or_buffer="../data/process/train_pred_3.csv")

In [54]:
# Fold-averaged test predictions of model 3, keyed by `id`.
df_test3 = pd.read_csv(filepath_or_buffer="../data/process/test_pred_3.csv")

In [55]:
# Add model 3's held-out predictions to the training frame (left join on
# `id`, so every row of `df` is kept).
df = pd.merge(df, df3, on="id", how="left")

In [58]:
# Add model 3's test predictions to the test frame (left join on `id`).
# This requires `df_test` to still contain its `id` column.
df_test = pd.merge(df_test, df_test3, on="id", how="left")

In [59]:
# Level-1 blend: fit a LinearSVR on the three base models' prediction
# columns, score it per fold, and collect its test predictions.
#
# The column list and the test matrix get their own names so that re-running
# earlier cells still finds the raw feature list in `useful_features` and the
# `id` column in `df_test`.
meta_features = ["pred_1", "pred_2", "pred_3"]
meta_test = df_test[meta_features]

final_predictions = []
scores = []
for fold in range(5):
    # Rows tagged with the current fold are held out; all others are trained on.
    fold_train = df[df.kfold != fold].reset_index(drop=True)
    fold_valid = df[df.kfold == fold].reset_index(drop=True)
    xtest = meta_test.copy()

    ytrain = fold_train.target
    yvalid = fold_valid.target

    xtrain = fold_train[meta_features]
    xvalid = fold_valid[meta_features]

    # Seeded so that repeated runs give the same fit.
    model = LinearSVR(random_state=fold)
    model.fit(xtrain, ytrain)

    preds_valid = model.predict(xvalid)
    test_preds = model.predict(xtest)
    final_predictions.append(test_preds)
    # sqrt(MSE) equals the `squared=False` result without depending on that
    # keyword, which recent scikit-learn releases no longer accept.
    rmse = np.sqrt(mean_squared_error(yvalid, preds_valid))
    print(fold, rmse)
    scores.append(rmse)

print(np.mean(scores), np.std(scores))

0 0.716534881671988
1 0.7216822067452666
2 0.7178065957574278
3 0.71883234545614
4 0.7303175396343827
0.7210347138530411 0.004941857225472209


In [60]:
# Average the three-model blender's test predictions over the folds and write
# the submission.
# The frame is rebuilt from the id column alone. At this point the prediction
# column may have been renamed by the cell that exported `pred_3`; in that
# state `sample_submission.target = ...` would only set a Python attribute
# and the file would still hold the model 3 averages, not the blend.
blended_test_mean = np.mean(np.column_stack(final_predictions), axis=1)
submission = sample_submission.iloc[:, [0]].copy()  # ids only, name preserved
submission["target"] = blended_test_mean
sample_submission = submission
sample_submission.to_csv("../data/submit/012_submission.csv", index=False)

In [64]:
# Relabel a `pred_3` column as `target`. Only the label changes: the values
# are untouched, and nothing happens if no `pred_3` column exists.
sample_submission = sample_submission.rename({"pred_3": "target"}, axis="columns")

In [65]:
# Write the submission file again, without the index column.
sample_submission.to_csv(
    path_or_buf="../data/submit/012_submission.csv",
    index=False,
)