In [12]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor
from sklearn.linear_model import LinearRegression
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor

In [3]:
df = pd.read_csv("../data/process/train_folds.csv")
df_test = pd.read_csv("../data/original/test.csv")

df1 = pd.read_csv("../data/process/train_pred_1.csv")
df1.columns = ["id", "pred_1"]
df2 = pd.read_csv("../data/process/train_pred_2.csv")
df2.columns = ["id", "pred_2"]
df3 = pd.read_csv("../data/process/train_pred_3.csv")
df3.columns = ["id", "pred_3"]
df4 = pd.read_csv("../data/process/train_pred_04.csv")
df4.columns = ["id", "pred_4"]



df_test1 = pd.read_csv("../data/process/test_pred_1.csv")
df_test1.columns = ["id", "pred_1"]
df_test2 = pd.read_csv("../data/process/test_pred_2.csv")
df_test2.columns = ["id", "pred_2"]
df_test3 = pd.read_csv("../data/process/test_pred_3.csv")
df_test3.columns = ["id", "pred_3"]
df_test4 = pd.read_csv("../data/process/test_pred_04.csv")
df_test4.columns = ["id", "pred_4"]


df = df.merge(df1, on="id", how="left")
df = df.merge(df2, on="id", how="left")
df = df.merge(df3, on="id", how="left")
df = df.merge(df4, on="id", how="left")


df_test = df_test.merge(df_test1, on="id", how="left")
df_test = df_test.merge(df_test2, on="id", how="left")
df_test = df_test.merge(df_test3, on="id", how="left")
df_test = df_test.merge(df_test4, on="id", how="left")


In [4]:
sample_submission = pd.read_csv("../data/original/sample_submission.csv")
useful_features = ["pred_1", "pred_2", "pred_3", "pred_4"]
df_test = df_test[useful_features]

final_test_predictions = []
final_valid_predictions = {}
scores = []
for fold in range(5):
    xtrain =  df[df.kfold != fold].reset_index(drop=True)
    xvalid = df[df.kfold == fold].reset_index(drop=True)
    xtest = df_test.copy()

    valid_ids = xvalid.id.values.tolist()

    ytrain = xtrain.target
    yvalid = xvalid.target
    
    xtrain = xtrain[useful_features]
    xvalid = xvalid[useful_features]
    

    params = {
        'random_state': 1, 
        'booster': 'gbtree',
        'n_estimators': 7000,
        'learning_rate': 0.03,
        'max_depth': 2
    }
    
    model = XGBRegressor(
        n_jobs=4,
        **params
    )
    model.fit(xtrain, ytrain, early_stopping_rounds=300, eval_set=[(xvalid, yvalid)], verbose=1000)
    preds_valid = model.predict(xvalid)
    test_preds = model.predict(xtest)
    final_test_predictions.append(test_preds)
    final_valid_predictions.update(dict(zip(valid_ids, preds_valid)))
    rmse = mean_squared_error(yvalid, preds_valid, squared=False)
    print(fold, rmse)
    scores.append(rmse)

print(np.mean(scores), np.std(scores))
final_valid_predictions = pd.DataFrame.from_dict(final_valid_predictions, orient="index").reset_index()
final_valid_predictions.columns = ["id", "pred_1"]
final_valid_predictions.to_csv("level1_train_pred_1.csv", index=False)

sample_submission.target = np.mean(np.column_stack(final_test_predictions), axis=1)
sample_submission.columns = ["id", "pred_1"]
sample_submission.to_csv("level1_test_pred_1.csv", index=False)

[0]	validation_0-rmse:7.54852
[579]	validation_0-rmse:0.70540
0 0.705357123580763
[0]	validation_0-rmse:7.54523
[542]	validation_0-rmse:0.70683
1 0.706789389937869
[0]	validation_0-rmse:7.54305
[536]	validation_0-rmse:0.70916
2 0.7090636927029722
[0]	validation_0-rmse:7.54532
[574]	validation_0-rmse:0.70807
3 0.7080372539952124
[0]	validation_0-rmse:7.55072
[664]	validation_0-rmse:0.70701
4 0.706962676406288
0.7072420273246209 0.0012479286735525082


In [10]:
sample_submission = pd.read_csv("../data/original/sample_submission.csv")
useful_features = ["pred_1", "pred_2", "pred_3", "pred_4"]
df_test = df_test[useful_features]

final_test_predictions = []
final_valid_predictions = {}
scores = []
for fold in range(5):
    xtrain =  df[df.kfold != fold].reset_index(drop=True)
    xvalid = df[df.kfold == fold].reset_index(drop=True)
    xtest = df_test.copy()

    valid_ids = xvalid.id.values.tolist()

    ytrain = xtrain.target
    yvalid = xvalid.target
    
    xtrain = xtrain[useful_features]
    xvalid = xvalid[useful_features]
    
    lgbm_parameters = {
        'metric': 'RMSE',
        'feature_pre_filter': False,
        'lambda_l1': 0.45,
        'lambda_l2': 4.8,
        'learning_rate': 0.005,
        'num_trees': 80000,
        'num_leaves': 10, 
        'feature_fraction': 0.4, 
        'bagging_fraction': 1.0, 
        'bagging_freq': 0, 
        'min_child_samples': 100,
        'num_threads': 4
    }
    
    lgbm_model = LGBMRegressor(**lgbm_parameters)
    lgbm_model.fit(xtrain, ytrain, early_stopping_rounds=300, eval_set=[(xvalid, yvalid)], verbose=1000)
    preds_valid = lgbm_model.predict(xvalid)
    test_preds = lgbm_model.predict(xtest)
    final_test_predictions.append(test_preds)
    final_valid_predictions.update(dict(zip(valid_ids, preds_valid)))
    rmse = mean_squared_error(yvalid, preds_valid, squared=False)
    print(fold, rmse)
    scores.append(rmse)

print(np.mean(scores), np.std(scores))
final_valid_predictions = pd.DataFrame.from_dict(final_valid_predictions, orient="index").reset_index()
final_valid_predictions.columns = ["id", "pred_2"]
final_valid_predictions.to_csv("level1_train_pred_2.csv", index=False)

sample_submission.target = np.mean(np.column_stack(final_test_predictions), axis=1)
sample_submission.columns = ["id", "pred_2"]
sample_submission.to_csv("level1_test_pred_2.csv", index=False)



Training until validation scores don't improve for 300 rounds
[1000]	valid_0's rmse: 0.707004
[2000]	valid_0's rmse: 0.705975
[3000]	valid_0's rmse: 0.705842
Early stopping, best iteration is:
[3140]	valid_0's rmse: 0.705839
0 0.7058385742018889
Training until validation scores don't improve for 300 rounds




[1000]	valid_0's rmse: 0.70832
[2000]	valid_0's rmse: 0.707469
[3000]	valid_0's rmse: 0.707332
[4000]	valid_0's rmse: 0.707276
Early stopping, best iteration is:
[4681]	valid_0's rmse: 0.707264
1 0.7072639211644963




Training until validation scores don't improve for 300 rounds
[1000]	valid_0's rmse: 0.710499
[2000]	valid_0's rmse: 0.709682
[3000]	valid_0's rmse: 0.709513
Early stopping, best iteration is:
[3632]	valid_0's rmse: 0.709487
2 0.7094872447958707




Training until validation scores don't improve for 300 rounds
[1000]	valid_0's rmse: 0.709534
[2000]	valid_0's rmse: 0.708799
[3000]	valid_0's rmse: 0.70866
[4000]	valid_0's rmse: 0.708635
Early stopping, best iteration is:
[4436]	valid_0's rmse: 0.708632
3 0.7086318851867455
Training until validation scores don't improve for 300 rounds




[1000]	valid_0's rmse: 0.708505
[2000]	valid_0's rmse: 0.707773
[3000]	valid_0's rmse: 0.707718
Early stopping, best iteration is:
[2773]	valid_0's rmse: 0.707712
4 0.7077123779376696
0.7077868006573341 0.0012393713049012346


In [14]:
sample_submission = pd.read_csv("../data/original/sample_submission.csv")
useful_features = ["pred_1", "pred_2", "pred_3", "pred_4"]
df_test = df_test[useful_features]

final_test_predictions = []
final_valid_predictions = {}
scores = []
for fold in range(5):
    xtrain =  df[df.kfold != fold].reset_index(drop=True)
    xvalid = df[df.kfold == fold].reset_index(drop=True)
    xtest = df_test.copy()

    valid_ids = xvalid.id.values.tolist()

    ytrain = xtrain.target
    yvalid = xvalid.target
    
    xtrain = xtrain[useful_features]
    xvalid = xvalid[useful_features]

    model = CatBoostRegressor(
              loss_function='RMSE'
            )
    model.fit(xtrain, ytrain)
    preds_valid = model.predict(xvalid)
    test_preds = model.predict(xtest)
    final_test_predictions.append(test_preds)
    final_valid_predictions.update(dict(zip(valid_ids, preds_valid)))
    rmse = mean_squared_error(yvalid, preds_valid, squared=False)
    print(fold, rmse)
    scores.append(rmse)

print(np.mean(scores), np.std(scores))
final_valid_predictions = pd.DataFrame.from_dict(final_valid_predictions, orient="index").reset_index()
final_valid_predictions.columns = ["id", "pred_3"]
final_valid_predictions.to_csv("level1_train_pred_3.csv", index=False)

sample_submission.target = np.mean(np.column_stack(final_test_predictions), axis=1)
sample_submission.columns = ["id", "pred_3"]
sample_submission.to_csv("level1_test_pred_3.csv", index=False)

Learning rate set to 0.103533
0:	learn: 0.7398505	total: 76.2ms	remaining: 1m 16s
1:	learn: 0.7341532	total: 84.3ms	remaining: 42.1s
2:	learn: 0.7295003	total: 93.8ms	remaining: 31.2s
3:	learn: 0.7255111	total: 102ms	remaining: 25.4s
4:	learn: 0.7223457	total: 111ms	remaining: 22s
5:	learn: 0.7197792	total: 119ms	remaining: 19.7s
6:	learn: 0.7176539	total: 129ms	remaining: 18.2s
7:	learn: 0.7158306	total: 138ms	remaining: 17.1s
8:	learn: 0.7143410	total: 147ms	remaining: 16.2s
9:	learn: 0.7131856	total: 155ms	remaining: 15.4s
10:	learn: 0.7122610	total: 165ms	remaining: 14.9s
11:	learn: 0.7114705	total: 174ms	remaining: 14.3s
12:	learn: 0.7108058	total: 182ms	remaining: 13.8s
13:	learn: 0.7103443	total: 190ms	remaining: 13.4s
14:	learn: 0.7098901	total: 199ms	remaining: 13.1s
15:	learn: 0.7095214	total: 207ms	remaining: 12.8s
16:	learn: 0.7091819	total: 216ms	remaining: 12.5s
17:	learn: 0.7089297	total: 224ms	remaining: 12.2s
18:	learn: 0.7087123	total: 233ms	remaining: 12s
19:	learn: 

In [15]:
df = pd.read_csv("../data/process/train_folds.csv")
df_test = pd.read_csv("../data/original/test.csv")
sample_submission = pd.read_csv("../data/original/sample_submission.csv")

df1 = pd.read_csv("level1_train_pred_1.csv")
df2 = pd.read_csv("level1_train_pred_2.csv")
df3 = pd.read_csv("level1_train_pred_3.csv")

df_test1 = pd.read_csv("level1_test_pred_1.csv")
df_test2 = pd.read_csv("level1_test_pred_2.csv")
df_test3 = pd.read_csv("level1_test_pred_3.csv")

df = df.merge(df1, on="id", how="left")
df = df.merge(df2, on="id", how="left")
df = df.merge(df3, on="id", how="left")

df_test = df_test.merge(df_test1, on="id", how="left")
df_test = df_test.merge(df_test2, on="id", how="left")
df_test = df_test.merge(df_test3, on="id", how="left")

df.head()

Unnamed: 0,id,cat0,cat1,cat2,cat3,cat4,cat5,cat6,cat7,cat8,...,cont9,cont10,cont11,cont12,cont13,target,kfold,pred_1,pred_2,pred_3
0,1,B,B,B,C,B,B,A,E,C,...,0.267559,0.237281,0.377873,0.322401,0.86985,8.113634,0,8.44991,8.453413,8.472878
1,2,B,B,A,A,B,D,A,F,A,...,0.341439,0.906013,0.921701,0.261975,0.465083,8.481233,2,8.442258,8.420852,8.381954
2,3,A,A,A,C,B,D,A,D,A,...,0.843531,0.748809,0.620126,0.541474,0.763846,8.364351,4,8.237483,8.203854,8.240232
3,4,B,B,A,C,B,D,A,E,C,...,0.574844,0.34601,0.71461,0.54015,0.280682,8.049253,3,8.372747,8.356375,8.390851
4,6,A,A,A,C,B,D,A,E,A,...,0.956692,1.000773,0.776742,0.625849,0.250823,7.97226,1,8.258714,8.295205,8.263666


In [16]:
useful_features = ["pred_1", "pred_2", "pred_3"]
df_test = df_test[useful_features]

final_predictions = []
scores = []
for fold in range(5):
    xtrain =  df[df.kfold != fold].reset_index(drop=True)
    xvalid = df[df.kfold == fold].reset_index(drop=True)
    xtest = df_test.copy()

    ytrain = xtrain.target
    yvalid = xvalid.target
    
    xtrain = xtrain[useful_features]
    xvalid = xvalid[useful_features]
    
    model = LinearRegression()
    model.fit(xtrain, ytrain)
    
    preds_valid = model.predict(xvalid)
    test_preds = model.predict(xtest)
    final_predictions.append(test_preds)
    rmse = mean_squared_error(yvalid, preds_valid, squared=False)
    print(fold, rmse)
    scores.append(rmse)

print(np.mean(scores), np.std(scores))

0 0.705341295880272
1 0.7067750275564478
2 0.7090726817516138
3 0.7080476390395578
4 0.7069672470815735
0.707240778261893 0.0012575111176051966


In [17]:
sample_submission.target = np.mean(np.column_stack(final_predictions), axis=1)
sample_submission.to_csv("../data/submit/014_submission.csv", index=False)

0.71736