In [14]:
import numpy as np
import pandas as pd
import scipy as sp

import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import OrdinalEncoder,StandardScaler
from sklearn.model_selection import train_test_split
from mlxtend.preprocessing import minmax_scaling

from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.svm import SVR, NuSVR, LinearSVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import *

from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor


import warnings

# Notebook-wide settings: lift pandas' row/column display limits and
# silence every warning raised for the rest of the session.
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, None)
warnings.filterwarnings('ignore')

In [2]:
# Read the three input tables in one pass: the training data (which carries
# the `kfold` column used below), the raw test set and the submission template.
train, test, sample_submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../data/process/train_folds.csv",
        "../data/original/test.csv",
        "../data/original/sample_submission.csv",
    )
)

In [3]:
# Column bookkeeping.
# Every training column except the row id, the label and the fold assignment
# is treated as a model input.
non_feature_columns = ("id", "target", "kfold")
useful_features = [c for c in train.columns if c not in non_feature_columns]
cat_features = ["cat" + str(i) for i in range(10)]   # cat0 .. cat9
num_features = ["cont" + str(i) for i in range(14)]  # cont0 .. cont13

# Keep only the model inputs in the test frame. The explicit .copy() makes
# `test` an independent DataFrame, so the column assignments performed on it in
# later cells write to a real frame instead of to a selection of the raw one
# (the pattern pandas flags with SettingWithCopyWarning).
test = test[useful_features].copy()

In [4]:
# Standardise the continuous columns. The scaler is fitted on the training
# frame only and the same fitted transform is then applied to train and test.
# NOTE(review): the fit uses every training row, including the rows that later
# serve as validation folds — confirm this is acceptable for the CV scores.
scaler = StandardScaler().fit(train[num_features])
for frame in (train, test):
    frame[num_features] = scaler.transform(frame[num_features])

In [5]:
def _most_frequent_levels(column, n_levels):
    """Return the `n_levels` most common values of `column`, most frequent first."""
    return column.value_counts().index[:n_levels].tolist()


# Levels to keep per column: top 1 for cat6, top 3 for cat7, top 4 for cat8.
cat6_category = _most_frequent_levels(train['cat6'], 1)
cat7_category = _most_frequent_levels(train['cat7'], 3)
cat8_category = _most_frequent_levels(train['cat8'], 4)

# Every level outside the kept list is folded into a single 'Others' bucket.
for col_name, kept_levels in (
    ('cat6', cat6_category),
    ('cat7', cat7_category),
    ('cat8', cat8_category),
):
    train[col_name] = np.where(train[col_name].isin(kept_levels), train[col_name], 'Others')

In [6]:
# Fold rare levels of cat6/cat7/cat8 in the test set into 'Others'.
#
# The kept levels must be the ones chosen from the TRAINING data
# (cat6_category / cat7_category / cat8_category, computed in the preceding
# cell). Re-deriving them from the test distribution could keep a different set
# of levels, so the same raw value would be collapsed in one frame and not in
# the other, and the encoder fitted on train could meet levels it never saw.
for col_name, kept_levels in (
    ('cat6', cat6_category),
    ('cat7', cat7_category),
    ('cat8', cat8_category),
):
    test[col_name] = np.where(test[col_name].isin(kept_levels), test[col_name], 'Others')

In [7]:
# Accumulators filled by the cross-validation loop below:
#   final_test_predictions  - one array of test-set predictions per fold
#   final_valid_predictions - row id -> out-of-fold prediction
#   scores                  - one validation RMSE per fold
final_test_predictions, final_valid_predictions, scores = [], {}, []

In [8]:
# 5-fold cross-validated Ridge regression.
# For each fold: fit on the other four folds, score on the held-out fold and
# predict the test set. Out-of-fold predictions are stored by row id; the
# per-fold test predictions are kept so they can be averaged afterwards.
for fold in range(5):
    xtrain = train[train.kfold != fold].reset_index(drop=True)
    xvalid = train[train.kfold == fold].reset_index(drop=True)
    xtest = test.copy()

    valid_ids = xvalid.id.values.tolist()

    ytrain = xtrain.target
    yvalid = xvalid.target

    # .copy() gives independent frames, so writing the encoded columns below
    # does not assign into a selection of another DataFrame.
    xtrain = xtrain[useful_features].copy()
    xvalid = xvalid[useful_features].copy()

    # The encoder is fitted on the training part of the fold only and the same
    # mapping is reused for the validation and test rows.
    ordinal_encoder = OrdinalEncoder()
    xtrain[cat_features] = ordinal_encoder.fit_transform(xtrain[cat_features])
    xvalid[cat_features] = ordinal_encoder.transform(xvalid[cat_features])
    xtest[cat_features] = ordinal_encoder.transform(xtest[cat_features])

    rg = Ridge(alpha=0.0001, random_state=fold)
    rg.fit(xtrain, ytrain)
    preds_valid = rg.predict(xvalid).astype(float)
    test_preds = rg.predict(xtest).astype(float)
    final_test_predictions.append(test_preds)
    final_valid_predictions.update(dict(zip(valid_ids, preds_valid)))

    # RMSE computed as sqrt(MSE): the `squared=False` shortcut of
    # mean_squared_error is deprecated in recent scikit-learn releases, while
    # this form gives the same number on every version.
    rmse = np.sqrt(mean_squared_error(yvalid, preds_valid))
    print(fold, rmse)
    scores.append(rmse)

0 0.7384515248967047
1 0.7383857651709933
2 0.7398499594105796
3 0.7394150931801781
4 0.7399600533657644


In [9]:
# Cross-validation summary: mean and standard deviation of the per-fold RMSE.
cv_rmse_mean = np.mean(scores)
cv_rmse_std = np.std(scores)
print(cv_rmse_mean, cv_rmse_std)

0.739212479204844 0.000673618306553895


In [10]:
# Turn the {row id: prediction} mapping into a two-column frame: the dict keys
# become the index, and reset_index() then moves them into a regular column.
oof_by_id = pd.DataFrame.from_dict(final_valid_predictions, orient="index")
final_valid_predictions = oof_by_id.reset_index()

In [11]:
# Name the two columns (row id, out-of-fold prediction) and write the frame
# to disk without the positional index.
oof_column_names = ["id", "pred_02"]
oof_output_path = "../data/process/train_pred_02.csv"
final_valid_predictions.columns = oof_column_names
final_valid_predictions.to_csv(oof_output_path, index=False)

In [12]:
# Average the per-fold test predictions (one column per fold, mean across
# columns) and save them next to the ids taken from the submission template.
#
# A fresh frame is built instead of renaming sample_submission's columns in
# place: the template then stays unchanged, so any later export that reads or
# assigns its original columns still finds them.
test_pred_02 = pd.DataFrame({
    "id": sample_submission.iloc[:, 0].values,
    "pred_02": np.mean(np.column_stack(final_test_predictions), axis=1),
})
test_pred_02.to_csv("../data/process/test_pred_02.csv", index=False)

In [17]:
# Start from empty accumulators for the next model:
# per-fold test predictions, row id -> out-of-fold prediction, per-fold RMSE.
final_test_predictions = list()
final_valid_predictions = dict()
scores = list()

In [18]:
# 5-fold cross-validated CatBoost regression.
# Same protocol as for any fold-based model here: fit on four folds, score on
# the held-out fold, predict the test set, and collect out-of-fold predictions
# by row id plus one array of test predictions per fold.

# Hyper-parameters are identical for every fold, so the dict is built once
# outside the loop.
cat_parameters = {
    'iterations': 1600,
    'learning_rate': 0.024,
    'l2_leaf_reg': 20,
    'random_strength': 1.5,
    'grow_policy': 'Depthwise',
    'leaf_estimation_method': 'Newton',
    'bootstrap_type': 'Bernoulli',
    'thread_count': 4,
    'verbose': False,
    'loss_function': 'RMSE',
    'eval_metric': 'RMSE',
    'od_type': 'Iter'
}

for fold in range(5):
    xtrain = train[train.kfold != fold].reset_index(drop=True)
    xvalid = train[train.kfold == fold].reset_index(drop=True)
    xtest = test.copy()

    valid_ids = xvalid.id.values.tolist()

    ytrain = xtrain.target
    yvalid = xvalid.target

    # .copy() gives independent frames, so writing the encoded columns below
    # does not assign into a selection of another DataFrame.
    xtrain = xtrain[useful_features].copy()
    xvalid = xvalid[useful_features].copy()

    # The encoder is fitted on the training part of the fold only and the same
    # mapping is reused for the validation and test rows.
    ordinal_encoder = OrdinalEncoder()
    xtrain[cat_features] = ordinal_encoder.fit_transform(xtrain[cat_features])
    xvalid[cat_features] = ordinal_encoder.transform(xvalid[cat_features])
    xtest[cat_features] = ordinal_encoder.transform(xtest[cat_features])

    cat_model = CatBoostRegressor(**cat_parameters)
    # verbose=200 here produces a progress line every 200 iterations (see the
    # recorded output), despite 'verbose': False in the constructor parameters.
    cat_model.fit(xtrain, ytrain, verbose=200)
    preds_valid = cat_model.predict(xvalid)
    test_preds = cat_model.predict(xtest)
    final_test_predictions.append(test_preds)
    final_valid_predictions.update(dict(zip(valid_ids, preds_valid)))

    # RMSE computed as sqrt(MSE): the `squared=False` shortcut of
    # mean_squared_error is deprecated in recent scikit-learn releases, while
    # this form gives the same number on every version.
    rmse = np.sqrt(mean_squared_error(yvalid, preds_valid))
    print(fold, rmse)
    scores.append(rmse)

0:	learn: 0.7464892	total: 36.4ms	remaining: 58.3s
200:	learn: 0.7309394	total: 7.23s	remaining: 50.3s
400:	learn: 0.7260354	total: 14.2s	remaining: 42.6s
600:	learn: 0.7213585	total: 20.7s	remaining: 34.4s
800:	learn: 0.7155021	total: 27s	remaining: 26.9s
1000:	learn: 0.7108009	total: 33.4s	remaining: 20s
1200:	learn: 0.7068680	total: 39.7s	remaining: 13.2s
1400:	learn: 0.7030872	total: 46s	remaining: 6.54s
1599:	learn: 0.6996998	total: 52.4s	remaining: 0us
0 0.7201144161227565
0:	learn: 0.7464729	total: 35.3ms	remaining: 56.5s
200:	learn: 0.7311171	total: 6.84s	remaining: 47.6s
400:	learn: 0.7262839	total: 13.5s	remaining: 40.3s
600:	learn: 0.7214819	total: 20s	remaining: 33.2s
800:	learn: 0.7157106	total: 26.4s	remaining: 26.4s
1000:	learn: 0.7109246	total: 33.3s	remaining: 19.9s
1200:	learn: 0.7069347	total: 40.6s	remaining: 13.5s
1400:	learn: 0.7032317	total: 47.2s	remaining: 6.7s
1599:	learn: 0.6998708	total: 53.5s	remaining: 0us
1 0.7199595211881428
0:	learn: 0.7462026	total: 36

In [19]:
# Cross-validation summary: mean and standard deviation of the per-fold RMSE.
fold_rmse_mean, fold_rmse_std = np.mean(scores), np.std(scores)
print(fold_rmse_mean, fold_rmse_std)

0.7207707189759025 0.0007364465248789974


In [20]:
# Convert the {row id: prediction} mapping into a frame (dict keys become the
# index, reset_index() turns them into a column), label the two columns and
# write the result to disk without the positional index.
oof_by_id = pd.DataFrame.from_dict(final_valid_predictions, orient="index")
final_valid_predictions = oof_by_id.reset_index()
oof_column_names = ["id", "pred_05"]
final_valid_predictions.columns = oof_column_names
final_valid_predictions.to_csv("../data/process/train_pred_05.csv", index=False)

In [21]:
# Average the per-fold test predictions (one column per fold, mean across
# columns) and save them next to the ids taken from the submission template.
#
# The output frame is built explicitly rather than through
# `sample_submission.target = ...`: attribute assignment only updates a column
# that already exists under that name. When it does not, pandas just sets a
# plain Python attribute (with a warning that this notebook suppresses), and
# the saved file would keep whatever values the frame held before.
test_pred_05 = pd.DataFrame({
    "id": sample_submission.iloc[:, 0].values,
    "pred_05": np.mean(np.column_stack(final_test_predictions), axis=1),
})
test_pred_05.to_csv("../data/process/test_pred_05.csv", index=False)