In [1]:
import numpy as np
import pandas as pd

# EDA
import matplotlib.pyplot as plt
import seaborn as sns

# Learning algorithms
import sklearn
from sklearn.linear_model import *
from sklearn.ensemble import RandomForestRegressor

from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor

import Processor_cat
import Processor_xgb
import Processor_lgbm
import Processor_rand

# Suppress every warning category for the rest of the session so that warnings
# do not show up in the cell outputs.
import warnings
warnings.filterwarnings('ignore')

# Compact YYYYMMDD stamp of the current local date; the submission cell uses it
# as the prefix of the output file name.
import datetime
today = datetime.datetime.now().strftime("%Y%m%d")
# The printed label is Korean for "today's date".
print("오늘 날짜 : " + today)

오늘 날짜 : 20210614


In [2]:
# Each model family has its own Processor_* module, but all four read the same
# pair of CSV files.
train_csv = '../data/train.csv'
test_csv = '../data/test.csv'

# One Preprocessor per model family, built in cat -> xgb -> lgbm -> rand order.
preprocessor_cat, preprocessor_xgb, preprocessor_lgbm, preprocessor_rand = [
    module.Preprocessor(train_csv, test_csv)
    for module in (Processor_cat, Processor_xgb, Processor_lgbm, Processor_rand)
]

# train_preprocess() yields a two-item result per preprocessor, unpacked here
# into the X_trains_* / y_trains_* names used by the training cell.
(
    (X_trains_cat, y_trains_cat),
    (X_trains_xgb, y_trains_xgb),
    (X_trains_lgbm, y_trains_lgbm),
    (X_trains_rand, y_trains_rand),
) = [
    preprocessor.train_preprocess()
    for preprocessor in (
        preprocessor_cat,
        preprocessor_xgb,
        preprocessor_lgbm,
        preprocessor_rand,
    )
]

# test_preprocess() is called only after all training data has been prepared,
# again in cat -> xgb -> lgbm -> rand order.
X_tests_cat, X_tests_xgb, X_tests_lgbm, X_tests_rand = [
    preprocessor.test_preprocess()
    for preprocessor in (
        preprocessor_cat,
        preprocessor_xgb,
        preprocessor_lgbm,
        preprocessor_rand,
    )
]

In [11]:
n_folds = 10
# Number of estimators created for every fold. The reason for 60 is not visible
# in this notebook (NOTE(review): presumably one estimator per prediction
# target -- confirm against Processor_*.CV_sklearn).
n_models_per_fold = 60


def _fresh_models(estimator_cls):
    """Return an n_folds x n_models_per_fold nested list of default-constructed estimators."""
    return [
        [estimator_cls() for _ in range(n_models_per_fold)]
        for _ in range(n_folds)
    ]


# NOTE(review): every estimator is built with no arguments, so no random seed
# is pinned in this cell.
cat_models = _fresh_models(CatBoostRegressor)
xgb_models = _fresh_models(XGBRegressor)
lgb_models = _fresh_models(LGBMRegressor)
rfr_models = _fresh_models(RandomForestRegressor)

# Wrap each grid in the CV helper that lives in the matching Processor_* module.
cat_cv = Processor_cat.CV_sklearn(cat_models, n_folds=n_folds)
xgb_cv = Processor_xgb.CV_sklearn(xgb_models, n_folds=n_folds)
lgb_cv = Processor_lgbm.CV_sklearn(lgb_models, n_folds=n_folds)
rfr_cv = Processor_rand.CV_sklearn(rfr_models, n_folds=n_folds)

# Train the four families one after another; each call returns the scores that
# are averaged below.
scores_lr_cat = cat_cv.train(X_trains_cat, y_trains_cat, verbose=1)
scores_lr_xgb = xgb_cv.train(X_trains_xgb, y_trains_xgb, verbose=1)
scores_lr_lgb = lgb_cv.train(X_trains_lgbm, y_trains_lgbm, verbose=1)
scores_lr_rfr = rfr_cv.train(X_trains_rand, y_trains_rand, verbose=1)

# Print the mean score of each family, in cat -> xgb -> lgb -> rfr order.
for scores in (scores_lr_cat, scores_lr_xgb, scores_lr_lgb, scores_lr_rfr):
    print(np.mean(scores))

5th model complete
10th model complete
15th model complete
20th model complete
25th model complete
30th model complete
35th model complete
40th model complete
45th model complete
50th model complete
55th model complete
60th model complete
3.1035224524140657


In [12]:
# Replace every element of each test collection, in place, with the result of
# calling .interpolate() on it. The collections are processed in
# cat -> xgb -> lgbm -> rand order and keep their identity, so later cells see
# the interpolated elements through the same X_tests_* names.
for frames in (X_tests_cat, X_tests_xgb, X_tests_lgbm, X_tests_rand):
    for position in range(len(frames)):
        frames[position] = frames[position].interpolate()

In [13]:
# Start from the CatBoost predictions, then accumulate the other three families
# onto the same object with in-place addition (xgb -> lgb -> rfr). The sum is
# turned into an average in the submission cell.
test_pred = cat_cv.predict(X_tests_cat)
for cv_runner, frames in (
    (xgb_cv, X_tests_xgb),
    (lgb_cv, X_tests_lgbm),
    (rfr_cv, X_tests_rand),
):
    test_pred += cv_runner.predict(frames)

In [16]:
# Load the sample submission and fill its 'answer' column with the summed
# predictions divided by 4 (four prediction sets were added into test_pred).
submission = pd.read_csv('../data/sample_submission.csv').assign(
    answer=test_pred / 4
)

# Write the result next to other submissions, prefixed with the date stamp and
# without the DataFrame index.
submission_path = f'../submission/{today}_ensenble2.csv'
submission.to_csv(submission_path, index=False)

In [None]:
# Keyword arguments intended for RandomForestRegressor.
# The original cell was left unfinished: it held only the truncated string
# 'n_estima' inside braces, which Python evaluates as a set rather than a
# parameter mapping. It is completed here as a dict keyed by the full
# parameter name.
# NOTE(review): 100 is a placeholder matching the default in
# scikit-learn >= 0.22 -- replace it with the value you actually want to try.
RF_params = {
    'n_estimators': 100,
}