In [None]:
import os
from math import sqrt

import numpy as np
import pandas as pd
from sklearn.ensemble import AdaBoostRegressor, RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler, MinMaxScaler, LabelEncoder
from tpot.builtins import StackingEstimator
from tpot.export_utils import set_param_recursive
from xgboost import XGBRegressor


# NOTE: the outcome column of the training file must be named 'INVC_CONT';
# that is the column this notebook separates from the features.
# The three CSV files are read in order: training set, test set, submission template.
train, test, submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/train_updated.csv',
        'data/test_updated.csv',
        'data/sample_submission.csv',
    )
)

In [None]:
# Separate the outcome column ('INVC_CONT') from the predictor columns.
train_target = train['INVC_CONT']
train_data = train.drop(columns=['INVC_CONT'])

# test_data is an independent copy of the loaded test frame.
test_data = test.copy()

KeyError: "['index'] not found in axis"

In [None]:
# Stack the training predictors on top of the test rows (row labels are kept as-is).
data = pd.concat(objs=[train_data, test_data], axis=0)
data

In [None]:
# Encode both grid-space-name columns with one shared LabelEncoder, fitted on
# the values of both columns, so a given name gets the same integer code in
# either column.
# pd.concat is used instead of Series.append, which was removed in pandas 2.0.
grid_cols = ['SEND_격자공간명', 'REC_격자공간명']
le = LabelEncoder().fit(pd.concat([data[name] for name in grid_cols]).sort_values())
for col in grid_cols:
    data[col] = le.transform(data[col])

# One-hot encode any remaining non-numeric columns.
data_one = pd.get_dummies(data)

# Min-max scale only the columns whose mean exceeds 100; every such column
# gets its own scaler.
# NOTE(review): each scaler is fitted on train and test rows together, so
# test-set statistics influence the training features — confirm this is intended.
cols = data_one.columns
for col in cols:
    if data_one[col].mean() > 100:
        mm = MinMaxScaler()
        column_2d = data_one[col].to_numpy().reshape(-1, 1)
        data_one[col] = mm.fit_transform(column_2d).ravel()

# Split back into train and test by position. The split point is computed
# explicitly because a negative slice (iloc[:-0]) would yield an empty training
# frame if the test set had no rows.
split_at = len(data_one) - len(test_data)
train_data, test_data = data_one.iloc[:split_at], data_one.iloc[split_at:]

In [None]:
# Display the training rows of the processed feature frame.
train_data

In [None]:
# Display the test rows of the processed feature frame.
test_data

In [None]:
# Stacked regression pipeline: two StackingEstimator stages (XGBoost, then
# AdaBoost), a RobustScaler, and a RandomForestRegressor as the final step.
# Average CV score on the training set was: -5.2776756339554325
xgb_stage = StackingEstimator(
    estimator=XGBRegressor(
        learning_rate=0.1,
        max_depth=3,
        min_child_weight=17,
        n_estimators=100,
        n_jobs=1,
        objective="reg:squarederror",
        subsample=0.6,
        verbosity=0,
    )
)
ada_stage = StackingEstimator(
    estimator=AdaBoostRegressor(learning_rate=1.0, loss="linear", n_estimators=100)
)
forest = RandomForestRegressor(
    bootstrap=False,
    max_features=0.05,
    min_samples_leaf=7,
    min_samples_split=18,
    n_estimators=100,
)
exported_pipeline = make_pipeline(xgb_stage, ada_stage, RobustScaler(), forest)

# Set random_state=42 on every pipeline step so runs are repeatable.
set_param_recursive(exported_pipeline.steps, 'random_state', 42)

In [None]:
# 5-fold cross-validation of the pipeline on the training rows.
# Prints the RMSE of every fold and, at the end, the mean RMSE over all folds.
folds = 5
kf = KFold(n_splits=folds, shuffle=True, random_state=42)
score = 0
for i, (train_index, test_index) in enumerate(kf.split(train_data)):

    # Positional row selection for this fold's fitting and held-out parts.
    X_train, X_test = train_data.iloc[train_index], train_data.iloc[test_index]
    y_train, y_test = train_target.iloc[train_index], train_target.iloc[test_index]

    exported_pipeline.fit(X_train, y_train)

    y_pred = exported_pipeline.predict(X_test)

    # Compute the fold RMSE once and reuse it for both the report and the
    # running average; mean_squared_error takes (y_true, y_pred) in that order.
    fold_rmse = sqrt(mean_squared_error(y_test, y_pred))
    print('Fold {} rmse {}'.format(i+1, fold_rmse))
    score += fold_rmse/folds

print('Avg rmse', score)

In [None]:
# Earlier pipeline variant kept for reference (recorded score: 5.2537; metric not stated, presumably RMSE)
# exported_pipeline = make_pipeline(
#     StackingEstimator(estimator=XGBRegressor(learning_rate=0.1, max_depth=3, min_child_weight=17, n_estimators=100, n_jobs=1, objective="reg:squarederror", subsample=0.6, verbosity=0)),
#     StackingEstimator(estimator=AdaBoostRegressor(learning_rate=1.0, loss="linear", n_estimators=100)),
#     # StackingEstimator(estimator=LGBMRegressor()),
#     RobustScaler(),
#     RandomForestRegressor(bootstrap=False, max_features=0.05, min_samples_leaf=7, min_samples_split=18, n_estimators=100)
# )

In [None]:
# Fit the pipeline on all training rows, then predict for the test rows.
prediction = exported_pipeline.fit(train_data, train_target).predict(test_data)

In [None]:
# Put the test-set predictions into the submission template.
submission['INVC_CONT'] = prediction

# to_csv does not create missing folders, so make sure the output directory
# exists before writing; otherwise the last step fails on a fresh checkout.
output_path = 'submission/final_pred_ext.csv'
os.makedirs(os.path.dirname(output_path), exist_ok=True)
submission.to_csv(output_path, index=False)