In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt
import prepro_util
import os

from sklearn import preprocessing
from sklearn.decomposition import PCA
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelBinarizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV
from sklearn.impute import SimpleImputer

import catboost
from catboost import CatBoostRegressor
from catboost import Pool, CatBoostClassifier
import xgboost as xgb

In [4]:
### read train data
# Path is relative to the notebook's working directory.
train_csv_path = '../../../../Desktop/wids/train_data.csv'
train = pd.read_csv(train_csv_path)

In [2]:
### read test data
# Path is relative to the notebook's working directory.
test_csv_path = '../../../../Desktop/wids/test_data.csv'
test = pd.read_csv(test_csv_path)

### Preprocessing

In [5]:
### target column
# Name of the column the regressors are trained to predict; it is passed to
# the prepro_util helpers and used to separate features from labels.
target = "contest-tmp2m-14d__tmp2m"

In [6]:
### preprocess train data
# Run the project's preprocessing helper on the raw training frame.
# NOTE(review): the meaning of the positional arguments `4` and "mean" is
# defined inside prepro_util and is not visible here — confirm there.
pre_train = prepro_util.preprocess_data(train, 4, "mean", target)

index and datetime set
categorical data encoded
location data handled
datetime handled


## PCA

In [13]:
# Apply the project's PCA helper to the preprocessed training frame.
# NOTE(review): 0.95 is presumably the explained-variance ratio to keep —
# confirm against prepro_util.PCA_transform.
PCA_pre_train = prepro_util.PCA_transform(pre_train, 0.95, target)

In [None]:
### split the data
# Features are every column except the target; labels are the target column.
# NOTE(review): the split is taken from `pre_train`, not from `PCA_pre_train`
# computed above, although the saved models are named "PCA_95_*" and the
# prediction cells feed PCA components to them — confirm which frame is intended.
feature_columns = list(filter(lambda name: name != target, pre_train.columns))
X = pre_train[feature_columns]
y = pre_train[target]

# 70/30 hold-out split with a fixed seed so the partition is repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state = 123,
    test_size = 0.3,
)

## XGBoost

In [None]:
### train the model - XGBoost
# Gradient-boosted tree regressor fitted on the training split. Both the
# training and the held-out split are passed as evaluation sets so RMSE is
# reported for each of them while boosting.
model_xgb = xgb.XGBRegressor(booster = 'gbtree',
                             subsample = 0.8,
                             eta = 0.1,
                             # Fixed spelling: this was `n_estimaters`, which is not a
                             # recognised parameter, so the requested 15000 boosting
                             # rounds were never applied.
                             n_estimators = 15000,
                             colsample_bytree = 0.4,
                             max_depth = 5,
                             tree_method = 'hist',
                             eval_metric = 'rmse',
                             objective = 'reg:squarederror')

# verbose = 100 prints the evaluation metric every 100 boosting rounds.
model_xgb.fit(x_train, y_train, eval_set = [(x_train, y_train), (x_test, y_test)], verbose = 100)

In [None]:
### evaluate on the held-out split with mean squared error (MSE)
y_pred_xgb = model_xgb.predict(x_test)
# scikit-learn's signature is (y_true, y_pred); MSE is symmetric, so the value
# is unchanged, but the conventional order avoids surprises if the metric is swapped.
mse = mean_squared_error(y_test, y_pred_xgb)

print("XGboost MSE : " ,mse)


### make sure the model directory exists
# exist_ok=True tolerates an already-existing directory while still surfacing
# real failures (e.g. permission errors) instead of silently swallowing them.
os.makedirs('models', exist_ok = True)

### save model
model_xgb.save_model("./models/PCA_95_xgb_md_5.json")

## CatBoost

In [None]:
### train the model - CatBoost
model_cat = CatBoostRegressor(n_estimators = 15000,
                              eval_metric = 'RMSE',
                              learning_rate = 0.1,
                              verbose = 1,
                              random_seed = 0)

# Fit exactly once. The original chained an extra `.fit(x_train, y_train)` onto
# the constructor and then called `fit` again here, so the full 15000-iteration
# training ran twice and the first result was thrown away.
# verbose = 50 overrides the constructor setting and logs every 50 iterations.
model_cat.fit(x_train, y_train, eval_set = [(x_train, y_train), (x_test, y_test)], verbose = 50)

### save model
# Create the target directory here as well so this cell does not depend on the
# XGBoost cell having been run first.
os.makedirs('models', exist_ok = True)
# NOTE(review): no `format` argument is passed, so CatBoost's default model
# format is written despite the ".json" suffix; the loading cell also relies on
# the default format — confirm this is intended before changing either side.
model_cat.save_model("./models/PCA_95_cat.json")

In [None]:
### evaluate on the held-out split with mean squared error (MSE)
y_pred_cat = model_cat.predict(x_test)
# Keep the score in a variable: previously the result was discarded and the
# print statement showed the `mean_squared_error` function object instead.
mse_cat = mean_squared_error(y_test, y_pred_cat)

print("CATboost MSE : ", mse_cat)

In [None]:
### ensemble the results
# Weighted blend of the two regressors on the held-out split
# (20% XGBoost, 80% CatBoost). The split variable is `x_test`; the previous
# `X_test` was never defined and raised a NameError.
result = model_xgb.predict(x_test) * 0.2 + model_cat.predict(x_test) * 0.8
result

## Predict

In [65]:
### test data
# Same preprocessing helper call as for the training frame.
pre_test = prepro_util.preprocess_data(test, 4, "mean", target)

### PCA
# NOTE(review): this fits a *new* PCA on the test frame rather than reusing the
# projection produced for the training data, so the components need not match
# the ones the models were trained on — confirm and, if possible, reuse the
# fitted training transform.
n_components = 49
component_name_list = [f"component {idx}" for idx in range(1, n_components + 1)]

pca = PCA(n_components = n_components)
PCA_pre_test = pd.DataFrame(pca.fit_transform(pre_test), columns = component_name_list)

index and datetime set
categorical data encoded
location data handled
datetime handled


### Load the saved models

In [62]:
### XGboost
# Load the saved XGBoost model into a low-level Booster; predictions with it
# require an xgb.DMatrix input.
xgb_model = xgb.Booster()
xgb_model.load_model("./models/PCA_95_xgb_md_5.json")

### CATboost
# The file was written from a CatBoostRegressor, so load it into the same
# estimator type (it was previously loaded into a CatBoostClassifier).
cat_model = CatBoostRegressor()
cat_model = cat_model.load_model("./models/PCA_95_cat.json")

In [66]:
### Xgboost calculation
# The Booster API predicts on a DMatrix, so wrap the PCA-transformed test frame first.
test_dataset = xgb.DMatrix(data = PCA_pre_test)
xgb_predict_y = xgb_model.predict(data = test_dataset)

In [67]:
### CATboost calculation
# Request the raw model output for the PCA-transformed test frame.
cat_predict_y = cat_model.predict(
    data = PCA_pre_test,
    prediction_type = 'RawFormulaVal',
)

In [75]:
### combine
# Only the XGBoost predictions are used for the final output here;
# `cat_predict_y` is computed above but not blended in.
combine = xgb_predict_y

In [76]:
# Attach the predictions to a copy of the raw test frame, move the row index
# into a column, and keep only the 'index' and target columns.
output_columns = ['index', target]
result = test.assign(**{target: combine}).reset_index()

result = pd.DataFrame(result[output_columns], columns = output_columns)

In [77]:
### create directory
# exist_ok=True makes re-running the cell safe while still raising on real
# failures (e.g. missing permissions) instead of hiding them behind a bare except.
os.makedirs("output", exist_ok = True)

# Write the two-column prediction frame without the pandas row index.
result.to_csv("./output/PCA_95_xgb_only.csv" , index = False)