In [112]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt
import prepro_util
import os

from sklearn import preprocessing
from sklearn.decomposition import PCA
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelBinarizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV
from sklearn.impute import SimpleImputer

from catboost import CatBoostRegressor
from catboost import Pool, CatBoostClassifier
import xgboost as xgb

In [2]:
### read train data
train = pd.read_csv('../../../Desktop/wids/train_data.csv')

In [84]:
### read test data
test = pd.read_csv('../../../Desktop/wids/test_data.csv')

## PCA

In [70]:
def PCA_transform(dataset , variance , target):
    
    ### split predictor and predicted
    x , y = prepro_util.x_y_split(dataset , target)

    ### scale
    sc = preprocessing.StandardScaler()
    scaled_x = pd.DataFrame(sc.fit_transform(x) , columns = x.columns)

    ### PCA
    pca_model = PCA(n_components = variance , svd_solver = 'full')
    transformed = pca_model.fit_transform(scaled_x)
    
    ### make dataframe with new components
    columns = []
    for i in range(1 , transformed.shape[1] + 1):
        columns.append(f"component {i}")
    transformed_df = pd.DataFrame(transformed , columns = columns)

    ### combine transformed x with y
    transformed_df[target] = y
    
    return transformed_df

### preprocessing

In [5]:
### target column
target = 'contest-tmp2m-14d__tmp2m'

In [37]:
### test data
pre_train = prepro_util.preprocess_data(x_train , 4 , "mean" , target)

index and datetime set
categorical data encoded
location data handled
datetime handled


### PCA

In [92]:
PCA_pre_train = PCA_transform(pre_train , 0.95 , target)

In [93]:
### split the data
X = pre_train[[col for col in pre_train.columns if col != target]]
y = pre_train[target]

x_train, x_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 123)

### XGBOOST

In [105]:
### train the model - XGBoost
model_xgb = xgb.XGBRegressor(booster = 'gbtree',
                             subsample = 0.8,
                             eta = 0.1, 
                             n_estimaters = 15000,
                             colsample_bytree = 0.4,
                             max_depth = 5,
                             tree_method = 'hist',
                             eval_metric = 'rmse', 
                             objective = 'reg:squarederror')

model_xgb.fit(x_train, y_train, eval_set = [(x_train, y_train), (x_test, y_test)], verbose = 100)

Parameters: { "n_estimaters" } are not used.

[0]	validation_0-rmse:13.59272	validation_1-rmse:13.58423
[99]	validation_0-rmse:1.01022	validation_1-rmse:1.02123


In [106]:
### use RMSE to evaluate
y_pred_xgb = model_xgb.predict(x_test)
mse = mean_squared_error(y_pred_xgb, y_test)

print("MSE : " ,mse)

### save model
model_xgb.save_model("./models/PCA_95_xgb_md_5.json")

MSE :  1.0429156895041272


## XGB predict

#### Load model

In [107]:
xgb_model = xgb.Booster()
xgb_model.load_model("./models/PCA_95_xgb_md_5.json")

#### Predict

In [108]:
test_dataset = xgb.DMatrix(pre_test)
predict_y = xgb_model.predict(test_dataset)

#### Output result

In [109]:
xgb_result = test.copy()
xgb_result[target] = predict_y
xgb_result = xgb_result.reset_index()
xgb_result = pd.DataFrame(xgb_result[['index' , target]] , columns = ['index' , target])

#### Store result

In [111]:
### create directory
try:
    os.makedirs("output")
except:
    pass

xgb_result.to_csv("./output/PCA_95_xgb_mean_fill_maxD_5.csv" , index = False)