In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt
import prepro_util
import os

from sklearn import preprocessing
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelBinarizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer

import catboost
from catboost import CatBoostRegressor
from catboost import Pool, CatBoostClassifier

In [2]:
### read train data
train = pd.read_csv('../../../../Desktop/wids/train_data.csv')

In [None]:
### read test data
test = pd.read_csv('../../../../Desktop/wids/test_data.csv')

## Data preprocessing

In [3]:
### target column
target = 'contest-tmp2m-14d__tmp2m'

In [4]:
### preprocess train data
pre_train = prepro_util.preprocess_data(train , 4 , "mean" , target)

index and datetime set
location data handled
datetime handled


### Split data

In [58]:
x_train , x_test , y_train , y_test = train_test_split(pre_train[[col for col in pre_train.columns if col != target]],\
                                                       pre_train[target] , test_size = 0.3)

## CAT model training

In [42]:
def cat_boost(depth , rate , x_train , y_train):
### train the model - CatBoost
    model = CatBoostRegressor(cat_features = ['climateregions__climateregion'],
                                  max_depth = depth,
                                  n_estimators = 20000,
                                  eval_metric = 'RMSE',
                                  learning_rate = rate,
                                  verbose = 1,
                                  random_seed = 0).fit(x_train, y_train)
     
    return model

In [54]:
model_list = []
spec_list = []
testing_list = []

### loops
for depth in range(3 , 11): ### max depth adjustment
    for rate in range(1 , 9): ### learning rate adjustment
        ### model making
        temp_model = cat_boost(depth, rate / 100 ,\
                               small_train[[col for col in small_train.columns if col != target]],\
                               small_train[target])
        ### prediciton
        ### use RMSE to evaluate
        y_pred = temp_model.predict(x_test)
        mse = mean_squared_error(y_pred, y_test)        
        
        ### store result
        test_string = f"max_d_{depth}_learn_r_{rate}"
        model_list.append(temp_model)
        spec_list.append({"spec" : test_string})
        testing_list.append([f"{test_string}" , mse])
        
try:
    os.mkdir('models')
except:
    pass

### save model
for model_index in range(len(model_list)):
    model_list[model_index].save_model(f"./models/CATboost_{spec_list[model_index]['spec']}.json")
    
    
### RMSE test result
df = pd.DataFrame(testing_list , columns = ['model_name' , 'RMSE'])
df.to_csv("model_test_result.csv" , index = False)

0:	learn: 0.2253855	total: 570us	remaining: 10.8ms
1:	learn: 0.2248841	total: 1.07ms	remaining: 9.65ms
2:	learn: 0.2244210	total: 1.63ms	remaining: 9.22ms
3:	learn: 0.2239503	total: 2.1ms	remaining: 8.38ms
4:	learn: 0.2232865	total: 2.54ms	remaining: 7.63ms
5:	learn: 0.2226386	total: 2.93ms	remaining: 6.84ms
6:	learn: 0.2218792	total: 3.41ms	remaining: 6.34ms
7:	learn: 0.2212832	total: 4.14ms	remaining: 6.21ms
8:	learn: 0.2206550	total: 4.72ms	remaining: 5.77ms
9:	learn: 0.2201666	total: 5.31ms	remaining: 5.31ms
10:	learn: 0.2197525	total: 5.91ms	remaining: 4.84ms
11:	learn: 0.2189805	total: 6.53ms	remaining: 4.36ms
12:	learn: 0.2183337	total: 7.08ms	remaining: 3.81ms
13:	learn: 0.2177314	total: 7.54ms	remaining: 3.23ms
14:	learn: 0.2170822	total: 8.06ms	remaining: 2.69ms
15:	learn: 0.2164831	total: 8.61ms	remaining: 2.15ms
16:	learn: 0.2158281	total: 9.23ms	remaining: 1.63ms
17:	learn: 0.2152915	total: 9.81ms	remaining: 1.09ms
18:	learn: 0.2146840	total: 10.4ms	remaining: 549us
19:	le

3:	learn: 0.2189523	total: 3.5ms	remaining: 14ms
4:	learn: 0.2173400	total: 4.33ms	remaining: 13ms
5:	learn: 0.2155450	total: 5.34ms	remaining: 12.5ms
6:	learn: 0.2137181	total: 6.14ms	remaining: 11.4ms
7:	learn: 0.2121498	total: 6.99ms	remaining: 10.5ms
8:	learn: 0.2108483	total: 8.19ms	remaining: 10ms
9:	learn: 0.2089507	total: 9ms	remaining: 9ms
10:	learn: 0.2070560	total: 9.73ms	remaining: 7.96ms
11:	learn: 0.2054397	total: 10.5ms	remaining: 7.01ms
12:	learn: 0.2036865	total: 11.3ms	remaining: 6.09ms
13:	learn: 0.2018518	total: 12ms	remaining: 5.14ms
14:	learn: 0.2000529	total: 12.7ms	remaining: 4.24ms
15:	learn: 0.1988576	total: 13.2ms	remaining: 3.29ms
16:	learn: 0.1971428	total: 13.8ms	remaining: 2.44ms
17:	learn: 0.1960631	total: 14.4ms	remaining: 1.6ms
18:	learn: 0.1942755	total: 15.3ms	remaining: 803us
19:	learn: 0.1930928	total: 16.1ms	remaining: 0us
0:	learn: 0.2235783	total: 623us	remaining: 11.8ms
1:	learn: 0.2209012	total: 1.22ms	remaining: 11ms
2:	learn: 0.2186649	total

0:	learn: 0.2230951	total: 1.03ms	remaining: 19.7ms
1:	learn: 0.2205107	total: 2.34ms	remaining: 21.1ms
2:	learn: 0.2175193	total: 3.46ms	remaining: 19.6ms
3:	learn: 0.2147623	total: 4.27ms	remaining: 17.1ms
4:	learn: 0.2120071	total: 5.36ms	remaining: 16.1ms
5:	learn: 0.2093175	total: 7.35ms	remaining: 17.2ms
6:	learn: 0.2060810	total: 8.87ms	remaining: 16.5ms
7:	learn: 0.2029119	total: 10.6ms	remaining: 15.9ms
8:	learn: 0.1998003	total: 11.8ms	remaining: 14.5ms
9:	learn: 0.1970482	total: 13.2ms	remaining: 13.2ms
10:	learn: 0.1946931	total: 14.3ms	remaining: 11.7ms
11:	learn: 0.1923071	total: 15.1ms	remaining: 10.1ms
12:	learn: 0.1899812	total: 16.3ms	remaining: 8.75ms
13:	learn: 0.1880145	total: 17ms	remaining: 7.27ms
14:	learn: 0.1857597	total: 17.6ms	remaining: 5.86ms
15:	learn: 0.1836293	total: 18.6ms	remaining: 4.65ms
16:	learn: 0.1812588	total: 19.3ms	remaining: 3.4ms
17:	learn: 0.1794277	total: 20.4ms	remaining: 2.27ms
18:	learn: 0.1772781	total: 22ms	remaining: 1.16ms
19:	lear

0:	learn: 0.2230951	total: 2.09ms	remaining: 39.7ms
1:	learn: 0.2203948	total: 4.72ms	remaining: 42.5ms
2:	learn: 0.2174219	total: 6.49ms	remaining: 36.7ms
3:	learn: 0.2146500	total: 7.59ms	remaining: 30.4ms
4:	learn: 0.2113066	total: 8.6ms	remaining: 25.8ms
5:	learn: 0.2078147	total: 9.22ms	remaining: 21.5ms
6:	learn: 0.2052486	total: 11.1ms	remaining: 20.6ms
7:	learn: 0.2022325	total: 11.8ms	remaining: 17.8ms
8:	learn: 0.1997872	total: 13ms	remaining: 15.9ms
9:	learn: 0.1967975	total: 14.1ms	remaining: 14.1ms
10:	learn: 0.1938412	total: 15ms	remaining: 12.3ms
11:	learn: 0.1909805	total: 16.5ms	remaining: 11ms
12:	learn: 0.1890147	total: 17.8ms	remaining: 9.57ms
13:	learn: 0.1864274	total: 18.5ms	remaining: 7.91ms
14:	learn: 0.1840816	total: 19.9ms	remaining: 6.64ms
15:	learn: 0.1819013	total: 20.6ms	remaining: 5.16ms
16:	learn: 0.1795860	total: 22ms	remaining: 3.89ms
17:	learn: 0.1775792	total: 23ms	remaining: 2.55ms
18:	learn: 0.1754698	total: 25.3ms	remaining: 1.33ms
19:	learn: 0.1

0:	learn: 0.2213960	total: 2.42ms	remaining: 46ms
1:	learn: 0.2168800	total: 6.39ms	remaining: 57.5ms
2:	learn: 0.2122846	total: 7.61ms	remaining: 43.1ms
3:	learn: 0.2079660	total: 9.15ms	remaining: 36.6ms
4:	learn: 0.2028148	total: 10.4ms	remaining: 31.2ms
5:	learn: 0.1975011	total: 11.2ms	remaining: 26.1ms
6:	learn: 0.1935909	total: 14ms	remaining: 26ms
7:	learn: 0.1890989	total: 15.8ms	remaining: 23.7ms
8:	learn: 0.1854367	total: 17.3ms	remaining: 21.2ms
9:	learn: 0.1810495	total: 18.7ms	remaining: 18.7ms
10:	learn: 0.1774285	total: 20.6ms	remaining: 16.8ms
11:	learn: 0.1739790	total: 22.5ms	remaining: 15ms
12:	learn: 0.1707200	total: 24.3ms	remaining: 13.1ms
13:	learn: 0.1673056	total: 26.3ms	remaining: 11.3ms
14:	learn: 0.1642436	total: 27ms	remaining: 9.01ms
15:	learn: 0.1609587	total: 28.7ms	remaining: 7.18ms
16:	learn: 0.1575972	total: 29.5ms	remaining: 5.2ms
17:	learn: 0.1546156	total: 31.2ms	remaining: 3.47ms
18:	learn: 0.1515233	total: 33.3ms	remaining: 1.75ms
19:	learn: 0.1

19:	learn: 0.2028066	total: 29.6ms	remaining: 0us
0:	learn: 0.2242299	total: 1.9ms	remaining: 36.1ms
1:	learn: 0.2225144	total: 7.88ms	remaining: 70.9ms
2:	learn: 0.2207082	total: 9.3ms	remaining: 52.7ms
3:	learn: 0.2190087	total: 10.6ms	remaining: 42.5ms
4:	learn: 0.2170278	total: 12.4ms	remaining: 37.1ms
5:	learn: 0.2148693	total: 13.1ms	remaining: 30.5ms
6:	learn: 0.2132773	total: 16ms	remaining: 29.8ms
7:	learn: 0.2113943	total: 16.7ms	remaining: 25ms
8:	learn: 0.2098598	total: 17.7ms	remaining: 21.6ms
9:	learn: 0.2079671	total: 19.1ms	remaining: 19.1ms
10:	learn: 0.2060790	total: 20ms	remaining: 16.3ms
11:	learn: 0.2042424	total: 21.1ms	remaining: 14.1ms
12:	learn: 0.2029799	total: 22.5ms	remaining: 12.1ms
13:	learn: 0.2012959	total: 23.8ms	remaining: 10.2ms
14:	learn: 0.1998038	total: 28ms	remaining: 9.34ms
15:	learn: 0.1984056	total: 30.4ms	remaining: 7.59ms
16:	learn: 0.1966737	total: 32.1ms	remaining: 5.66ms
17:	learn: 0.1947951	total: 32.6ms	remaining: 3.62ms
18:	learn: 0.193

0:	learn: 0.2242299	total: 1.96ms	remaining: 37.2ms
1:	learn: 0.2225144	total: 7.91ms	remaining: 71.2ms
2:	learn: 0.2207082	total: 9.64ms	remaining: 54.6ms
3:	learn: 0.2190087	total: 11.3ms	remaining: 45.1ms
4:	learn: 0.2170278	total: 12.1ms	remaining: 36.4ms
5:	learn: 0.2148693	total: 12.9ms	remaining: 30.1ms
6:	learn: 0.2132773	total: 15.7ms	remaining: 29.2ms
7:	learn: 0.2113943	total: 16.2ms	remaining: 24.3ms
8:	learn: 0.2098598	total: 17.4ms	remaining: 21.3ms
9:	learn: 0.2079671	total: 18.4ms	remaining: 18.4ms
10:	learn: 0.2060790	total: 19.1ms	remaining: 15.7ms
11:	learn: 0.2042424	total: 21.2ms	remaining: 14.2ms
12:	learn: 0.2029799	total: 22.5ms	remaining: 12.1ms
13:	learn: 0.2012959	total: 23.4ms	remaining: 10ms
14:	learn: 0.1998038	total: 26.3ms	remaining: 8.76ms
15:	learn: 0.1984056	total: 28.1ms	remaining: 7.01ms
16:	learn: 0.1966737	total: 29.2ms	remaining: 5.14ms
17:	learn: 0.1947951	total: 29.6ms	remaining: 3.29ms
18:	learn: 0.1937099	total: 30ms	remaining: 1.58ms
19:	lea

## Predict

In [None]:
### test data
pre_test = prepro_util.preprocess_data(test , 4 , "mean" , target)

### PCA
n_components = 49
component_name_list = []
for i in range(1 , n_components + 1):
    component_name_list.append(f"component {i}")
    
pca = PCA(n_components = 49)
PCA_pre_test = pca.fit_transform(pre_test)
PCA_pre_test = pd.DataFrame(PCA_pre_test , columns = component_name_list)

## Models import and prediction output

In [None]:
### CATboost
cat_model = catboost.CatBoostClassifier()
cat_model = cat_model.load_model("../models/PCA_95_cat.json")

In [None]:
### CATboost calculation
cat_predict_y = cat_model.predict(PCA_pre_test , prediction_type = 'RawFormulaVal')

In [None]:
result = test.copy()
result[target] = cat_predict_y
result = result.reset_index()

result = pd.DataFrame(result[['index' , target]] , columns = ['index' , target])

In [None]:
### create directory
try:
    os.makedirs("output")
except:
    pass

result.to_csv("../output/PCA_95_xgb_only.csv" , index = False)