In [1]:
import datetime as dt
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import xgboost as xgb
from catboost import CatBoostRegressor
from catboost import Pool, CatBoostClassifier
from sklearn import preprocessing
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer
from sklearn.preprocessing import MinMaxScaler

In [2]:
### read train data
# The training file is read exactly once; the path is relative to the
# notebook's working directory.
train = pd.read_csv('../../../Desktop/wisd/train_data.csv')
# NOTE(review): the test-set load is disabled, yet the ensemble/submission
# cells at the end of the notebook use `X_test` and `submit`, which no visible
# cell defines -- presumably they derive from this file; TODO confirm and
# re-enable together with the matching preprocessing.
#test = pd.read_csv('../../../Desktop/wisd/test_data.csv')

print(train.shape)
#print(test.shape)

(375734, 246)
(31354, 245)


## Data preparation

In [3]:
### check dtypes
# Printed explicitly: only the last expression of a cell is echoed, so a bare
# `set(...)` in the middle of the cell would be computed and thrown away.
print(set(train.dtypes))

### check categorical columns
# Object-dtype columns are the ones that need encoding/parsing before modelling.
object_columns = [col for col in train.columns if train[col].dtypes == "O"]
train[object_columns].head()

Unnamed: 0,startdate,climateregions__climateregion
0,9/1/14,BSh
1,9/2/14,BSh
2,9/3/14,BSh
3,9/4/14,BSh
4,9/5/14,BSh


**Categorical (object-dtype) data:**

1. `startdate` (dates stored as text, e.g. `9/1/14`)
2. `climateregions__climateregion`

In [32]:
### set index
def set_index_and_to_dt(dataset):
    """Index the frame by its 'index' column and parse 'startdate' as dates.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must contain an 'index' column and a 'startdate' column holding
        strings in month/day/two-digit-year form (format '%m/%d/%y').

    Returns
    -------
    pandas.DataFrame
        A new frame indexed by the former 'index' column, with 'startdate'
        converted to datetime values. The input frame is left as it was.
    """
    indexed = dataset.set_index('index')
    parsed_dates = pd.to_datetime(indexed['startdate'], format='%m/%d/%y')
    indexed['startdate'] = parsed_dates
    print('index and datetime set')
    return indexed

### categorical data encoding ("climate regions")
def encode_categorical_data(dataset):
    """Label-encode the 'climateregions__climateregion' column.

    Works on a copy so the caller's frame (which may itself be a slice of
    another frame) is never modified in place.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must contain a 'climateregions__climateregion' column.

    Returns
    -------
    pandas.DataFrame
        Copy of ``dataset`` with the climate-region column replaced by the
        integer codes produced by ``LabelEncoder.fit_transform``.
    """
    encoded = dataset.copy()
    # NOTE(review): the encoder is fit on this frame alone, so codes are only
    # comparable between frames that contain the same set of labels -- confirm
    # this holds for train vs. test.
    encoder = preprocessing.LabelEncoder()
    encoded['climateregions__climateregion'] = encoder.fit_transform(
        encoded['climateregions__climateregion']
    )
    print('categorical data encoded')
    return encoded

### location data handling (round "lat" and "lon")
def location_data_handle(dataset , round_to):
    """Round the 'lat' and 'lon' columns to a fixed number of decimals.

    Works on a copy so the caller's frame is not modified in place.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must contain numeric 'lat' and 'lon' columns.
    round_to : int
        Number of decimal places to keep.

    Returns
    -------
    pandas.DataFrame
        Copy of ``dataset`` with 'lat' and 'lon' rounded.
    """
    rounded = dataset.copy()
    rounded[['lat', 'lon']] = rounded[['lat', 'lon']].round(round_to)
    print('location data handled')
    return rounded


### impute missing values with the column mean
def na_imputer(dataset):
    """Fill missing numeric values with their column mean.

    The frame is sorted by ('lat', 'lon', 'startdate') and every numeric
    column has its NaNs replaced by that column's mean. Imputation is done on
    the sorted frame with index-aligned pandas operations, so each value stays
    on the row it belongs to.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must contain 'lat', 'lon' and 'startdate' columns.

    Returns
    -------
    pandas.DataFrame
        New, sorted frame. Non-numeric columns (object, datetime, ...) are
        passed through untouched. A column that is entirely NaN has no mean
        and is left as NaN.
    """
    df = dataset.sort_values(by = ['lat', 'lon' , 'startdate'])
    numeric_cols = df.select_dtypes(include = 'number').columns
    if len(numeric_cols) > 0:
        # Compute the means once, then fill; both sides share df's index so
        # no row can be shifted relative to its key columns.
        column_means = df[numeric_cols].mean()
        df[numeric_cols] = df[numeric_cols].fillna(column_means)
    return df

### handle na
def handle_na(dataset , fill_type):
    """Fill missing values using the requested strategy.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must contain 'lat', 'lon' and 'startdate' columns for the 'f', 'b'
        and 'mean' strategies.
    fill_type : str
        'f'    -> sort by (lat, lon, startdate), then forward-fill.
        'b'    -> sort by (lat, lon, startdate), then backward-fill.
        'mean' -> delegate to ``na_imputer``.
        Any other value leaves the data unchanged.

    Returns
    -------
    pandas.DataFrame
        A new frame; the input is never modified.
    """
    sort_keys = ['lat' , 'lon' , 'startdate']
    # NOTE(review): the fill runs over the whole sorted frame, not per
    # location, so a gap at the edge of one (lat, lon) series is filled from
    # the neighbouring location's rows.
    if fill_type == 'f':
        return dataset.sort_values(sort_keys).ffill()
    if fill_type == 'b':
        return dataset.sort_values(sort_keys).bfill()
    if fill_type == 'mean':
        return na_imputer(dataset)
    # Unknown strategy: hand back an untouched copy.
    return dataset.copy()

### handle datetime (column startdate)
def handle_datetime(dataset):
    """Derive calendar features from the 'startdate' column.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must contain a datetime-typed 'startdate' column.

    Returns
    -------
    pandas.DataFrame
        New frame with three extra columns: 'year', 'month' and 'day'.
        Note that 'day' holds the day of the year, not the day of the month.
    """
    start = dataset['startdate'].dt
    with_parts = dataset.assign(
        year=start.year,
        month=start.month,
        day=start.dayofyear,
    )
    print('datetime handled')
    return with_parts

### split predictor and predicted
def x_y_split(dataset , target_column):
    """Separate a frame into predictors and the target series.

    Parameters
    ----------
    dataset : pandas.DataFrame
    target_column : str
        Name of the column to predict; a KeyError is raised if it is absent.

    Returns
    -------
    tuple
        ``(x, y)`` where ``x`` holds every column except ``target_column``
        (original order kept) and ``y`` is the target column.
    """
    target_values = dataset[target_column]
    predictor_names = list(filter(lambda name: name != target_column, dataset.columns))
    predictors = dataset[predictor_names]
    return predictors , target_values



### preprocess data function
def preprocess_data(dataset , round_to_which_digit , fill_type , target_column):
    """Run the full preprocessing pipeline on a raw frame.

    Steps, in order: set the index and parse 'startdate'; split off the target
    (if present); label-encode the climate region; round lat/lon; fill missing
    values; add year/month/day features; drop 'startdate'; re-attach the
    target by index.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Raw frame with 'index', 'startdate', 'lat', 'lon' and
        'climateregions__climateregion' columns.
    round_to_which_digit : int
        Decimal places kept for 'lat' / 'lon'.
    fill_type : str
        Missing-value strategy forwarded to ``handle_na``.
    target_column : str or None
        Name of the target column. When the frame does not contain it (e.g. a
        test set), the target steps are skipped.

    Returns
    -------
    pandas.DataFrame
        The processed frame, with the target column merged back when it was
        present in the input.
    """
    temp_df = set_index_and_to_dt(dataset.copy()) ### set index and convert datetime

    # Decide explicitly whether there is a target to carry through, instead of
    # relying on a swallowed exception; genuine errors now surface.
    y = None
    if target_column is not None and target_column in temp_df.columns:
        temp_df , y = x_y_split(temp_df , target_column)

    temp_df = encode_categorical_data(temp_df) ### categorical data encoding
    temp_df = location_data_handle(temp_df , round_to_which_digit) ### longitude and latitude handling
    temp_df = handle_na(temp_df , fill_type) ### handle na
    temp_df = handle_datetime(temp_df) ### create year, month, day

    temp_df = temp_df.drop(['startdate'] , axis = 1) ### drop processed column

    if y is not None:
        # handle_na may have re-ordered the rows, so re-attach by index label.
        temp_df = temp_df.merge(y , left_index = True, right_index = True) ### merge y value back
    return temp_df

### preprocessing

In [26]:
### target column
# Name of the column to predict; it is passed to preprocess_data and used to
# separate X from y in the cells below.
target = 'contest-tmp2m-14d__tmp2m'

In [37]:
### preprocess training data -> round lat/lon to 4 decimal places + fill NA using "mean"
pro_train = preprocess_data(train , 4 , "mean" , target)

index and datetime set
categorical data encoded
location data handled
datetime handled


In [41]:
### split the data
# Split the *preprocessed* frame: the raw `train` frame still contains the
# text columns, the 'index' column and the missing values that
# preprocess_data was written to handle.
X = pro_train[[col for col in pro_train.columns if col != target]]
y = pro_train[target]

# 70/30 random hold-out with a fixed seed for reproducibility.
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 123)


In [None]:
### train the model - XGBoost
# `n_estimators` was previously misspelled, so the intended 15000 boosting
# rounds were never applied as the tree count.
model_xgb = xgb.XGBRegressor(booster = 'gbtree',
                             subsample = 0.8,
                             eta = 0.1, 
                             n_estimators = 15000,
                             colsample_bytree = 0.4,
                             max_depth = 4,
                             tree_method = 'hist',
                             eval_metric = 'rmse', 
                             objective = 'reg:squarederror')

# Track RMSE on both the training and hold-out splits, logging every 100 rounds.
model_xgb.fit(x_train, y_train, eval_set = [(x_train, y_train), (x_test, y_test)], verbose = 100)

In [None]:
### evaluate on the hold-out split (MSE and RMSE)
y_pred_xgb = model_xgb.predict(x_test)
# scikit-learn's signature is (y_true, y_pred).
mse = mean_squared_error(y_test, y_pred_xgb)
rmse = np.sqrt(mse)

print("MSE : " ,mse)
print("RMSE : " ,rmse)

### save model
# Create the output directory first so saving does not depend on it already
# being there.
Path("./models").mkdir(parents = True, exist_ok = True)
model_xgb.save_model("./models/model_mean_fill.json")

In [None]:
### train the model - CatBoost
# Only construct the model here; it is fit exactly once below. Fitting at
# construction time as well would train all 15000 iterations twice, with the
# first result discarded.
model_cat = CatBoostRegressor(n_estimators = 15000,
                              eval_metric = 'RMSE',
                              learning_rate = 0.1, 
                              verbose = 1,
                              random_seed = 0)

# Track the metric on both the training and hold-out splits.
model_cat.fit(x_train, y_train, eval_set = [(x_train, y_train), (x_test, y_test)], verbose = 50)

In [None]:
### evaluate on the hold-out split (MSE and RMSE)
y_pred_cat = model_cat.predict(x_test)
# scikit-learn's signature is (y_true, y_pred).
mse_cat = mean_squared_error(y_test, y_pred_cat)
rmse_cat = np.sqrt(mse_cat)

print("MSE : " ,mse_cat)
print("RMSE : " ,rmse_cat)

In [None]:
### ensemble the results
# Weighted blend of the two models' predictions: 20% XGBoost, 80% CatBoost.
# NOTE(review): `X_test` (capital X) is not defined in any visible cell -- the
# hold-out split above is named `x_test`, and the competition test file load
# is commented out. Presumably this should be the preprocessed test set with
# the same columns as the training features; TODO confirm and define it.
result = model_xgb.predict(X_test) * 0.2 + model_cat.predict(X_test) * 0.8
result

In [None]:
# write the blended predictions to submission.csv (without the row index)
# NOTE(review): `submit` is not defined in any visible cell -- presumably a
# frame holding the test-set row identifiers; TODO confirm and create it
# before this cell.
submit[target] = result
submit.to_csv('submission.csv', index = False)