In [86]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split
import xgboost as xgb
from lightgbm import LGBMRegressor, LGBMClassifier
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import roc_auc_score, precision_score, mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from time import time
import optuna
from lightgbm import early_stopping
from lightgbm import log_evaluation
import os

In [61]:
train_url = 'data/train_data.csv'
test_url = 'data/test_data.csv'

train = pd.read_csv(train_url)
test = pd.read_csv(test_url)

In [62]:
train.head()

Unnamed: 0,index,lat,lon,startdate,contest-pevpr-sfc-gauss-14d__pevpr,nmme0-tmp2m-34w__cancm30,nmme0-tmp2m-34w__cancm40,nmme0-tmp2m-34w__ccsm30,nmme0-tmp2m-34w__ccsm40,nmme0-tmp2m-34w__cfsv20,...,wind-vwnd-925-2010-11,wind-vwnd-925-2010-12,wind-vwnd-925-2010-13,wind-vwnd-925-2010-14,wind-vwnd-925-2010-15,wind-vwnd-925-2010-16,wind-vwnd-925-2010-17,wind-vwnd-925-2010-18,wind-vwnd-925-2010-19,wind-vwnd-925-2010-20
0,0,0.0,0.833333,9/1/14,237.0,29.02,31.64,29.57,30.73,29.71,...,-27.68,-37.21,8.32,9.56,-2.03,48.13,28.09,-13.5,11.9,4.58
1,1,0.0,0.833333,9/2/14,228.9,29.02,31.64,29.57,30.73,29.71,...,-21.13,-36.57,8.77,21.17,4.44,48.6,27.41,-23.77,15.44,3.42
2,2,0.0,0.833333,9/3/14,220.69,29.02,31.64,29.57,30.73,29.71,...,-10.72,-34.16,6.99,32.16,5.01,48.53,19.21,-33.16,15.11,4.82
3,3,0.0,0.833333,9/4/14,225.28,29.02,31.64,29.57,30.73,29.71,...,0.33,-31.04,6.17,39.66,-1.41,50.59,8.29,-37.22,18.24,9.74
4,4,0.0,0.833333,9/5/14,237.24,29.02,31.64,29.57,30.73,29.71,...,9.83,-31.8,7.47,38.62,-5.21,54.73,-2.58,-42.3,21.91,10.95


In [79]:
test.head()

Unnamed: 0,index,lat,lon,startdate,contest-pevpr-sfc-gauss-14d__pevpr,nmme0-tmp2m-34w__cancm30,nmme0-tmp2m-34w__cancm40,nmme0-tmp2m-34w__ccsm30,nmme0-tmp2m-34w__ccsm40,nmme0-tmp2m-34w__cfsv20,...,wind-vwnd-925-2010-12,wind-vwnd-925-2010-13,wind-vwnd-925-2010-14,wind-vwnd-925-2010-15,wind-vwnd-925-2010-16,wind-vwnd-925-2010-17,wind-vwnd-925-2010-18,wind-vwnd-925-2010-19,wind-vwnd-925-2010-20,loc
0,375734,0.0,0.833333,11/1/22,339.88,30.88,30.92,29.17,31.02,29.47,...,-39.77,-29.25,40.88,-8.31,14.91,-24.62,31.05,-23.69,6.27,0.0_0.8333
1,375735,0.0,0.833333,11/2/22,334.63,30.88,30.92,29.17,31.02,29.47,...,-43.14,-28.62,45.37,-5.42,16.97,-23.94,28.84,-20.61,14.16,0.0_0.8333
2,375736,0.0,0.833333,11/3/22,337.83,30.88,30.92,29.17,31.02,29.47,...,-44.22,-27.67,49.76,-1.31,21.44,-19.06,26.85,-16.78,13.42,0.0_0.8333
3,375737,0.0,0.833333,11/4/22,345.81,30.88,30.92,29.17,31.02,29.47,...,-49.47,-19.32,52.62,-0.44,21.65,-23.12,23.7,-18.62,10.69,0.0_0.8333
4,375738,0.0,0.833333,11/5/22,357.39,30.88,30.92,29.17,31.02,29.47,...,-56.07,-9.89,51.23,-7.57,19.86,-30.56,20.66,-25.08,19.64,0.0_0.8333


# Pre-processing

In [93]:
train['loc'] = train['lat'].round(4).astype('str') + '_' + train['lon'].round(4).astype('str')
test['loc'] = test['lat'].round(4).astype('str') + '_' + test['lon'].round(4).astype('str')

KeyError: 'lat'

In [94]:
# Test if any loc exists in test but not in train

n = len(list(set(test['loc']) - set(train['loc'])))
print(f'Number of loc exists in test but not in train: {n}')

Number of loc exists in test but not in train: 0


In [65]:
loc_ls = train['loc'].unique()

In [89]:
train.drop(columns=['lat', 'lon', 'startdate'], inplace=True)
test.drop(columns=['lat', 'lon', 'startdate'], inplace=True)

## Label Encoding

In [70]:
le = LabelEncoder()

le.fit(train['climateregions__climateregion'])

train['climateregions__climateregion'] = le.transform(train['climateregions__climateregion'])
test['climateregions__climateregion'] = le.transform(test['climateregions__climateregion'])

## Drop drifted columns

In [71]:
from scipy.stats import ks_2samp

In [72]:
def fill_nulls(df):

    tmp = pd.DataFrame(df.isna().sum()) \
        .rename(columns={0: 'cnt'})

    null_ls = tmp[tmp.cnt != 0].reset_index()

    mean_values = {}

    for i in null_ls['index']:
        mean = df.loc[:, i].mean()
        mean_values[i] = mean

    for k, v in mean_values.items():
        df[k].fillna(value=v, inplace=True)

In [103]:
def train_model(data, target, model):
    train_x, test_x, train_y, test_y = train_test_split(data, target, test_size=0.2, random_state=42)

    model.fit(train_x, train_y,
              eval_set=[(test_x,test_y)],
              callbacks=[early_stopping(100), log_evaluation(100)]
              )
    return model

In [110]:
submit_path = 'result.csv'

start = time()

scaler = StandardScaler()
start_idx = 375734

for loc in loc_ls:
    print('---------------------------')
    print(f'Processing {loc}')

    loc_train = train[train['loc'] == loc].drop(columns=['contest-tmp2m-14d__tmp2m', 'loc'])

    fill_nulls(loc_train)

    loc_target = train.loc[train['loc'] == loc, 'contest-tmp2m-14d__tmp2m']

    loc_test = test[test['loc'] == loc].drop(columns='loc')

    # thrifted_col = []
    #
    # for col in loc_train.columns:
    #     pval = ks_2samp(loc_train[col], loc_test[col]).pvalue
    #
    #     if pval < 0.1:
    #         thrifted_col.append(col)
    #
    # loc_train.drop(columns=thrifted_col, inplace=True)
    # loc_test.drop(columns=thrifted_col, inplace=True)

    scaler.fit(loc_train)
    loc_train = scaler.transform(loc_train)
    loc_test = scaler.transform(loc_test)

    lgbm = LGBMRegressor(
        n_estimators=5000,
        n_jobs=-1,
    #     reg_alpha=0.42,
        reg_lambda=0.01,
    #     colsample_bytree=0.5,
    #     subsample=0.8,
        learning_rate=0.005,
        # max_depth=20,
    #     num_leaves=430,
    #     min_child_samples=117
    )

    model = train_model(loc_train, loc_target, lgbm)
    pred_y = model.predict(loc_test)
    pred_y_train = model.predict(loc_train)

    print(f'RMSE on train set: {mean_squared_error(loc_target, pred_y_train)}')

    end_idx = start_idx + len(pred_y)
    submit_df = pd.DataFrame({'contest-tmp2m-14d__tmp2m': pred_y, 'index': range(start_idx, end_idx)})

    if not os.path.exists(submit_path):
        submit_df.to_csv(submit_path, index=False)
    else:
        submit_df.to_csv(submit_path, mode='a', header=False, index=False)

    print(f'Done {loc}')

    start_idx = end_idx

print(f'Done in {time() - start} s')

---------------------------
Processing 0.0_0.8333
Training until validation scores don't improve for 100 rounds
[100]	valid_0's l2: 15.006
[200]	valid_0's l2: 5.60272
[300]	valid_0's l2: 2.20501
[400]	valid_0's l2: 0.935114
[500]	valid_0's l2: 0.455585
[600]	valid_0's l2: 0.279059
[700]	valid_0's l2: 0.212279
[800]	valid_0's l2: 0.183842
[900]	valid_0's l2: 0.167588
[1000]	valid_0's l2: 0.157891
[1100]	valid_0's l2: 0.152338
[1200]	valid_0's l2: 0.146991
[1300]	valid_0's l2: 0.142634
[1400]	valid_0's l2: 0.139598
[1500]	valid_0's l2: 0.137298
[1600]	valid_0's l2: 0.135346
[1700]	valid_0's l2: 0.133135
[1800]	valid_0's l2: 0.131367
[1900]	valid_0's l2: 0.129824
[2000]	valid_0's l2: 0.128169
[2100]	valid_0's l2: 0.126604
[2200]	valid_0's l2: 0.125737
[2300]	valid_0's l2: 0.124644
[2400]	valid_0's l2: 0.12369
[2500]	valid_0's l2: 0.123023
[2600]	valid_0's l2: 0.122464
[2700]	valid_0's l2: 0.121899
[2800]	valid_0's l2: 0.121364
[2900]	valid_0's l2: 0.12086
[3000]	valid_0's l2: 0.120497
[31

In [111]:
test = pd.read_csv('result.csv')
test.shape

(31354, 2)

In [113]:
test.shape

(31354, 2)