In [23]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from time import time
import os

from lightgbm import LGBMRegressor

In [95]:
train_url = 'data/train_data.csv'
test_url = 'data/test_data.csv'

df_train = pd.read_csv(train_url)
df_test = pd.read_csv(test_url)

## Fill nulls

In [96]:
def fill_nulls(df):

    tmp = pd.DataFrame(df.isna().sum()) \
        .rename(columns={0: 'cnt'})

    null_ls = tmp[tmp.cnt != 0].reset_index()

    mean_values = {}

    for i in null_ls['index']:
        mean = df.loc[:, i].mean()
        mean_values[i] = mean

    for k, v in mean_values.items():
        df[k].fillna(value=v, inplace=True)

fill_nulls(df_train)

## Feature Engineering

In [97]:
column_date="startdate"

def get_idx(lat, lon):
    return str(round(lat, 4)) + "_" + str(round(lon, 4))

def get_date(date, idx=0):
    return int(date.split('/')[idx])

df_train["idx"] = np.vectorize(get_idx)(df_train['lat'], df_train['lon'])
df_train["month"] = np.vectorize(get_date)(df_train[column_date], idx=0)
df_train["day"] = np.vectorize(get_date)(df_train[column_date], idx=1)
df_train["year"] = np.vectorize(get_date)(df_train[column_date], idx=2)

df_test["idx"] = np.vectorize(get_idx)(df_test['lat'], df_test['lon'])
df_test["month"] = np.vectorize(get_date)(df_test[column_date], idx=0)
df_test["day"] = np.vectorize(get_date)(df_test[column_date], idx=1)
df_test["year"] = np.vectorize(get_date)(df_test[column_date], idx=2)

In [99]:
tmp2m_cols = [each for each in df_train.columns if 'tmp2m' in each and each != 'contest-tmp2m-14d__tmp2m']

# Add a dummy column to df_test
# df_test['contest-tmp2m-14d__tmp2m'] = 0

df_train.drop(columns=tmp2m_cols, inplace=True)
df_test.drop(columns=tmp2m_cols, inplace=True)

In [100]:
df_test.shape

(31354, 219)

In [102]:
df_train.shape

(375734, 220)

## Label Encoding

In [103]:
le = LabelEncoder()

le.fit(df_train['climateregions__climateregion'])

df_train['climateregions__climateregion'] = le.transform(df_train['climateregions__climateregion'])
df_test['climateregions__climateregion'] = le.transform(df_test['climateregions__climateregion'])

In [104]:
df_train.shape

(375734, 220)

In [105]:
df_test.shape

(31354, 219)

# Calculate difference

In [None]:
df_train = df_train.sort_values('startdate')
df_test = df_test.sort_values('startdate')

def cal_dif(df):
    for col in df.columns:
        if col not in ['idx', 'startdate', 'contest-tmp2m-14d__tmp2m', 'month', 'day', 'year', 'lat', 'lon']:
            print(f'Processing {col}')
            df[f'{col}_diff'] = df.groupby('idx')[col].diff()

    return df

df_train_diff = cal_dif(df_train)
df_test_diff = cal_dif(df_test)

In [109]:
df_train_diff = df_train_diff.fillna(0)
df_test_diff = df_test_diff.fillna(0)

In [115]:
df_train_diff.drop(columns=['idx', 'startdate'], inplace=True)
df_test_diff.drop(columns=['idx', 'startdate'], inplace=True)

## Visualize

In [11]:
col_diff_ls = [col for col in df_train.columns if '_diff' in col]

In [50]:
# n_cols = 4
# n_rows = len(col_diff_ls) // n_cols + (len(col_diff_ls) % n_cols != 0)
# f, axes = plt.subplots(n_rows, n_cols, figsize=(20, 250))
# count_cols, count_rows = 0, 0
#
# for col in df_train.columns:
#     if '_diff' in col:
#         sns.kdeplot(x=col, data=df_train, ax=axes[count_rows][count_cols])
#         sns.kdeplot(x=col, data=df_test, ax=axes[count_rows][count_cols])
#
#         count_cols += 1
#         count_rows += 1 if count_cols % n_cols == 0 else 0
#         count_cols = count_cols % n_cols
#
# f.tight_layout(h_pad=6)

# Model

In [112]:
def split_x_y(df):
    x = df.drop(columns='contest-tmp2m-14d__tmp2m')
    y = df['contest-tmp2m-14d__tmp2m']

    return x, y

In [117]:
lgbm = LGBMRegressor(
    n_estimators=5000,
    n_jobs=4,
    subsample=0.7,
    colsample_bytree=0.7
)

train_x, train_y = split_x_y(df_train_diff)

lgbm.fit(train_x, train_y)

LGBMRegressor(colsample_bytree=0.7, n_estimators=5000, n_jobs=4, subsample=0.7)

In [120]:
submit = lgbm.predict(df_test_diff)

In [121]:
submit_df = pd.DataFrame({'contest-tmp2m-14d__tmp2m': submit, 'index': range(375734, 375734+len(submit))})

In [122]:
submit_df

Unnamed: 0,contest-tmp2m-14d__tmp2m,index
0,26.457203,375734
1,18.998058,375735
2,18.539939,375736
3,16.396256,375737
4,15.826679,375738
...,...,...
31349,13.863253,407083
31350,14.078452,407084
31351,6.241839,407085
31352,10.604138,407086


In [125]:
submit_df.to_csv('submit.csv', index=False)