In [182]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import rcParams
import seaborn as sns
import warnings

sns.set()
rcParams['figure.figsize'] = (20,10)
pd.options.display.max_columns = None
pd.options.display.max_rows = 1000
warnings.filterwarnings('ignore')

from feature_names import *
from utils import *

In [183]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from category_encoders import TargetEncoder
from fastai.tabular.core import df_shrink

In [184]:
inlocal = True
if inlocal:
  path_to_data = '../Data/created/'
else:
  from google.colab import drive
  from google.colab import output
  output.enable_custom_widget_manager()
  drive.mount('/content/drive')
  path_to_data = '/content/drive/My Drive/WiDS 2023/data/'

In [185]:
df_train_temp = pd.read_csv(path_to_data + 'train_data_shrink.csv', parse_dates=[startdate])
df_test_temp = pd.read_csv(path_to_data + 'test_data_shrink.csv', parse_dates=[startdate])

In [265]:
df_train = df_train_temp.copy()
df_test = df_test_temp.copy()
dfs = [df_train, df_test]

In [266]:
for df in dfs:
  df.columns = df.columns.str.replace('-','_')

In [267]:
df_info_obj = df_info(df_train)
null_contained_cols = df_info_obj[df_info_obj['nan_count']>0].index
df_info_obj.loc[null_contained_cols]

Unnamed: 0,nan_count,nan_percent,unique_count,dtype
nmme0_tmp2m_34w__ccsm30,15934,4.2408,3840,float64
nmme_tmp2m_56w__ccsm3,10280,2.736,5000,float64
nmme_prate_34w__ccsm3,8738,2.3256,10139,float64
nmme0_prate_56w__ccsm30,15934,4.2408,5716,float64
nmme0_prate_34w__ccsm30,15934,4.2408,5712,float64
nmme_prate_56w__ccsm3,10280,2.736,10008,float64
nmme_tmp2m_34w__ccsm3,8738,2.3256,4810,float64
ccsm30,15934,4.2408,3833,float64


In [268]:
for df in dfs:
  df[year] = df[startdate].dt.year
  df[month] = df[startdate].dt.month
  df[week] = df[startdate].dt.week
  df[day] = df[startdate].dt.day

In [269]:
# fill some missing values of one type model predict with mean of all models
df_train[nmme0_tmp2m_34w__ccsm30] = df_train[nmme0_tmp2m_34w__ccsm30].fillna(df_train[nmme0_tmp2m_34w__nmme0mean])
df_train[nmme0_prate_34w__ccsm30] = df_train[nmme0_prate_34w__ccsm30].fillna(df_train[nmme0_prate_34w__nmme0mean])
df_train[nmme0_prate_56w__ccsm30] = df_train[nmme0_prate_56w__ccsm30].fillna(df_train[nmme0_prate_56w__nmme0mean])
df_train[nmme_tmp2m_34w__ccsm3] = df_train[nmme_tmp2m_34w__ccsm3].fillna(df_train[nmme_tmp2m_34w__nmmemean])
df_train[nmme_tmp2m_56w__ccsm3] = df_train[nmme_tmp2m_56w__ccsm3].fillna(df_train[nmme_tmp2m_56w__nmmemean])
df_train[nmme_prate_34w__ccsm3] = df_train[nmme_prate_34w__ccsm3].fillna(df_train[nmme_prate_34w__nmmemean])
df_train[nmme_prate_56w__ccsm3] = df_train[nmme_prate_56w__ccsm3].fillna(df_train[nmme_prate_56w__nmmemean])
df_train[ccsm30] = df_train[ccsm30].fillna(df_train[nmme0mean])

In [270]:
# handle unmatched lat, lon
for df in dfs:
  for kind in [lat,lon]:
    df[kind] = df[kind].round(4)

In [271]:
for df in dfs:
  df[location] = (df[lat].astype(str) + '_' + df[lon].astype(str))
  df['is_mei__nip_3'] = (df['mei__nip'] == 3).astype(int)

In [272]:
group_cols_1 = [location, month]
group_name_1 = '_'.join(group_cols_1)

cols1 = [
  nmme0_tmp2m_34w__cancm30,
  nmme0_tmp2m_34w__cancm40,
  nmme0_tmp2m_34w__ccsm30,
  nmme0_tmp2m_34w__ccsm40,
  nmme0_tmp2m_34w__cfsv20,
  nmme0_tmp2m_34w__gfdlflora0,
  nmme0_tmp2m_34w__gfdlflorb0,
  nmme0_tmp2m_34w__gfdl0,
  nmme0_tmp2m_34w__nasa0,
  nmme0_tmp2m_34w__nmme0mean,
] # ==> max, std
for col in cols1:
  for df in dfs:
    df[f'{group_name_1}_{col}_max'] = df.groupby(group_cols_1)[col].transform('max')
    df[f'{group_name_1}_{col}_std'] = df.groupby(group_cols_1)[col].transform('std')

cols2 = [
  nmme_tmp2m_34w__cancm3,
  nmme_tmp2m_34w__ccsm3,
  nmme_tmp2m_34w__ccsm4,
  nmme_tmp2m_34w__cfsv2,
  nmme_tmp2m_34w__gfdl,
  nmme_tmp2m_34w__gfdlflora,
  nmme_tmp2m_34w__gfdlflorb,
  nmme_tmp2m_34w__nasa,
  nmme_tmp2m_34w__nmmemean,
  
  nmme_tmp2m_56w__cancm4,
  nmme_tmp2m_56w__ccsm4,
  nmme_tmp2m_56w__cfsv2,
  nmme_tmp2m_56w__gfdl,
  nmme_tmp2m_56w__gfdlflora,
  nmme_tmp2m_56w__gfdlflorb,
  nmme_tmp2m_56w__nasa,
  nmme_tmp2m_56w__nmmemean,
]    # => max, min, mean, median

for col in cols2:
  for df in dfs:
    df[f'{group_name_1}_{col}_max'] = df.groupby(group_cols_1)[col].transform('max')
    df[f'{group_name_1}_{col}_min'] = df.groupby(group_cols_1)[col].transform('min')
    df[f'{group_name_1}_{col}_mean'] = df.groupby(group_cols_1)[col].transform('mean')
    df[f'{group_name_1}_{col}_median'] = df.groupby(group_cols_1)[col].transform('median')

cols3 = [
  cancm30,
  cancm40,
  cfsv20,
  gfdlflora0,
  gfdlflorb0,
  gfdl0,
  nasa0,
  nmme0mean,
] # => only std
for col in cols3:
  for df in dfs:
    df[f'{group_name_1}_{col}_std'] = df.groupby(group_cols_1)[col].transform('std')

cols4 = [
  contest_wind_vwnd_250_14d__wind_vwnd_250, 
  contest_wind_vwnd_925_14d__wind_vwnd_925, 
  contest_wind_uwnd_250_14d__wind_uwnd_250, 
  contest_wind_uwnd_925_14d__wind_uwnd_925, 
  contest_wind_h100_14d__wind_hgt_100, 
  contest_wind_h500_14d__wind_hgt_500, 
  contest_slp_14d__slp, 
  contest_pevpr_sfc_gauss_14d__pevpr, 
  contest_pres_sfc_gauss_14d__pres, 
  contest_precip_14d__precip, 
  contest_prwtr_eatm_14d__prwtr, 
] # => max, skew, median, mean

for col in cols4:
  for df in dfs:
    df[f'{group_name_1}_{col}_max'] = df.groupby(group_cols_1)[col].transform('max')
    df[f'{group_name_1}_{col}_skew'] = df.groupby(group_cols_1)[col].transform('skew')
    df[f'{group_name_1}_{col}_median'] = df.groupby(group_cols_1)[col].transform('median')
    df[f'{group_name_1}_{col}_mean'] = df.groupby(group_cols_1)[col].transform('mean')
for col in [*cols1, *cols2, *cols3, *cols4]:
  for df in dfs:
    for shift in [-1,1]:
      df[f'location_week_{col}_lag_{shift}'] = df.groupby([location, week])[col].shift(shift)

In [273]:
# drop unused columns
for df in dfs:
  df.drop(columns=[
    year,
    index,
    mei__mei,
    mei__meirank,
    mei__nip,
  ], inplace=True)

In [274]:
cat_cols = [
 'lat',
 'lon',
 'climateregions__climateregion',
 'mjo1d__phase',
 'location',
 'elevation__elevation'
]

from sklearn.preprocessing import LabelEncoder
# label encode categorical features
label_encoder = LabelEncoder()
for col in cat_cols:
  df_train[col] = label_encoder.fit_transform(df_train[col])
  df_test[col] = label_encoder.transform(df_test[col])

In [275]:
df_train.sort_values(by=[startdate], inplace=True)

In [276]:
train = df_train[df_train[startdate] < '2016-05-01'].copy()
valid = df_train[df_train[startdate] >= '2016-05-01'].copy()
train.drop(columns=[startdate], inplace=True)
valid.drop(columns=[startdate], inplace=True)
train.shape, valid.shape

((312512, 386), (63222, 386))

In [277]:
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_error

In [278]:
target = 'contest_tmp2m_14d__tmp2m'
X_train = train.drop(columns=[target])
y_train = train[target]
X_valid = valid.drop(columns=[target])
y_valid = valid[target]
X_submit = df_test.drop(columns=[startdate])
print(X_train.shape, y_train.shape, X_valid.shape, y_valid.shape, X_submit.shape)

(312512, 385) (312512,) (63222, 385) (63222,) (31354, 385)


In [279]:
model = CatBoostRegressor(
    random_seed=42,
    cat_features=['lat', 'lon', 'location', 'climateregions__climateregion', 'mjo1d__phase', 'month', 'week', 'elevation__elevation'],
    loss_function= 'RMSE',
    eval_metric= 'RMSE',
    # task_type= 'GPU',
    # devices= '0:1',
)
model.fit(
    X_train, y_train,
    eval_set=(X_valid, y_valid),
    plot=True,
    verbose=0,
    early_stopping_rounds=200,
)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

<catboost.core.CatBoostRegressor at 0x29aec6710>

In [252]:
model.get_best_score()

{'learn': {'RMSE': 0.47570363738720206},
 'validation': {'RMSE': 1.0376959215476185}}

In [284]:
# feature importance dataframe
fi = pd.DataFrame()
fi['feature'] = X_train.columns
fi['importance'] = model.feature_importances_
fi.sort_values(by='importance', ascending=False, inplace=True)
fi['rank'] = range(1, len(fi) + 1)
fi.head(100)

Unnamed: 0,feature,importance,rank
85,contest_wind_h500_14d__wind_hgt_500,8.976517,1
77,nmme_tmp2m_34w__gfdlflorb,6.399873,2
56,contest_slp_14d__slp,6.121138,3
76,nmme_tmp2m_34w__gfdlflora,5.875622,4
297,location_month_nmme_tmp2m_34w__nmmemean_max,4.16751,5
80,contest_prwtr_eatm_14d__prwtr,3.463304,6
2,contest_pevpr_sfc_gauss_14d__pevpr,3.38582,7
293,location_month_nmme_tmp2m_34w__nasa_max,3.001337,8
309,location_month_nmme_tmp2m_56w__cfsv2_max,2.82424,9
74,nmme_tmp2m_34w__cfsv2,2.809933,10


In [280]:
submit_index = pd.read_csv('../Data/created/submit_index.csv')['index']
y_submit_pred = model.predict(df_test.drop(columns=[startdate]))
submit = pd.DataFrame({
  'contest-tmp2m-14d__tmp2m': y_submit_pred,
  'index':submit_index
})
submit.to_csv('../submission/catboost.csv', index=False)

In [255]:
y_pred = xgb.predict(X_valid)
print('RMSE:', mean_squared_error(y_valid, y_pred))

RMSE: 1.4841360880316674


In [285]:
import tensorflow as tf
from tensorflow import keras
from keras import layers
import tensorflow_datasets as tfds
import tensorboard

In [286]:
target = 'contest_tmp2m_14d__tmp2m'
X_train = train.drop(columns=[target])
y_train = train[target]
X_valid = valid.drop(columns=[target])
y_valid = valid[target]
X_submit = df_test.drop(columns=[startdate])

cat_features=['lat', 'lon', 'location', 'climateregions__climateregion', 'mjo1d__phase', 'month', 'week', 'elevation__elevation']

target_encoder = TargetEncoder(cols=cat_features)
X_train = target_encoder.fit_transform(X_train, y_train)
X_valid = target_encoder.transform(X_valid)
X_submit = target_encoder.transform(X_submit)



In [None]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
number_cols = [colum for colum in X_train.columns if colum not in cat_features]
X_train[number_cols] = scaler.fit_transform(X_train[number_cols])
X_valid[number_cols] = scaler.transform(X_valid[number_cols])
X_submit[number_cols] = scaler.transform(X_submit[number_cols])

In [None]:
def SimpleLayer():
  return keras.Sequential([
    layers.Dense(128),
    layers.BatchNormalization(),
    layers.Activation('relu'),
  ])

model = keras.Sequential([
    layers.Dense(256, input_shape=[X_train.shape[1]]),
    layers.BatchNormalization(),
    layers.Activation('relu'),

    SimpleLayer(),
    SimpleLayer(),
    SimpleLayer(),
    SimpleLayer(),
  
    layers.Dropout(0.25),
    layers.Dense(1),
])

callbacks = [
    keras.callbacks.EarlyStopping(patience=20, restore_best_weights=True),
    keras.callbacks.ReduceLROnPlateau(patience=20, factor=0.2),
    keras.callbacks.ModelCheckpoint('model.h5', save_best_only=True),
]

model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=0.001),
    loss='mse',
    metrics=['mse'],
)

model.fit(
    X_train, y_train,
    validation_data=(X_valid, y_valid),
    epochs=100,
    batch_size=1024,
    callbacks=callbacks,
    verbose=0,
)


In [None]:
# load best model
model = keras.models.load_model('model.h5')

y_pred = model.predict(X_valid)
print('RMSE:', mean_squared_error(y_valid, y_pred))

In [None]:
submit_index = pd.read_csv('../Data/created/submit_index.csv')['index']
y_submit_pred = model.predict(X_submit).flatten()
submit = pd.DataFrame({
  'contest-tmp2m-14d__tmp2m': y_submit_pred,
  'index':submit_index
})
submit.to_csv('../submission/catboost.csv', index=False)