In [189]:
import numpy as np
import pandas as pd
import seaborn as sns
import torch
import torch.nn as nn
import torch.nn.functional as f
pd.set_option('display.max_columns', None)
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader
from sklearn import preprocessing
from tqdm import tqdm
from sklearn.preprocessing import FunctionTransformer
import seaborn as sns 
from scipy import stats
import statsmodels.api as sm

from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, AdaBoostRegressor, BaggingRegressor, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import ElasticNet, Lasso
from sklearn.feature_selection import SelectFromModel
from sklearn.svm import SVR
from sklearn.model_selection import KFold, cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.decomposition import PCA
from sklearn.preprocessing import RobustScaler
import h2o
from h2o.automl import H2OAutoML
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
from sklearn.neural_network import MLPRegressor
from scipy.stats import norm
import copy
from sklearn.model_selection import KFold
from catboost import CatBoostRegressor, Pool, metrics, cv
import xgboost as xgb
from scipy.stats import gmean
from sklearn.impute import KNNImputer

In [187]:
# Load the training and test tables from CSV files in the working directory.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ('./train_data.csv', './test_data.csv')
)

In [194]:
def location_nom(train, test, scale=14):
    """Attach a shared integer location id (``loc_group``) to train and test.

    Latitude/longitude are rounded to ``scale`` decimals, the two frames are
    stacked, and every distinct ``(lat, lon)`` pair receives one group number
    so that the same location gets the same id in both frames.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames that both contain ``lat`` and ``lon`` columns. They are not
        modified; the function works on copies.
    scale : int, default 14
        Number of decimals kept when rounding the coordinates.

    Returns
    -------
    (train, test) : tuple of pandas.DataFrame
        Independent copies carrying the rounded coordinates and the new
        ``loc_group`` column. Columns that exist only in ``train`` (such as
        the target) are removed from the returned test frame, so the function
        does not depend on any notebook-level variable.
    """
    # Copies keep the caller's frames untouched and make the cell re-runnable.
    train = train.copy()
    test = test.copy()
    for frame in (train, test):
        frame['lat'] = frame['lat'].round(scale)
        frame['lon'] = frame['lon'].round(scale)

    n_train = len(train)
    # Stacking fills train-only columns with NaN on the test rows; remember
    # them so they can be dropped again instead of relying on a global list.
    train_only_cols = [col for col in train.columns if col not in test.columns]

    all_df = pd.concat([train, test], axis=0)
    # The stacked index contains duplicates, so assign positionally.
    all_df['loc_group'] = all_df.groupby(['lat', 'lon']).ngroup().to_numpy()

    # .copy() so later column assignments do not hit a slice of all_df.
    train_out = all_df.iloc[:n_train].copy()
    test_out = all_df.iloc[n_train:].drop(columns=train_only_cols).copy()

    return train_out, test_out

def categorical_encode(train, test):
    """Label-encode the climate-region column of both frames.

    The encoder is fitted on ``train`` only and then applied to ``test``.
    Both frames are modified in place and also returned.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames containing a ``climateregions__climateregion`` column.

    Returns
    -------
    (train, test) : tuple of pandas.DataFrame
        The same objects that were passed in, with the column replaced by
        integer codes.

    Notes
    -----
    NOTE(review): per scikit-learn's documented behaviour, ``transform``
    raises ``ValueError`` for labels that were not seen during fitting, so a
    region present only in ``test`` will fail here - confirm that is intended.
    """
    region_col = 'climateregions__climateregion'
    # The notebook imports `preprocessing`, not the bare `LabelEncoder` name,
    # so reach the class through the module.
    le = preprocessing.LabelEncoder()
    train[region_col] = le.fit_transform(train[region_col])
    test[region_col] = le.transform(test[region_col])
    return train, test

def creat_new_featute(df):
    """Add ``year``, ``month`` and ``day_of_year`` columns derived from ``startdate``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``startdate`` column. The column may already be a
        datetime dtype or may hold date strings (as produced by ``read_csv``
        without ``parse_dates``); it is parsed with ``pd.to_datetime`` either
        way and is itself left unchanged.

    Returns
    -------
    pandas.DataFrame
        The same frame, modified in place, with the three new columns.
    """
    # Parse once; `.dt` is only available on datetime-typed Series.
    start = pd.to_datetime(df['startdate'])
    df['year'] = start.dt.year
    df['month'] = start.dt.month
    df['day_of_year'] = start.dt.dayofyear
    return df

In [195]:
# Round lat/lon and attach the shared `loc_group` location id to both frames;
# the names are rebound to the frames returned by location_nom.
train_df, test_df = location_nom(train_df, test_df)

In [198]:
# Integer code per climate region, numbered in order of first appearance in
# the training data. Regions that occur only in the test data have no entry
# and therefore become NaN after .map().
region_map = {
    region: code
    for code, region in enumerate(train_df['climateregions__climateregion'].unique())
}
for frame in (train_df, test_df):
    frame['climateregions__climateregion'] = frame['climateregions__climateregion'].map(region_map)

In [199]:
# Remove the row-id column from the training frame. errors='ignore' keeps the
# cell idempotent: re-running it after the column is already gone is a no-op
# instead of a KeyError.
train_df = train_df.drop(columns=['index'], errors='ignore')

In [200]:
# Zero-based calendar features: month in 0..11 and day-of-month starting at 0.
# startdate is parsed once per frame and reused for both columns.
for frame in (train_df, test_df):
    start_dates = pd.DatetimeIndex(frame['startdate'])
    frame['month'] = start_dates.month - 1
    frame['day'] = start_dates.day - 1

In [217]:
# Calendar year of each row's startdate, added to both frames.
for frame in (train_df, test_df):
    frame['year'] = pd.DatetimeIndex(frame['startdate']).year

In [202]:
# Inspect the distinct month codes; they are zero-based because 1 was
# subtracted from the calendar month when the column was created.
train_df['month'].unique()

array([ 8,  9, 10, 11,  0,  1,  2,  3,  4,  5,  6,  7])

In [205]:
def cal_season(month):
    """Map a month code to a season code.

    Parameters
    ----------
    month : int
        Month number. Assumes the zero-based convention (0 = January) used
        where the notebook builds its ``month`` column - TODO confirm for
        other callers.

    Returns
    -------
    int
        3 for months {11, 0, 1}, 0 for {2, 3, 4}, 1 for {5, 6, 7}, and 2 for
        every other value (presumably winter / spring / summer / autumn in
        the northern hemisphere).
    """
    season_months = (
        (3, (11, 0, 1)),
        (0, (2, 3, 4)),
        (1, (5, 6, 7)),
    )
    for season, months in season_months:
        if month in months:
            return season
    # Anything not listed above falls into the remaining season.
    return 2
# Derive the season code from the zero-based month for both frames.
for frame in (train_df, test_df):
    frame['season'] = frame['month'].apply(cal_season)

In [207]:
# List the columns whose dtype does not compare equal to 'float'
# (presumably to spot candidates for categorical handling).
train_df.loc[:,train_df.dtypes!='float'].columns

Index(['startdate', 'climateregions__climateregion', 'loc_group', 'month',
       'day', 'season'],
      dtype='object')

In [209]:
# Columns treated as categorical. Together with 'startdate' they are the sort
# keys for the combined frame, and they are held out of the KNN imputation.
cats = ['climateregions__climateregion', 'loc_group', 'season', 'month']

In [213]:
def missing_values_table(df):
    """Summarise missing values per column.

    Prints the total number of columns and how many of them contain missing
    values, then returns a table with one row per affected column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        Indexed by column name, with columns ``'Missing Values'`` (count) and
        ``'% of Total Values'`` (share of rows), restricted to columns whose
        percentage is non-zero, sorted by percentage descending and rounded
        to one decimal.
    """
    # Null count per column, computed once and reused for the percentage.
    missing_count = df.isnull().sum()
    missing_pct = 100 * missing_count / len(df)

    # Side-by-side table with the final column names applied directly.
    summary = pd.concat(
        [missing_count, missing_pct],
        axis=1,
        keys=['Missing Values', '% of Total Values'],
    )

    # Keep affected columns only, worst first.
    has_missing = summary['% of Total Values'] != 0
    summary = summary[has_missing]
    summary = summary.sort_values('% of Total Values', ascending=False)
    summary = summary.round(1)

    n_columns = str(df.shape[1])
    n_affected = str(summary.shape[0])
    print("Your selected dataframe has " + n_columns + " columns.\n"
          "There are " + n_affected + " columns that have missing values.")

    return summary

# Summarise missing data in the training frame and show the 20 most affected
# columns, shaded by magnitude.
missing_values_train = missing_values_table(train_df)
missing_values_train.head(20).style.background_gradient(cmap='Reds')

Your selected dataframe has 249 columns.
There are 8 columns that have missing values.


Unnamed: 0,Missing Values,% of Total Values
nmme0-tmp2m-34w__ccsm30,15934,4.2
nmme0-prate-56w__ccsm30,15934,4.2
nmme0-prate-34w__ccsm30,15934,4.2
ccsm30,15934,4.2
nmme-tmp2m-56w__ccsm3,10280,2.7
nmme-prate-56w__ccsm3,10280,2.7
nmme-prate-34w__ccsm3,8738,2.3
nmme-tmp2m-34w__ccsm3,8738,2.3


In [214]:
# Switch for the KNN-imputation cell: the imputation only runs when True.
knn_imputing = True
# Name of the target column, kept as a one-element list so it can be
# concatenated with other column lists.
target=["contest-tmp2m-14d__tmp2m"]

In [218]:
# Tag every row with its origin so the combined frame can be split again.
train_df['source'] = 'train'
test_df['source'] = 'test'

# `axis` is passed by keyword: giving it positionally to pd.concat is
# deprecated (it emits a FutureWarning) and is rejected by pandas 2.x.
df = pd.concat([train_df, test_df], axis=0, ignore_index=True)

# Order rows by start date and then by the categorical keys, with a fresh
# 0..n-1 index so later positional joins line up.
# NOTE(review): if startdate is still a string column this is a lexicographic
# sort, not a chronological one - confirm the date format.
groupby_cols = ['startdate'] + cats
df = df.sort_values(by=groupby_cols).reset_index(drop=True)

  This is separate from the ipykernel package so we can avoid doing imports until


In [219]:
# Keep a second name for the combined frame. This is a reference, not a copy:
# both names point at the same object until `df` is rebound to a new frame.
tmp_df = df

In [None]:
if knn_imputing:
    # joblib is needed for the dump below and is not imported in the visible
    # import cell, so bring it into scope here.
    import joblib

    imputer = KNNImputer(n_neighbors=7)

    # Columns held out of imputation: categorical keys, the target, the date
    # and the train/test marker.
    cats_with_target = cats + target + ['startdate'] + ['source']
    tmp = df[cats_with_target]
    df = df.drop(columns=cats_with_target)

    # NOTE(review): the imputer is fitted on train and test rows together -
    # confirm that sharing information across the split is intended.
    # Reusing df.index guarantees the imputed values line up row-for-row with
    # the held-out columns when the two parts are joined again.
    df1 = pd.DataFrame(imputer.fit_transform(df), columns=df.columns, index=df.index)

    joblib.dump(imputer, '../models/knn_imputer.pkl')

    # Imputed feature columns first, held-out columns after. A single concat
    # avoids assigning hundreds of columns one at a time, which fragments the
    # frame.
    df = pd.concat([df1, tmp], axis=1)