In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from matplotlib import pyplot as plt
from sklearn.preprocessing import StandardScaler
from scipy import stats


In [2]:
from sklearn.preprocessing import StandardScaler
from scipy import stats

def preprocess_data(data_path, labels_path=None):
    features = ['reanalysis_specific_humidity_g_per_kg', 
                 'reanalysis_dew_point_temp_k', 
                 'station_avg_temp_c',
                'precipitation_amt_mm',
                'ndvi_ne',
                'reanalysis_sat_precip_amt_mm',
                'ndvi_sw',
                'reanalysis_precip_amt_kg_per_m2',
                'station_min_temp_c',
#                 'precipitation_amt_mm', 
                'week_start_date']
    df = pd.read_csv(data_path, index_col=[0, 1, 2])
            
    df['station_avg_temp_c_mv_avg'] = df['station_avg_temp_c'].rolling(window=50).mean()
    df['precipitation_amt_mm_mv_avg'] = df['precipitation_amt_mm'].rolling(window=50).mean()
    features.append('station_avg_temp_c_mv_avg')
    features.append('precipitation_amt_mm_mv_avg') 
    
    df['reanalysis_sat_precip_amt_mm'] =  df['reanalysis_sat_precip_amt_mm'].shift(-20)
    
    df['ndvi_ne_avg'] = df['ndvi_ne'].rolling(window=10).mean()
    features.append('ndvi_ne_avg')
    
    df['ndvi_sw_avg'] = df['ndvi_sw'].rolling(window=30).mean().shift(-10)
    features.append('ndvi_sw_avg')
    
    
    df['reanalysis_precip_amt_kg_per_m2_avg'] = df['reanalysis_precip_amt_kg_per_m2'].rolling(window=50).mean()
    features.append('reanalysis_precip_amt_kg_per_m2_avg')
    
    
    
    df['reanalysis_specific_humidity_g_per_kg_avg'] = df['reanalysis_specific_humidity_g_per_kg'].rolling(window=50).mean()
    features.append('reanalysis_specific_humidity_g_per_kg_avg')
    
    
    df['reanalysis_dew_point_temp_k_avg'] = df['reanalysis_dew_point_temp_k'].rolling(window=35).mean()
    features.append('reanalysis_dew_point_temp_k_avg')

    
    df.fillna(method='ffill', inplace=True)
    df = df.fillna(df.mean())
    
    df['week_start_date'] = pd.to_datetime(df['week_start_date'])
    for i in range(1,5):
        df['quarter_' + str(i)] = df['week_start_date'].apply(lambda date: 1 if (
            ((i-1)*3<date.month) and (date.month <= i * 3)) else 0)
        features.append('quarter_' + str(i))
    
    df = df.drop(['week_start_date'], axis=1)
    features.remove('week_start_date')
    df = df[features]    
    sj_label = None
    iq_label = None
    # add labels to dataframe
    if labels_path:
        labels = pd.read_csv(labels_path, index_col=[0, 1, 2]).loc[df.index]
        sj_label = pd.DataFrame(labels.loc['sj'])
        iq_label = pd.DataFrame(labels.loc['iq'])

    sj = pd.DataFrame(df.loc['sj'])
    iq = pd.DataFrame(df.loc['iq'])
    
    
    return sj, iq, sj_label, iq_label

In [3]:
sj_train, iq_train, sj_label, iq_label = preprocess_data('./data/train_features.csv', './data/train_labels.csv')
print(sj_train.shape)
sj_train.head()


(936, 20)


Unnamed: 0_level_0,Unnamed: 1_level_0,reanalysis_specific_humidity_g_per_kg,reanalysis_dew_point_temp_k,station_avg_temp_c,precipitation_amt_mm,ndvi_ne,reanalysis_sat_precip_amt_mm,ndvi_sw,reanalysis_precip_amt_kg_per_m2,station_min_temp_c,station_avg_temp_c_mv_avg,precipitation_amt_mm_mv_avg,ndvi_ne_avg,ndvi_sw_avg,reanalysis_precip_amt_kg_per_m2_avg,reanalysis_specific_humidity_g_per_kg_avg,reanalysis_dew_point_temp_k_avg,quarter_1,quarter_2,quarter_3,quarter_4
year,weekofyear,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1990,18,14.012857,292.414286,25.442857,12.42,0.1226,143.73,0.177617,32.0,20.0,27.041766,44.730679,0.134275,0.202801,39.061375,16.700091,295.315161,0,1,0,0
1990,19,15.372857,293.951429,26.714286,22.82,0.1699,51.39,0.155486,17.94,22.2,27.041766,44.730679,0.134275,0.202801,39.061375,16.700091,295.315161,0,1,0,0
1990,20,16.848571,295.434286,26.714286,34.54,0.03225,31.18,0.170843,26.1,22.8,27.041766,44.730679,0.134275,0.202801,39.061375,16.700091,295.315161,0,1,0,0
1990,21,16.672857,295.31,27.471429,15.36,0.128633,51.42,0.235886,13.9,23.3,27.041766,44.730679,0.134275,0.202801,39.061375,16.700091,295.315161,0,1,0,0
1990,22,17.21,295.821429,28.942857,7.52,0.1962,143.55,0.24734,12.2,23.9,27.041766,44.730679,0.134275,0.202801,39.061375,16.700091,295.315161,0,1,0,0


In [4]:
from sklearn.model_selection import train_test_split

# split data into training and validation data, for both predictors and target
# The split is based on a random number generator. Supplying a numeric value to
# the random_state argument guarantees we get the same split every time we
# run this script.

# sj_train.head()

sj_train_X, sj_test_X, sj_train_y, sj_test_y = train_test_split(sj_train, sj_label['total_cases'], test_size=0.1, random_state=0, shuffle=False)

iq_train_X, iq_test_X, iq_train_y, iq_test_y = train_test_split(iq_train, iq_label['total_cases'], test_size=0.25, random_state=0, shuffle=False)

In [5]:
def find_best_model(sj_train_X, sj_test_X, sj_train_y, sj_test_y, iq_train_X, iq_test_X, iq_train_y, iq_test_y):
    best_sj_model = None
    sj_error = 1000
    for i in range (10):
        
        sj_model = RandomForestRegressor(n_estimators=200, max_depth=6,
                                         criterion='mae', warm_start=True)

        sj_model.fit(sj_train_X, sj_train_y)
        sj_pred_val = sj_model.predict(sj_test_X)
        error = mean_absolute_error(sj_test_y, sj_pred_val)
        print ("SJ "+str(i)+" : "+str(error))
        if error < sj_error:
            best_sj_model = sj_model
            sj_error = error

    print("SJ " + str(sj_error))
    
    best_iq_model = None 
    iq_error = 1000
    for i in range(10):
        
        iq_model = RandomForestRegressor(n_estimators=20, max_features='auto',
                                         max_depth=6, min_samples_leaf=0.005,
                                         criterion='mae', min_weight_fraction_leaf=0.1
                                        , warm_start=True)
        iq_model.fit(iq_train_X, iq_train_y)
        iq_pred_val = iq_model.predict(iq_test_X)
        error = mean_absolute_error(iq_test_y, iq_pred_val)
        print("IQ "+str(i)+" : "+str(error))
        if error < iq_error:
            best_iq_model  = iq_model 
            iq_error = error
    print("IQ " + str(iq_error))
    
    return best_sj_model, best_iq_model

sj_model, iq_model = find_best_model(sj_train_X, sj_test_X, sj_train_y, sj_test_y, iq_train_X, iq_test_X, iq_train_y, iq_test_y)


SJ 0 : 16.927287234042552
SJ 1 : 17.283936170212765
SJ 2 : 17.588856382978726
SJ 3 : 17.430638297872342
SJ 4 : 17.048138297872338
SJ 5 : 17.21566489361702
SJ 6 : 17.52390957446809
SJ 7 : 17.27646276595744
SJ 8 : 17.346941489361704
SJ 9 : 17.44742021276596
SJ 16.927287234042552
IQ 0 : 7.952115384615385
IQ 1 : 8.010384615384615
IQ 2 : 7.989807692307692
IQ 3 : 8.034807692307693
IQ 4 : 7.936346153846153
IQ 5 : 7.903076923076924
IQ 6 : 7.914038461538462
IQ 7 : 7.994423076923078
IQ 8 : 7.988076923076922
IQ 9 : 8.02903846153846
IQ 7.903076923076924


In [6]:

sj_test, iq_test, sj_test_label, iq_test_label = preprocess_data('./data/dengue_features_test.csv')

sj_predictions = sj_model.predict(sj_test).astype(int)
iq_predictions = iq_model.predict(iq_test).astype(int)

submission = pd.read_csv("./data/submission .csv", index_col=[0, 1, 2])

submission.total_cases = np.concatenate([sj_predictions, iq_predictions])

submission.to_csv("./results/submission_latest_.csv")