In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from matplotlib import pyplot as plt
from sklearn.preprocessing import StandardScaler
from scipy import stats
from sklearn.model_selection import train_test_split

In [2]:
features = ['reanalysis_specific_humidity_g_per_kg', 
                 'reanalysis_dew_point_temp_k', 
                 'station_avg_temp_c',
                'precipitation_amt_mm',
                'ndvi_ne',
                'reanalysis_sat_precip_amt_mm',
                'ndvi_sw',
                'reanalysis_precip_amt_kg_per_m2',
                'station_min_temp_c']
#                 'precipitation_amt_mm', 
                #'week_start_date']

In [3]:
df = pd.read_csv('./data/train_features.csv', index_col=[0, 1, 2])
tf = pd.read_csv('./data/dengue_features_test.csv')

In [4]:
df['station_avg_temp_c_mv_avg'] = df['station_avg_temp_c'].rolling(window=50).mean()
tf['station_avg_temp_c_mv_avg'] = tf['station_avg_temp_c'].rolling(window=50).mean()
features.append('station_avg_temp_c_mv_avg')

In [5]:
df['precipitation_amt_mm_mv_avg'] = df['precipitation_amt_mm'].rolling(window=50).mean()
tf['precipitation_amt_mm_mv_avg'] = tf['precipitation_amt_mm'].rolling(window=50).mean()
features.append('precipitation_amt_mm_mv_avg') 

In [6]:
df['reanalysis_sat_precip_amt_mm'] =  df['reanalysis_sat_precip_amt_mm'].shift(-20)
tf['reanalysis_sat_precip_amt_mm'] =  tf['reanalysis_sat_precip_amt_mm'].shift(-20)

In [7]:
df['ndvi_ne_avg'] = df['ndvi_ne'].rolling(window=10).mean()
tf['ndvi_ne_avg'] = tf['ndvi_ne'].rolling(window=10).mean()
features.append('ndvi_ne_avg')

In [8]:
df['ndvi_sw_avg'] = df['ndvi_sw'].rolling(window=30).mean().shift(-10)
tf['ndvi_sw_avg'] = tf['ndvi_sw'].rolling(window=30).mean().shift(-10)
features.append('ndvi_sw_avg')

In [9]:
df['reanalysis_precip_amt_kg_per_m2_avg'] = df['reanalysis_precip_amt_kg_per_m2'].rolling(window=50).mean()
tf['reanalysis_precip_amt_kg_per_m2_avg'] = tf['reanalysis_precip_amt_kg_per_m2'].rolling(window=50).mean()
features.append('reanalysis_precip_amt_kg_per_m2_avg')

In [10]:
df['reanalysis_specific_humidity_g_per_kg_avg'] = df['reanalysis_specific_humidity_g_per_kg'].rolling(window=50).mean()
tf['reanalysis_specific_humidity_g_per_kg_avg'] = tf['reanalysis_specific_humidity_g_per_kg'].rolling(window=50).mean()
features.append('reanalysis_specific_humidity_g_per_kg_avg')

In [11]:
df['reanalysis_dew_point_temp_k_avg'] = df['reanalysis_dew_point_temp_k'].rolling(window=35).mean()
tf['reanalysis_dew_point_temp_k_avg'] = tf['reanalysis_dew_point_temp_k'].rolling(window=35).mean()
features.append('reanalysis_dew_point_temp_k_avg')

In [12]:
df.fillna(method='ffill', inplace=True)
df = df.fillna(df.mean())
tf.fillna(method='ffill', inplace=True)
tf = tf.fillna(df.mean())

In [13]:
df = df[features]   
tf = tf[features]

In [14]:
def get_data_labels(labels_path = None):   
    sj_label = None
    iq_label = None
    if labels_path:
        labels = pd.read_csv(labels_path, index_col=[0, 1, 2]).loc[df.index]
        sj_label = pd.DataFrame(labels.loc['sj'])
        iq_label = pd.DataFrame(labels.loc['iq'])

    sj = pd.DataFrame(df.loc['sj'])
    iq = pd.DataFrame(df.loc['iq'])
    
    
    return sj, iq, sj_label, iq_label

In [15]:
sj_train, iq_train, sj_label, iq_label = get_data_labels( './data/train_labels.csv')

In [16]:
sj_train_X, sj_test_X, sj_train_y, sj_test_y = train_test_split(sj_train, sj_label['total_cases'], test_size=0.1, random_state=0, shuffle=False)

iq_train_X, iq_test_X, iq_train_y, iq_test_y = train_test_split(iq_train, iq_label['total_cases'], test_size=0.25, random_state=0, shuffle=False)

In [17]:
sj_model = RandomForestRegressor(n_estimators=200, max_depth=6,
                                         criterion='mae', warm_start=True)

sj_model.fit(sj_train_X, sj_train_y)

RandomForestRegressor(bootstrap=True, criterion='mae', max_depth=6,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=200, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=True)

In [18]:
iq_model = RandomForestRegressor(n_estimators=20, max_features='auto',
                                         max_depth=6, min_samples_leaf=0.005,
                                         criterion='mae', min_weight_fraction_leaf=0.1
                                        , warm_start=True)
iq_model.fit(iq_train_X, iq_train_y)

RandomForestRegressor(bootstrap=True, criterion='mae', max_depth=6,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=0.005, min_samples_split=2,
           min_weight_fraction_leaf=0.1, n_estimators=20, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=True)

In [19]:
sj_test, iq_test, sj_test_label, iq_test_label =get_data_labels()
print(sj_test.shape)

(936, 16)


In [20]:
sj_predictions = sj_model.predict(sj_test).astype(int)
iq_predictions = iq_model.predict(iq_test).astype(int)


In [21]:
submission = pd.read_csv("./data/submission .csv", index_col=[0, 1, 2])

submission.total_cases = np.concatenate([sj_predictions, iq_predictions])

submission.to_csv("./results/3_submission_latest_6.csv")

ValueError: Length of values does not match length of index