In [1]:
! git clone https://github.com/Ironarrow98/dengue_data

Cloning into 'dengue_data'...
remote: Enumerating objects: 6, done.[K
remote: Counting objects: 100% (6/6), done.[K
remote: Compressing objects: 100% (6/6), done.[K
remote: Total 6 (delta 0), reused 0 (delta 0), pack-reused 0[K
Unpacking objects: 100% (6/6), done.


In [39]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error

%matplotlib inline

test_features = pd.read_csv("/content/dengue_data/dengue_features_test.csv")
train_features = pd.read_csv("/content/dengue_data/dengue_features_train.csv")
train_targets = pd.read_csv("/content/dengue_data/dengue_labels_train.csv")

train_features = train_features.interpolate(method = 'linear')
test_features = test_features.interpolate(method = 'linear')

In [40]:
train_targets.drop(['year'], axis=1, inplace=True)
train_targets['week_start_date'] = train_features['week_start_date']
sj_train = train_features[train_features['city'] == 'sj'].drop('city', 1)
iq_train = train_features[train_features['city'] == 'iq'].drop('city', 1)
sj_target = train_targets[train_targets['city'] == 'sj'].drop('city', 1)
iq_target = train_targets[train_targets['city'] == 'iq'].drop('city', 1)

In [41]:
records_sj = sj_train
records_iq = iq_train

In [42]:
records_iq[['reanalysis_air_temp_k', 'reanalysis_dew_point_temp_k', 'reanalysis_max_air_temp_k', 'reanalysis_min_air_temp_k']] -= 273.15
records_sj[['reanalysis_air_temp_k', 'reanalysis_dew_point_temp_k', 'reanalysis_max_air_temp_k', 'reanalysis_min_air_temp_k']] -= 273.15

# records_iq['ndvi'] = records_iq[['ndvi_ne', 'ndvi_nw', 'ndvi_se', 'ndvi_sw']].mean(axis=1)
# records_iq.drop(['ndvi_ne', 'ndvi_nw', 'ndvi_se', 'ndvi_sw'], axis=1, inplace=True)


# records_sj['ndvi_n'] = records_sj[['ndvi_ne', 'ndvi_nw']].mean(axis=1)
# records_sj['ndvi_s'] = records_sj[['ndvi_se', 'ndvi_sw']].mean(axis=1)
records_sj.drop(['ndvi_ne', 'ndvi_nw', 'ndvi_se', 'ndvi_sw'], axis=1, inplace=True)

# records_sj['reanalysis_temp'] = records_sj[['reanalysis_air_temp_k', 'reanalysis_dew_point_temp_k', 'reanalysis_max_air_temp_k', 'reanalysis_min_air_temp_k']].mean(axis=1)
records_sj.drop(['reanalysis_air_temp_k', 'reanalysis_dew_point_temp_k', 
                 'reanalysis_max_air_temp_k', 'reanalysis_min_air_temp_k'], axis=1, inplace=True)

records_iq['station_temp'] = records_iq[['station_avg_temp_c', 'station_max_temp_c', 'station_min_temp_c']].interpolate().mean(axis=1)
records_iq.drop(['station_avg_temp_c', 'station_diur_temp_rng_c', 
                 'station_max_temp_c', 'station_min_temp_c'], axis=1, inplace=True)

records_sj['station_temp'] = records_sj[['station_avg_temp_c', 'station_max_temp_c', 'station_min_temp_c']].interpolate().mean(axis=1)
records_sj.drop(['station_avg_temp_c', 'station_diur_temp_rng_c', 
                 'station_max_temp_c', 'station_min_temp_c'], axis=1, inplace=True)

In [43]:
records_sj.set_index('week_start_date', inplace=True)
records_iq.set_index('week_start_date', inplace=True)
labels_sj = sj_target.set_index('week_start_date')
labels_iq = iq_target.set_index('week_start_date')

In [37]:
def predict_for(records, labels, ntrain, lencycle, features=['station_temp']):
  lr_seasonal = LinearRegression()
  weeks = pd.get_dummies(records['weekofyear'], prefix='w')
  train_weeks = weeks[:ntrain].reset_index().drop('week_start_date', axis=1)
  test_weeks = weeks[ntrain:].reset_index().drop('week_start_date', axis=1)
  train_cases = labels[['total_cases']].reset_index().drop('week_start_date', axis=1)

  lr_seasonal.fit(train_weeks, train_cases)
  seasonal = pd.Series(lr_seasonal.predict(train_weeks).flatten()).rolling(5, min_periods=1, center=True).mean()
  remainder = train_cases.total_cases - seasonal

  trend = records[features].reset_index().drop('week_start_date', axis=1).rolling(lencycle).mean()
  train_trend = trend[lencycle:ntrain]
  test_trend = trend[ntrain:]
  train_remainder = remainder[lencycle:]

  lr_trend = LinearRegression()
  lr_trend.fit(train_trend, train_remainder)
    
  train_pred_trend = pd.Series(lr_trend.predict(train_trend).flatten())
  print('mae: ' + str(mean_absolute_error(y_pred=train_pred_trend.values + seasonal[lencycle:].values,
                                          y_true=train_cases['total_cases'][lencycle:].values)))

  pred_seasonal = pd.Series(lr_seasonal.predict(test_weeks).flatten())
  pred_trend = pd.Series(lr_trend.predict(test_trend).flatten())

  pred = (pred_trend + pred_seasonal).rolling(5, min_periods=1, center=True).mean()

  return pred

In [46]:
records_iq

Unnamed: 0_level_0,year,weekofyear,ndvi_ne,ndvi_nw,ndvi_se,ndvi_sw,precipitation_amt_mm,reanalysis_air_temp_k,reanalysis_avg_temp_k,reanalysis_dew_point_temp_k,reanalysis_max_air_temp_k,reanalysis_min_air_temp_k,reanalysis_precip_amt_kg_per_m2,reanalysis_relative_humidity_percent,reanalysis_sat_precip_amt_mm,reanalysis_specific_humidity_g_per_kg,reanalysis_tdtr_k,station_precip_mm,station_temp
week_start_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
2000-07-01,2000,26,0.192886,0.132257,0.340886,0.247200,25.41,23.590000,298.450000,22.034286,34.15,19.95,43.19,92.418571,25.41,16.651429,8.928571,3.0,26.533333
2000-07-08,2000,27,0.216833,0.276100,0.289457,0.241657,60.61,23.484286,298.428571,22.208571,33.45,17.95,46.00,93.581429,60.61,16.862857,10.314286,55.6,27.233333
2000-07-15,2000,28,0.176757,0.173129,0.204114,0.128014,55.52,23.265714,297.392857,22.472857,31.35,19.45,64.77,95.848571,55.52,17.120000,7.385714,38.1,26.833333
2000-07-22,2000,29,0.227729,0.145429,0.254200,0.200314,5.60,22.207143,296.228571,19.647143,30.45,15.45,23.96,87.234286,5.60,14.431429,9.114286,30.0,23.988889
2000-07-29,2000,30,0.328643,0.322129,0.254371,0.361043,62.76,23.282857,297.635714,20.807143,33.85,18.35,31.80,88.161429,62.76,15.444286,9.500000,4.0,26.333333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2010-05-28,2010,21,0.342750,0.318900,0.256343,0.292514,55.30,26.184286,300.771429,23.675714,36.55,21.35,45.00,88.765714,55.30,18.485714,9.800000,27.0,28.811111
2010-06-04,2010,22,0.160157,0.160371,0.136043,0.225657,86.47,25.180000,299.392857,23.302857,35.35,18.75,207.10,91.600000,86.47,18.070000,7.471429,36.6,27.944444
2010-06-11,2010,23,0.247057,0.146057,0.250357,0.233714,58.94,23.448571,297.592857,22.351429,32.35,19.25,50.60,94.280000,58.94,17.008571,7.500000,7.4,25.266667
2010-06-18,2010,24,0.333914,0.245771,0.278886,0.325486,59.67,23.195714,297.521429,22.174286,32.95,18.75,62.33,94.660000,59.67,16.815714,7.871429,16.0,25.877778


In [47]:
pred_iq = predict_for(records_iq, labels_iq, 520, 53, [
    'reanalysis_precip_amt_kg_per_m2',
    'reanalysis_relative_humidity_percent', 
    'station_temp'])
pred_sj = predict_for(records_sj, labels_sj, 936, 53, [
    'reanalysis_precip_amt_kg_per_m2',
    'reanalysis_relative_humidity_percent',
    'station_temp'])

                 w_1  w_2  w_3  w_4  w_5  ...  w_49  w_50  w_51  w_52  w_53
week_start_date                           ...                              
2000-07-01         0    0    0    0    0  ...     0     0     0     0     0
2000-07-08         0    0    0    0    0  ...     0     0     0     0     0
2000-07-15         0    0    0    0    0  ...     0     0     0     0     0
2000-07-22         0    0    0    0    0  ...     0     0     0     0     0
2000-07-29         0    0    0    0    0  ...     0     0     0     0     0
...              ...  ...  ...  ...  ...  ...   ...   ...   ...   ...   ...
2010-05-28         0    0    0    0    0  ...     0     0     0     0     0
2010-06-04         0    0    0    0    0  ...     0     0     0     0     0
2010-06-11         0    0    0    0    0  ...     0     0     0     0     0
2010-06-18         0    0    0    0    0  ...     0     0     0     0     0
2010-06-25         0    0    0    0    0  ...     0     0     0     0     0

[520 rows x

ValueError: ignored

In [None]:
sj_pred = pd.Series(map(int, map(round, pred_sj)))
iq_pred = pd.Series(map(int, map(round, pred_iq)))

submission = pd.read_csv('/content/dengue_data/submission_format.csv')
submission_ = (sj_pred.append(iq_pred, ignore_index=True)).clip(lower = 0)
submission['total_cases'] = submission_
submission.to_csv('ts_submission.csv', index = False)