In [61]:
import pandas as pd
import numpy as np
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from category_encoders import OneHotEncoder
from sklearn.linear_model import LinearRegression

# 60-min models: DengAI

# Wrangle Data

## Import

In [31]:
X = pd.read_csv('data/dengue_features_train.csv', 
                 parse_dates=['week_start_date'],
                 index_col='week_start_date')

## EDA

### Feature Matrix

In [32]:
X.shape

(1456, 23)

In [33]:
X.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 1456 entries, 1990-04-30 to 2010-06-25
Data columns (total 23 columns):
 #   Column                                 Non-Null Count  Dtype  
---  ------                                 --------------  -----  
 0   city                                   1456 non-null   object 
 1   year                                   1456 non-null   int64  
 2   weekofyear                             1456 non-null   int64  
 3   ndvi_ne                                1262 non-null   float64
 4   ndvi_nw                                1404 non-null   float64
 5   ndvi_se                                1434 non-null   float64
 6   ndvi_sw                                1434 non-null   float64
 7   precipitation_amt_mm                   1443 non-null   float64
 8   reanalysis_air_temp_k                  1446 non-null   float64
 9   reanalysis_avg_temp_k                  1446 non-null   float64
 10  reanalysis_dew_point_temp_k            1446 non-null  

In [34]:
X.head()

Unnamed: 0_level_0,city,year,weekofyear,ndvi_ne,ndvi_nw,ndvi_se,ndvi_sw,precipitation_amt_mm,reanalysis_air_temp_k,reanalysis_avg_temp_k,...,reanalysis_precip_amt_kg_per_m2,reanalysis_relative_humidity_percent,reanalysis_sat_precip_amt_mm,reanalysis_specific_humidity_g_per_kg,reanalysis_tdtr_k,station_avg_temp_c,station_diur_temp_rng_c,station_max_temp_c,station_min_temp_c,station_precip_mm
week_start_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1990-04-30,sj,1990,18,0.1226,0.103725,0.198483,0.177617,12.42,297.572857,297.742857,...,32.0,73.365714,12.42,14.012857,2.628571,25.442857,6.9,29.4,20.0,16.0
1990-05-07,sj,1990,19,0.1699,0.142175,0.162357,0.155486,22.82,298.211429,298.442857,...,17.94,77.368571,22.82,15.372857,2.371429,26.714286,6.371429,31.7,22.2,8.6
1990-05-14,sj,1990,20,0.03225,0.172967,0.1572,0.170843,34.54,298.781429,298.878571,...,26.1,82.052857,34.54,16.848571,2.3,26.714286,6.485714,32.2,22.8,41.4
1990-05-21,sj,1990,21,0.128633,0.245067,0.227557,0.235886,15.36,298.987143,299.228571,...,13.9,80.337143,15.36,16.672857,2.428571,27.471429,6.771429,33.3,23.3,4.0
1990-05-28,sj,1990,22,0.1962,0.2622,0.2512,0.24734,7.52,299.518571,299.664286,...,12.2,80.46,7.52,17.21,3.014286,28.942857,9.371429,35.0,23.9,5.8


In [12]:
df['city'].value_counts()

sj    936
iq    520
Name: city, dtype: int64

In [13]:
df.describe(exclude='number')

Unnamed: 0,city
count,1456
unique,2
top,sj
freq,936


### Target

In [14]:
y = pd.read_csv('data/dengue_labels_train.csv')

In [15]:
y.head()

Unnamed: 0,city,year,weekofyear,total_cases
0,sj,1990,18,4
1,sj,1990,19,5
2,sj,1990,20,4
3,sj,1990,21,3
4,sj,1990,22,6


In [19]:
y.describe(exclude='number')

Unnamed: 0,city
count,1456
unique,2
top,sj
freq,936


In [35]:
cols = ['city', 'year', 'weekofyear']
X_values = X[cols].values
y_values = y[cols].values

In [23]:
X_values[:4]

array([['sj', 1990, 18],
       ['sj', 1990, 19],
       ['sj', 1990, 20],
       ['sj', 1990, 21]], dtype=object)

In [25]:
y_values[:4]

array([['sj', 1990, 18],
       ['sj', 1990, 19],
       ['sj', 1990, 20],
       ['sj', 1990, 21]], dtype=object)

In [36]:
assert np.array_equal(X_values, y_values)

In [48]:
y = pd.read_csv('data/dengue_labels_train.csv', usecols=['total_cases']).set_index(X.index)
y = y['total_cases']

In [49]:
y.head()

week_start_date
1990-04-30    4
1990-05-07    5
1990-05-14    4
1990-05-21    3
1990-05-28    6
Name: total_cases, dtype: int64

In [50]:
print(X.shape)

(1456, 23)


In [51]:
print(y.shape)

(1456,)


# Split Data

In [52]:
len(X)*.8

1164.8

In [53]:
X.iloc[1164,:]

city                                           iq
year                                         2004
weekofyear                                     47
ndvi_ne                                  0.227871
ndvi_nw                                    0.1945
ndvi_se                                  0.190914
ndvi_sw                                  0.209557
precipitation_amt_mm                        74.08
reanalysis_air_temp_k                     299.419
reanalysis_avg_temp_k                     301.236
reanalysis_dew_point_temp_k               296.097
reanalysis_max_air_temp_k                   309.3
reanalysis_min_air_temp_k                   293.6
reanalysis_precip_amt_kg_per_m2              26.8
reanalysis_relative_humidity_percent      84.6114
reanalysis_sat_precip_amt_mm                74.08
reanalysis_specific_humidity_g_per_kg     17.7386
reanalysis_tdtr_k                         11.6429
station_avg_temp_c                          28.25
station_diur_temp_rng_c                        11


In [54]:
cutoff = '2004-11-18'
mask = X.index < cutoff
X_train, y_train = X.loc[mask], y.loc[mask]
X_val, y_val = X.loc[~mask], y.loc[~mask]

In [55]:
print(X_train.shape)

(985, 23)


In [56]:
print(X_val.shape)

(471, 23)


# Establish Baseline

In [57]:
# NOTE TO SELF. ADD LATER.

# Build Model

In [64]:
model = make_pipeline(
    OneHotEncoder(cols=['city']),
    SimpleImputer(),
    LinearRegression()
)

In [65]:
model.fit(X_train, y_train);

  elif pd.api.types.is_categorical(cols):


In [66]:
from sklearn.metrics import mean_absolute_error

In [67]:
print('Training MAE:', mean_absolute_error(y_train, model.predict(X_train)))
print('Validation MAE:', mean_absolute_error(y_val, model.predict(X_val)))

Training MAE: 23.22193219244818
Validation MAE: 19.493472988768538


# Make our predictions

In [68]:
X_test = pd.read_csv('data/dengue_features_test.csv', 
                 parse_dates=['week_start_date'],
                 index_col='week_start_date')

In [69]:
X_test.head()

Unnamed: 0_level_0,city,year,weekofyear,ndvi_ne,ndvi_nw,ndvi_se,ndvi_sw,precipitation_amt_mm,reanalysis_air_temp_k,reanalysis_avg_temp_k,...,reanalysis_precip_amt_kg_per_m2,reanalysis_relative_humidity_percent,reanalysis_sat_precip_amt_mm,reanalysis_specific_humidity_g_per_kg,reanalysis_tdtr_k,station_avg_temp_c,station_diur_temp_rng_c,station_max_temp_c,station_min_temp_c,station_precip_mm
week_start_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2008-04-29,sj,2008,18,-0.0189,-0.0189,0.102729,0.0912,78.6,298.492857,298.55,...,25.37,78.781429,78.6,15.918571,3.128571,26.528571,7.057143,33.3,21.7,75.2
2008-05-06,sj,2008,19,-0.018,-0.0124,0.082043,0.072314,12.56,298.475714,298.557143,...,21.83,78.23,12.56,15.791429,2.571429,26.071429,5.557143,30.0,22.2,34.3
2008-05-13,sj,2008,20,-0.0015,,0.151083,0.091529,3.66,299.455714,299.357143,...,4.12,78.27,3.66,16.674286,4.428571,27.928571,7.785714,32.8,22.8,3.0
2008-05-20,sj,2008,21,,-0.019867,0.124329,0.125686,0.0,299.69,299.728571,...,2.2,73.015714,0.0,15.775714,4.342857,28.057143,6.271429,33.3,24.4,0.3
2008-05-27,sj,2008,22,0.0568,0.039833,0.062267,0.075914,0.76,299.78,299.671429,...,4.36,74.084286,0.76,16.137143,3.542857,27.614286,7.085714,33.3,23.3,84.1


In [70]:
y_pred = model.predict(X_test)

In [72]:
y_pred[:10]

array([ 3.14922692, -0.35606161, -2.6706645 ,  2.50460874,  6.83808864,
        6.15493921,  7.62023352,  6.93617947,  9.12909585,  8.28687106])