In [None]:
from category_encoders import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.pipeline import make_pipeline
import numpy as np
import pandas as pd

# 60-min models: DengAI

# Wrangle Data

## Import

In [None]:
# Load the training features; each row is keyed by the start date of its week.
X = pd.read_csv(
    'data/dengue_features_train.csv',
    index_col='week_start_date',
    parse_dates=['week_start_date'],
)

## EDA

### Feature Matrix

In [None]:
# Dimensions of the feature matrix as (rows, columns).
X.shape

In [None]:
# Column names, non-null counts and dtypes.
X.info()

In [None]:
# Preview the first five rows.
X.head()

In [None]:
# Number of rows recorded for each city.
X['city'].value_counts()

In [None]:
# Summary statistics for the non-numeric columns only.
X.describe(exclude='number')

### Target

In [None]:
y = pd.read_csv('data/dengue_labels_train.csv')

In [None]:
y.head()

In [None]:
y.describe(exclude='number')

In [None]:
cols = ['city', 'year', 'weekofyear']
X_values = X[cols].values
y_values = y[cols].values

In [None]:
X_values[:4]

In [None]:
y_values[:4]

In [None]:
assert np.array_equal(X_values, y_values)

In [None]:
# Read only the target column, give it the feature matrix's index (rows are
# matched by position), and reduce the one-column frame to a Series.
y = (
    pd.read_csv('data/dengue_labels_train.csv', usecols=['total_cases'])
    .set_index(X.index)
    .loc[:, 'total_cases']
)

In [None]:
# Preview the target Series, now indexed like X.
y.head()

In [None]:
# Shape of the feature matrix ...
print(X.shape)

In [None]:
# ... and of the target, printed so the row counts can be compared by eye.
print(y.shape)

# Split Data

In [None]:
# 80% of the row count; presumably the source of the row position inspected in
# the next cell when choosing where to split.
len(X)*.8

In [None]:
# Inspect the row sitting at the 80% position. The position is computed from
# len(X) rather than copied by hand from the previous cell's output, so this
# cell stays correct if the data file changes.
X.iloc[int(len(X) * .8), :]

In [None]:
# Time-based split: rows dated before the cutoff are used for training, rows on
# or after it for validation.
# NOTE(review): the resulting proportions depend on how dates are distributed
# across rows - check the shapes printed in the following cells.
cutoff = '2004-11-18'
mask = X.index < cutoff
X_train, X_val = X.loc[mask], X.loc[~mask]
y_train, y_val = y.loc[mask], y.loc[~mask]

In [None]:
# Size of the training split.
print(X_train.shape)

In [None]:
# Size of the validation split.
print(X_val.shape)

# Establish Baseline

In [None]:
# Baseline: MAE obtained by always predicting the mean of the training target.
mean_absolute_error(y_train, np.full(len(y_train), y_train.mean()))

# Build Model

In [None]:
# Pipeline: one-hot encode `city`, fill missing values with each column's mean
# (SimpleImputer's default strategy, spelled out here), then fit a linear
# regression on the result.
model = make_pipeline(
    OneHotEncoder(cols=['city']),
    SimpleImputer(strategy='mean'),
    LinearRegression(),
)

In [None]:
# Fit every pipeline step on the training split only; the trailing semicolon
# suppresses the cell's output.
model.fit(X_train, y_train);

# Check Metrics

In [None]:
# Score the fitted pipeline on both splits, then report the two errors.
train_mae = mean_absolute_error(y_train, model.predict(X_train))
val_mae = mean_absolute_error(y_val, model.predict(X_val))
print('Training MAE:', train_mae)
print('Validation MAE:', val_mae)

# Make our predictions

In [None]:
# Load the test features with the same date index used for the training data.
X_test = pd.read_csv(
    'data/dengue_features_test.csv',
    index_col='week_start_date',
    parse_dates=['week_start_date'],
)

In [None]:
# Preview the first five test rows.
X_test.head()

In [None]:
# Predict the target for every test row with the fitted pipeline.
# NOTE(review): LinearRegression output is unbounded and fractional; if these
# values are meant to be case counts, consider clipping at zero and rounding
# before using them - confirm against the expected output format.
y_pred = model.predict(X_test)

In [None]:
# Peek at the first ten predictions.
y_pred[:10]