In [21]:
import numpy as np
import pandas as pd
import util
import ml_alg
import matplotlib.pyplot as plt
import math
%matplotlib inline

In [2]:
train_path = 'dataSets/training/'
test_path = "dataSets/testing_phase1/"

In [3]:
train_df = pd.read_csv(train_path + 'data_2_0_final.csv', index_col=0)
test_df = pd.read_csv(test_path + 'data_2_0_final.csv', index_col=0)

In [4]:
y_train = train_df.pop('volume')
test_df.drop(['volume'], axis=1, inplace=True)

In [5]:
all_df = pd.concat((train_df, test_df), axis=0)

In [6]:
all_df.shape

(2172, 570)

In [7]:
all_df.head()

Unnamed: 0_level_0,pressure,sea_pressure,wind_direction,wind_speed,temperature,rel_humidity,precipitation,dayofweek,hour,A2_1,...,vt0_1_0_11,vt0_1_1_11,vt0_2_0_11,vt0_3_0_11,vt0_3_1_11,vt1_1_0_11,vt1_1_1_11,vt1_2_0_11,vt1_3_0_11,vt1_3_1_11
time_window,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2016-09-19 00:00:00,1008.2,1013.2,329,2.8,22.2,76,0.0,0,0,,...,,,,,,,,,,
2016-09-19 00:20:00,1008.2,1013.2,329,2.8,22.2,76,0.0,0,0,100.19,...,,,,,,,,,,
2016-09-19 00:40:00,1008.2,1013.2,329,2.8,22.2,76,0.0,0,0,52.63,...,,,,,,,,,,
2016-09-19 01:00:00,1008.2,1013.2,329,2.8,22.2,76,0.0,0,1,48.146667,...,,,,,,,,,,
2016-09-19 01:20:00,1008.2,1013.2,329,2.8,22.2,76,0.0,0,1,68.013478,...,,,,,,,,,,


In [8]:
all_df.isnull().sum()

pressure           0
sea_pressure       0
wind_direction     0
wind_speed         0
temperature        0
rel_humidity       0
precipitation      0
dayofweek          0
hour               0
A2_1              71
A3_1              71
B1_1              71
B3_1              71
C1_1              71
C3_1              71
ve0_1_0_1         71
ve0_1_1_1         71
ve0_2_0_1         71
ve0_3_0_1         71
ve0_3_1_1         71
ve1_1_0_1         71
ve1_1_1_1         71
ve1_2_0_1         71
ve1_3_0_1         71
ve1_3_1_1         71
vm0_1_0_1         71
vm0_1_1_1         71
vm0_2_0_1         71
vm0_3_0_1         71
vm0_3_1_1         71
                  ..
vm1_1_0_11        81
vm1_1_1_11        81
vm1_2_0_11        81
vm1_3_0_11        81
vm1_3_1_11        81
vm2_1_0_11        81
vm2_1_1_11        81
vm2_2_0_11        81
vm2_3_0_11        81
vm2_3_1_11        81
vm3_1_0_11        81
vm3_1_1_11        81
vm3_2_0_11        81
vm3_3_0_11        81
vm3_3_1_11        81
volume_1_0_11     81
volume_1_1_11

In [9]:
all_df['dayofweek'].dtypes

dtype('int64')

In [10]:
dweek = pd.get_dummies(all_df['dayofweek'], prefix='dayofweek')

In [11]:
dhour = pd.get_dummies(all_df['hour'], prefix='hour')

In [12]:
numeric_cols = all_df.columns[all_df.dtypes != 'object']
numeric_col_means = all_df.loc[:, numeric_cols].mean()
numeric_col_std = all_df.loc[:, numeric_cols].std(ddof=0)
all_df.loc[:, numeric_cols] = (all_df.loc[:, numeric_cols] - numeric_col_means) / numeric_col_std

In [13]:
all_df.drop(['hour'], axis=1, inplace=True)
all_df.drop(['dayofweek'], axis=1, inplace=True)
all_df = pd.concat([all_df, dweek, dhour], axis=1)

In [14]:
all_df.isnull().sum()

pressure             0
sea_pressure         0
wind_direction       0
wind_speed           0
temperature          0
rel_humidity         0
precipitation        0
A2_1                71
A3_1                71
B1_1                71
B3_1                71
C1_1                71
C3_1                71
ve0_1_0_1           71
ve0_1_1_1           71
ve0_2_0_1           71
ve0_3_0_1           71
ve0_3_1_1           71
ve1_1_0_1           71
ve1_1_1_1           71
ve1_2_0_1           71
ve1_3_0_1           71
ve1_3_1_1           71
vm0_1_0_1           71
vm0_1_1_1         2172
vm0_2_0_1           71
vm0_3_0_1           71
vm0_3_1_1         2172
vm1_1_0_1           71
vm1_1_1_1           71
                  ... 
volume_2_0_11       81
volume_3_0_11       81
volume_3_1_11       81
vt0_1_0_11        2172
vt0_1_1_11          81
vt0_2_0_11        2172
vt0_3_0_11        2172
vt0_3_1_11          81
vt1_1_0_11        2172
vt1_1_1_11          81
vt1_2_0_11        2172
vt1_3_0_11        2172
vt1_3_1_11 

# Training want to predict 8:00 - 8:20

In [15]:
dummy_train_df = all_df.loc[train_df.index]
dummy_test_df = all_df.loc[test_df.index]

In [16]:
dummy_train_df_1 = dummy_train_df.ix['2016-09-19 02:00:00':]

In [18]:
y_train1 = y_train.ix['2016-09-19 02:00:00':]

In [19]:
X_train = dummy_train_df.values
X_test = dummy_test_df.values

## Try ridge regression as baseline. 

In [20]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import cross_val_score
import time

### As MAPE is not listed in sklearn. Write my own metric. 

In [None]:
from sklearn.metrics import make_scorer
def MAPE(ground_truth, predictions):
    ground_truth[ground_truth == 0] = math.inf
    diff = np.abs((ground_truth - predictions)/ground_truth).mean()
    return diff

loss  = make_scorer(MAPE, greater_is_better=False)

In [22]:
alphas = np.logspace(-3, 2, 50)
test_scores = []
for alpha in alphas:
    start_time = time.time()
    clf = Ridge(alpha)
    test_score = cross_val_score(clf, X_train, y_train, cv=10, scoring=loss).mean()
    test_scores.append(test_score)
    print("alpha:" + str(alpha) + " loss = " + str(test_score))
    print("time:" + str(time.time() - start_time))

-6.9077552789821368