In [107]:
import numpy as np
from scipy.stats.mstats import mode
from copy import deepcopy
import pandas as pd
import time
%matplotlib inline
from helper import *
from IDWmodel import *

In [108]:
# ### Building a reference table with the average value of each sensor per time-of-day bucket
def build_avg_time_table(df_train, col_names=None, bucket_minutes=5):
    """Build a table of rounded mean sensor readings per time-of-day bucket.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Must contain an integer-like ``time`` column and every column listed
        in ``col_names``. A reading of ``-1`` is treated as missing and is
        excluded from the mean.
        NOTE(review): the bucket arithmetic assumes the last four decimal
        digits of ``time`` encode HHMM -- TODO confirm against the loader.
    col_names : sequence of str, optional
        Sensor columns to average. Defaults to ``'S1'`` .. ``'S56'``.
    bucket_minutes : int, optional
        Width of the minute bucket (5 by default; 10 and 30 were the other
        values experimented with in earlier revisions of this cell).

    Returns
    -------
    pandas.DataFrame
        Indexed by ``day_time`` (hour part * 100 + minute floored to the
        bucket), one column per sensor, values rounded to whole numbers.
        A bucket in which a sensor has no valid reading holds NaN.

    Side effects
    ------------
    Writes (or overwrites) a ``day_time`` column on ``df_train`` *in place*.
    This is kept on purpose: ``prediction`` reads ``day_time`` from the frame
    it is given.
    """
    if bucket_minutes <= 0:
        raise ValueError("bucket_minutes must be a positive number of minutes")

    if col_names is None:
        col_names = ['S' + str(i) for i in range(1, 57)]
    else:
        col_names = list(col_names)

    # Keep the hour digits as-is and floor the minute digits to the bucket.
    hour_part = (df_train['time'] % (100 * 100)) // 100 * 100
    minute_part = ((df_train['time'] % 100) // bucket_minutes) * bucket_minutes
    df_train['day_time'] = hour_part + minute_part

    # Mask the -1 sentinel as NaN so a single groupby can skip missing
    # readings per sensor. The previous implementation left-joined every
    # sensor onto S1's buckets, which silently dropped any bucket where S1
    # had no valid reading -- for all sensors, not just S1.
    readings = df_train[col_names]
    valid_readings = readings.where(readings != -1)
    return valid_readings.groupby(df_train['day_time']).mean().round()

In [109]:
# Load the training readings. `load_train_data` is not defined in this
# notebook; it comes from one of the star imports above (presumably `helper`
# -- TODO confirm).
df_train = load_train_data()

In [110]:
# Precomputed IDW-model table; it is passed to `prediction` below as the
# initial source of neighbour feature values.
df_IDWmodel = pd.read_csv('data/IDWmodel_train.csv')

In [111]:
# Per-bucket average table from the raw training data.
# NOTE: this call also adds a 'day_time' column to df_train in place; the
# day-average lookup in `prediction` reads that column, so this cell must run
# before any `prediction` call.
df_day_avg_values = build_avg_time_table(df_train)
df_day_avg_values.head(20)

Unnamed: 0_level_0,S1,S2,S3,S4,S5,S6,S7,S8,S9,S10,...,S47,S48,S49,S50,S51,S52,S53,S54,S55,S56
day_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
15,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
20,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
25,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
30,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
35,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
40,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
45,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [112]:
# Rebuild the table from the IDW-model frame instead. This overwrites the
# table computed from df_train above and adds a 'day_time' column to
# df_IDWmodel in place.
df_day_avg_values = build_avg_time_table(df_IDWmodel)

In [113]:
# Largest rounded bucket average across all sensors (column maxima, then the
# maximum of those).
df_day_avg_values.max().max()

2.0

In [121]:
# add staircase/restroom binary features
# augment manually

In [200]:
import sklearn
from sklearn.linear_model import LinearRegression
from sklearn import linear_model


def prediction(df_train, col_names, df_day_avg_values, adjacency_list, df_model, prediction_model, do_rounding = False, use_day_avg = False):
    """Fill the missing (-1) readings of each sensor with a per-sensor regression.

    For every sensor column a model is fitted on the rows where the sensor
    has a reading and then used to predict the rows where it is -1. The
    features are the sensor's neighbours (taken from ``df_model``) plus three
    staircase indicator columns.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Observed readings; ``-1`` marks a missing value. Not modified.
    col_names : iterable of str
        Sensor columns to process, named ``'S<number>'``.
    df_day_avg_values : pandas.DataFrame
        Per-``day_time`` averages with one column per sensor. Only read when
        ``use_day_avg`` is true.
    adjacency_list : mapping or sequence
        Indexed by the integer sensor number; yields the neighbour numbers.
    df_model : pandas.DataFrame
        Frame supplying the neighbour feature values, joined on the index.
    prediction_model : estimator
        Object with ``fit(X, y)`` (returning the fitted estimator) and
        ``predict(X)``. It is refitted for every sensor.
    do_rounding : bool, optional
        Round the filled column with ``np.round`` before storing it.
    use_day_avg : bool, optional
        When true, add the sensor's bucket average (looked up through
        ``df_train['day_time']``) as a ``'<col>avg'`` feature. Rows whose
        ``day_time`` is absent from ``df_day_avg_values`` get NaN there.
        Defaults to False, which keeps the feature set the model has always
        received.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df_train`` with every sensor that had missing rows replaced
        by the observed-plus-predicted column (float dtype).

    Notes
    -----
    NOTE(review): the previous implementation merged the day average into X
    and then discarded it with ``X[['Y']].join(...)``, so the average never
    reached the model; its inner merge could also drop rows, which then
    surfaced as NaN in the result. The feature is now explicit and opt-in.
    The staircase flags are constant within one sensor's design matrix.
    """
    staircase_groups = (
        ('sA', ('S42', 'S46')),
        ('sB', ('S34', 'S35')),
        ('sC', ('S52', 'S53')),
    )

    # Dataframe to store the model prediction
    df_model_lr = df_train.copy()

    for col in col_names:
        # Outcome Y plus the neighbours (from the adjacency list), with the
        # neighbours' missing values already filled as in df_model.
        neighbors_col = ['S' + str(n) for n in adjacency_list[int(col[1:])]]
        X = df_train[[col]].rename(columns={col: 'Y'}).join(df_model[neighbors_col])

        if use_day_avg:
            X[col + 'avg'] = df_train['day_time'].map(df_day_avg_values[col])

        # Augment with staircase info.
        for flag_name, nodes in staircase_groups:
            X[flag_name] = (col in nodes) * 1.

        missing = X['Y'] == -1
        if not missing.any():
            # Nothing to impute: keep the observed column untouched.
            continue

        features = X.drop('Y', axis=1)
        prediction_model = prediction_model.fit(features[~missing], X.loc[~missing, 'Y'])

        # Work on a float copy so writing float predictions neither depends
        # on implicit upcasting nor writes through to X. `.loc` replaces the
        # removed `.ix` indexer.
        col_values = X['Y'].astype(float)
        col_values.loc[missing] = prediction_model.predict(features[missing])

        # Filling the result with the current sensor prediction
        df_model_lr[col] = np.round(col_values) if do_rounding else col_values
    return df_model_lr

In [201]:
# Sensor column names 'S1' .. 'S56'. `range` (instead of the Python-2-only
# `xrange`) keeps this cell runnable on both Python 2 and 3.
col_names = ['S'+str(i) for i in range(1, 57)]
# NOTE(review): 27. is presumably a distance threshold for neighbourhood
# membership -- TODO confirm against `compute_adjlist` in the project module.
adjacency_list = compute_adjlist(27.)

In [215]:
# Iterative refinement of the imputed readings.
#   1. Seed: predict using the IDW-model table as the neighbour features
#      (with whatever `df_day_avg_values` currently holds).
#   2. Repeat N_REFINEMENT_PASSES times: rebuild the per-bucket averages from
#      the latest prediction, then predict again using that prediction as the
#      neighbour features. Only the final pass rounds the result.
# This is the same call sequence as the previously unrolled version
# (1 seed call + 6 rebuild/predict pairs).
#
# An earlier variant that skipped the average rebuild between passes
# (1 seed call + 7 further predictions, the last one rounded) was annotated
# "5.56" (presumably its score -- TODO confirm).
N_REFINEMENT_PASSES = 6

clf = linear_model.LassoLarsCV(positive=True, max_iter=1500)
df_model_lr = prediction(df_train, col_names, df_day_avg_values, adjacency_list, df_IDWmodel, clf)
for pass_idx in range(N_REFINEMENT_PASSES):
    is_final_pass = pass_idx == N_REFINEMENT_PASSES - 1
    df_day_avg_values = build_avg_time_table(df_model_lr)
    df_model_lr = prediction(df_train, col_names, df_day_avg_values, adjacency_list, df_model_lr, clf, do_rounding = is_final_pass)

In [216]:
# Write the final (rounded) prediction to disk. `create_submission_file` is
# not defined in this notebook; it comes from one of the star imports above.
create_submission_file(df_model_lr, 'models/lr_model_leo_v6.csv')