In [1]:
import numpy as np
from scipy.stats.mstats import mode
from copy import deepcopy
import pandas as pd
import time
%matplotlib inline
from helper import *
from IDWmodel import *

In [25]:
# ### Building a reference table with average daily value of the sensor
def build_avg_time_table(df_train, col_names=None):
    """Build a table of rounded mean sensor readings per ``day_time`` slot.

    ``day_time`` is computed as ``time % 10000`` and stored on ``df_train``
    as a column.  Readings equal to -1 are treated as missing and are left
    out of the mean.

    Every slot in which at least one requested sensor has a valid reading is
    kept; a sensor with no valid reading in a slot gets NaN there.  (Chaining
    left joins onto the first sensor's table would instead drop, for all
    sensors, any slot where that first sensor happens to have no data.)

    Parameters
    ----------
    df_train : pandas.DataFrame
        Must contain a ``time`` column and one column per requested sensor.
        Modified in place: a ``day_time`` column is added or overwritten.
    col_names : sequence of str, optional
        Sensor columns to average.  Defaults to ``'S1'`` .. ``'S56'``.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``day_time``, one column per sensor in ``col_names`` order,
        holding the rounded means as floats.
    """
    # Kept as an in-place side effect: the column has always been written to
    # the caller's frame and other code may read it from there.
    df_train['day_time'] = df_train['time'] % 10000

    if col_names is None:
        # `range` instead of the Python-2-only `xrange`.
        col_names = ['S' + str(i) for i in range(1, 57)]
    col_names = list(col_names)

    # Mask the -1 sentinel so that groupby().mean() skips it per sensor.
    readings = df_train[col_names]
    valid_readings = readings.where(readings != -1)

    slot_means = valid_readings.groupby(df_train['day_time']).mean()

    # A slot where no sensor has any valid reading carries no information.
    slot_means = slot_means.dropna(how='all')

    return slot_means.apply(pd.Series.round)

In [26]:
# Load the training frame through a project helper made available by the
# star imports at the top of the notebook.
# NOTE(review): the schema is not visible here; later cells read a `time`
# column and sensor columns named S1..S56 from this frame.
df_train = load_train_data()

In [27]:
# Previously saved output of the IDW model, read from a path relative to the
# notebook's working directory.  It is used below as the first source of
# neighbour-sensor features.
# NOTE(review): presumably row-aligned with df_train and with the -1 gaps
# already filled — confirm against the notebook that wrote this file.
df_IDWmodel = pd.read_csv('data/IDWmodel_train.csv')

In [37]:
# Build the per-`day_time` average table.  Note that this call also writes a
# `day_time` column onto df_train as a side effect.
df_day_avg_values = build_avg_time_table(df_train)
# Preview the first rows of the table.
df_day_avg_values.head(20)

Unnamed: 0_level_0,S1,S2,S3,S4,S5,S6,S7,S8,S9,S10,...,S47,S48,S49,S50,S51,S52,S53,S54,S55,S56
day_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [38]:
# Sanity check: the inner .max() gives one maximum per sensor column, np.max
# reduces those to the single largest rounded average in the whole table.
np.max(df_day_avg_values.max())

4.0

In [39]:
from sklearn.linear_model import LinearRegression

def lr_prediction(df_train, col_names, df_day_avg_values, adjacency_list, df_model):
    """Fill each sensor's missing readings with a linear regression on its neighbours.

    For every sensor column in ``col_names`` a ``LinearRegression`` is fitted
    on the rows where the sensor has a reading (value != -1), using the
    neighbouring sensors' values taken from ``df_model`` as regressors.  The
    fitted model then predicts the rows where the sensor reads -1.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Raw readings; -1 marks a missing value.  Not modified.
    col_names : iterable of str
        Sensor columns to process, named ``'S<number>'``.
    df_day_avg_values : pandas.DataFrame
        Accepted for interface compatibility but not used.
        NOTE(review): the earlier code merged this table in and then
        immediately discarded the merged column, so it never reached the
        regression.  That merge has been removed because it only cost a
        full-frame copy per sensor and could drop rows (inner join) that
        then came back as NaN in the result.  Decide whether the average
        should actually become a regressor.
    adjacency_list : indexable
        ``adjacency_list[k]`` yields the sensor numbers neighbouring sensor
        ``k``, where ``k`` is the integer parsed from the column name.
    df_model : pandas.DataFrame
        Source of the neighbour columns, joined to ``df_train`` on the index.
        NOTE(review): presumably has no -1 gaps left in it — confirm.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df_train`` in which, for each processed sensor with at
        least one missing reading, the -1 entries are replaced by predictions.
    """
    # Dataframe to store the model prediction
    df_model_lr = df_train.copy()

    for col in col_names:
        # Neighbours come from the adjacency list, keyed by the sensor number
        # embedded in the column name ('S12' -> 12).
        neighbors_col = ['S' + str(n) for n in adjacency_list[int(col[1:])]]

        # Outcome Y (raw readings of this sensor) plus neighbour features.
        X = df_train[[col]].rename(columns={col: 'Y'}).join(df_model[neighbors_col])

        is_missing = X['Y'] == -1
        if not is_missing.any():
            # Nothing to predict for this sensor; keep its column untouched.
            continue

        X_train = X[~is_missing]
        X_test = X[is_missing]

        lr = LinearRegression()
        lr.fit(X_train.drop('Y', axis=1), X_train['Y'])

        # Work on a float copy: predictions are floats, and writing through a
        # Series taken from X would be chained assignment.  `.loc` replaces
        # the `.ix` indexer that no longer exists in pandas >= 1.0.
        col_values = X['Y'].astype(float)
        col_values.loc[is_missing] = lr.predict(X_test.drop('Y', axis=1))

        # Filling the result with the current sensor prediction
        df_model_lr[col] = col_values
    return df_model_lr

In [40]:
# Sensor column names S1..S56.  `range` yields the same list as the
# Python-2-only `xrange` and also runs on Python 3.
col_names = ['S' + str(i) for i in range(1, 57)]
# NOTE(review): 27. is presumably the distance threshold below which two
# sensors count as neighbours — confirm against compute_adjlist.
adjacency_list = compute_adjlist(27.)

In [44]:
# Two regression passes.  The first takes its neighbour features from the IDW
# model output; the second re-runs the regression using the first pass's
# filled values as neighbour features.
# NOTE(review): the trailing numbers are presumably the evaluation score
# obtained after each pass — confirm which metric.
df_model_lr = lr_prediction(df_train, col_names, df_day_avg_values, adjacency_list, df_IDWmodel) # 5.78
df_model_lr = lr_prediction(df_train, col_names, df_day_avg_values, adjacency_list, df_model_lr) # 5.56

In [45]:
# Hand the second-pass predictions to the project helper together with the
# target path.
# NOTE(review): presumably writes the submission CSV at that path and expects
# the `models/` directory to exist — confirm in the helper.
create_submission_file(df_model_lr, 'models/lr_model_leo_v1.csv')