In [237]:
import numpy as np
from scipy.stats.mstats import mode
from copy import deepcopy
import pandas as pd
import time
%matplotlib inline
from helper import *
from IDWmodel import *

In [238]:
# ### Building a reference table with the rounded average value of each sensor per `day_time` bucket
# `num_minutes` sets the width, in minutes, of the buckets the time is grouped into
def build_avg_time_table(df_train, num_minutes=5, col_names=None):
    """Build a table of rounded mean sensor readings per ``day_time`` bucket.

    The bucket key is derived from ``df_train['time']``: the value modulo
    10000 is split into its hundreds part (kept as is) and its last two
    digits, which are floored to a multiple of ``num_minutes``.
    (Presumably ``time`` ends in HHMM -- TODO confirm against the loader.)

    Side effect: the key is written to ``df_train['day_time']`` IN PLACE.
    The prediction functions in this notebook merge on ``df_train['day_time']``,
    so this mutation is kept deliberately.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Must contain a ``time`` column and every column in ``col_names``.
        A reading of ``-1`` is treated as missing and excluded from the mean.
    num_minutes : int, default 5
        Width of the bucket applied to the last two digits of ``time``.
    col_names : sequence of str, optional
        Sensor columns to average. Defaults to ``'S1'`` .. ``'S56'``.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``day_time``, one column per sensor. The index is the set
        of buckets in which the FIRST sensor has at least one valid reading;
        other sensors are left-joined onto it, so a sensor with no valid
        reading in a bucket gets NaN there.

    Raises
    ------
    ValueError
        If ``col_names`` is given but empty.
    """
    if col_names is None:
        col_names = ['S' + str(i) for i in range(1, 57)]
    else:
        col_names = list(col_names)
    if not col_names:
        raise ValueError('col_names must contain at least one sensor column')

    # The original author noted that changing the bucket width did not seem
    # to have an effect on the results.
    hundreds_part = (df_train['time'] % (100 * 100)) // 100 * 100
    bucketed_part = ((df_train['time'] % 100) // num_minutes) * num_minutes
    df_train['day_time'] = hundreds_part + bucketed_part

    def rounded_bucket_means(col_name):
        # Drop missing readings (-1) before averaging, then round the means.
        valid = df_train.loc[df_train[col_name] != -1, [col_name, 'day_time']]
        return valid.groupby('day_time').mean().apply(pd.Series.round)

    # The first sensor defines the index; the rest are left-joined onto it.
    df_day_avg_values = rounded_bucket_means(col_names[0])
    for col_name in col_names[1:]:
        df_day_avg_values = df_day_avg_values.join(rounded_bucket_means(col_name))

    return df_day_avg_values

In [239]:
# Training frame from the project helper (its schema is not visible in this notebook).
df_train = load_train_data()
# Output of the IDW model read from disk; presumably the sensor table with
# missing readings already filled in -- TODO confirm.
df_IDWmodel = pd.read_csv('data/IDWmodel_train.csv')
# NOTE: this call also adds a 'day_time' column to df_train in place; the
# prediction functions below merge on that column, so it must run first.
df_day_avg_values = build_avg_time_table(df_train)

In [246]:
import sklearn
from sklearn.linear_model import LinearRegression
from sklearn import linear_model


def prediction(df_train, col_names, df_day_avg_values, adjacency_list, df_model, prediction_model, do_rounding = False, use_avg_feature = False):
    """Fill the missing readings (-1) of every sensor with a per-sensor regression.

    For each sensor column, a model is fitted on the rows where the sensor
    has a real reading and is then used to predict the rows where the
    reading is -1. The features are the neighbouring sensors' values taken
    from ``df_model`` plus three staircase indicator columns.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Raw readings; must contain ``day_time`` and every column in
        ``col_names``. It is not modified.
    col_names : sequence of str
        Sensor columns to process, named ``'S<number>'``.
    df_day_avg_values : pandas.DataFrame
        Table indexed by ``day_time`` with one column per sensor. It is
        inner-merged on ``day_time``, so rows of ``df_train`` whose
        ``day_time`` is absent from this table are dropped from the fit and
        end up NaN in a column that gets rewritten.
    adjacency_list : mapping
        Indexed by the integer sensor number; yields the neighbour numbers.
    df_model : pandas.DataFrame
        Source of the neighbour feature values, aligned on the index of
        ``df_train``.
    prediction_model : estimator
        Object with ``fit(X, y)`` returning an estimator and ``predict(X)``.
        The same object is refitted for every sensor.
    do_rounding : bool, default False
        Round the rewritten columns with ``np.round``.
    use_avg_feature : bool, default False
        NOTE(review): the original code merged the bucket average in as
        ``<col>avg`` and then selected only ``'Y'``, so the average never
        reached the model. ``False`` keeps that behaviour; ``True`` passes
        ``<col>avg`` to the model as an extra feature. NaN averages are not
        handled here.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df_train`` in which every processed column that had at
        least one -1 is replaced by a float column with predictions filled in.
    """
    # Manually curated sensors next to each staircase.
    staircaseA_nodes = ['S42', 'S46']
    staircaseB_nodes = ['S34', 'S35']
    staircaseC_nodes = ['S52', 'S53']

    # Dataframe to store the model prediction
    df_model_lr = df_train.copy()

    for col in col_names:
        avg_col = col + 'avg'

        # Target column becomes 'Y'; the merge both attaches the bucket
        # average and filters out rows without a matching day_time bucket.
        X = df_train.rename(columns={col: 'Y'})
        X = pd.merge(X, df_day_avg_values[[col]], left_on='day_time', right_index=True)
        X = X.rename(columns={col: avg_col})

        # Neighbours (from the adjacency list), with values taken from df_model
        neighbors_col = ['S' + str(n) for n in adjacency_list[int(col[1:])]]
        kept_cols = ['Y', avg_col] if use_avg_feature else ['Y']
        X = X[kept_cols].join(df_model[neighbors_col])

        # Staircase indicators; each is constant within one sensor's model.
        X['sA'] = (col in staircaseA_nodes) * 1.
        X['sB'] = (col in staircaseB_nodes) * 1.
        X['sC'] = (col in staircaseC_nodes) * 1.

        is_missing = X['Y'] == -1
        X_train = X[~is_missing]
        X_test = X[is_missing]

        if len(X_test):
            # Cast first so float predictions can be written into an
            # integer-typed sensor column without relying on implicit upcasting.
            col_values = X['Y'].astype(float)

            prediction_model = prediction_model.fit(X_train.drop('Y', axis=1), X_train.Y)
            # .loc is the label-based replacement for the removed .ix indexer.
            col_values.loc[X_test.index] = prediction_model.predict(X_test.drop('Y', axis=1))

            # Filling the result with the current sensor prediction
            df_model_lr[col] = np.round(col_values) if do_rounding else col_values
    return df_model_lr

In [243]:
# Sensor column names 'S1' .. 'S56'.
col_names = ['S'+str(i) for i in xrange(1, 57)]
# Neighbour lookup built by a project helper; the meaning of 27. is not
# visible here (presumably a distance threshold -- TODO confirm).
adjacency_list = compute_adjlist(27.)

In [244]:
clf = linear_model.LassoLarsCV(positive=True, max_iter=1500)

# First pass: neighbour features come from the IDW model output.
df_model_lr = prediction(df_train, col_names, df_day_avg_values, adjacency_list, df_IDWmodel, clf)

# Six refinement passes: rebuild the average table from the latest result,
# then predict again using that result as the neighbour source.
# Only the last pass rounds its output.
for round_output in [False] * 5 + [True]:
    df_day_avg_values = build_avg_time_table(df_model_lr)
    df_model_lr = prediction(df_train, col_names, df_day_avg_values, adjacency_list, df_model_lr, clf, do_rounding = round_output)

ValueError: A value in x_new is below the interpolation range.

In [245]:
# Hand the iteratively refined table to the project helper, which presumably
# writes the submission file at the given path (helper not visible here).
create_submission_file(df_model_lr, 'models/lr_model_leo_v7.csv')

In [252]:
def prediction_augmented(df_train, col_names, df_day_avg_values, adjacency_list, df_model, prediction_model, window_size=10, do_rounding = False):
    staircaseA_nodes = ['S42', 'S46']
    staircaseB_nodes = ['S34', 'S35']
    staircaseC_nodes = ['S52', 'S53']
    
    # Dataframe to store the model prediction
    df_model_lr = df_model.copy()
    
    # Building the moving sum for the features before/after for each neighbor
    model_curr_before = pd.rolling_sum(df_model.sort(ascending=False), window_size+1) - df_model
    model_curr_after = pd.rolling_sum(df_model, window_size+1) - df_model
    model_curr_before = model_curr_before.rename(columns={col:col+'before' for col in col_names})
    model_curr_after = model_curr_after.rename(columns={col:col+'after' for col in col_names})
    window_features = model_curr_after.join(model_curr_before[[col_+'before' for col_ in col_names]])
    
    for col in col_names:
        # X will store the features and the outcome Y
        X = df_train.copy()
        X = X.rename(columns={col:'Y'})
        X = pd.merge(X, df_day_avg_values[[col]], left_on='day_time', right_index=True)
        X = X.rename(columns={col:col+'avg'})

        # Building the neighbors (from adjacency list) with missing values filled as in model
        neighbors_col = ['S'+str(n) for n in adjacency_list[int(col[1:])]]
        
        X = X[['Y']].join(df_model[neighbors_col])
        X = X.join(window_features[[col_+'before' for col_ in neighbors_col] + [col_+'after' for col_ in neighbors_col]])
        # Removing the first and last element impossible to compute given the window_size
        X = X.sort()[window_size: - window_size]
        
        # augment with staircase info
        X['sA'] = (col in staircaseA_nodes) * 1.
        X['sB'] = (col in staircaseB_nodes) * 1.
        X['sC'] = (col in staircaseC_nodes) * 1.

        X_train = X[X['Y'] != -1]
        X_test = X[X['Y'] == -1]
        test_indices = X[X['Y'] == -1].index
        col_values = df_model_lr[col]

        if len(X_test):
            # Models
            prediction_model = prediction_model.fit(X_train.drop('Y', axis=1), X_train.Y)
            col_values.ix[test_indices] = prediction_model.predict(X_test.drop('Y', axis=1))

            # Filling the result with the current sensor prediction
            if do_rounding:
                df_model_lr[col] = np.round(col_values)
            else:
                df_model_lr[col] = col_values
    return df_model_lr

In [253]:
clf = linear_model.LassoLarsCV(positive=True, max_iter=1500)

#clf = linear_model.RidgeCV(alphas=[0.1, 1.0, 10.0])
#clf = linear_model.BayesianRidge()
#clf = Pipeline([('poly', PolynomialFeatures(degree=3)), ('linear', linear_model.LassoLarsCV(fit_intercept=False))])
adjacency_list_leo_tuned = adjacency_list

# Seven passes in total: the first reads its features from the IDW model
# output, each later pass reads them from the previous pass's result.
# Only the last pass rounds its output.
df_model_lr_v7 = df_IDWmodel
for round_output in [False] * 6 + [True]:
    df_model_lr_v7 = prediction_augmented(df_train, col_names, df_day_avg_values, adjacency_list_leo_tuned, df_model_lr_v7, clf, window_size=10, do_rounding = round_output)



In [254]:
# Hand the window-augmented result to the project helper, which presumably
# writes the submission file at the given path (helper not visible here).
# NOTE(review): the variable is named *_v7 but the file is *_v8 -- confirm
# this is intentional.
create_submission_file(df_model_lr_v7, 'models/lr_model_leo_v8.csv')

In [121]:
# Print, for each sensor, the share of readings that are missing (== -1),
# expressed as a fraction of the non-null count of the frame's first column.
# NOTE(review): `df` is not defined in any visible cell of this notebook --
# confirm which frame it refers to before a fresh-kernel run.
total_rows = df.count()[0]
for sensor_number in range(1, 57):
    sid = 'S' + str(sensor_number)
    missing_mask = df[sid] == -1
    print(sid + ' ' + str(np.sum(missing_mask * 1.) / (1. * total_rows)))