In [54]:
import tools
import datetime
import numpy as np




In [156]:
df = tools.load_dataframe()

In [3]:
df.head()

Unnamed: 0,DAY,HOUR,OPCAT_CATEGORY,SIZE,SUBBRANCH_ID
0,2014-10-25,16,SM,26,1005609
1,2014-04-14,11,SM,21,1005625
2,2014-10-25,11,SM,36,1005609
3,2015-08-14,12,CM,175,1006034
4,2014-04-14,12,SM,29,1005625


# Baseline: predict average value

In [161]:
dates = [
    datetime.datetime.strptime("01.03.2015", "%d.%m.%Y").date(),
    datetime.datetime.strptime("01.04.2015", "%d.%m.%Y").date(),
    datetime.datetime.strptime("01.08.2015", "%d.%m.%Y").date(),
    datetime.datetime.strptime("01.09.2015", "%d.%m.%Y").date(),
    datetime.datetime.strptime("01.10.2015", "%d.%m.%Y").date(),
]
def filter_standart_work_hours(df, start_time=8, end_time=19):
    return df[(df["HOUR"] >= start_time) & ((df["HOUR"] <= end_time))]

def filter_category(df, category):
    return df[(df["OPCAT_CATEGORY"] == category)]

def make_train_test(df, bounder_data, acc, delta=datetime.timedelta(days=31)):    
    train = df[df["DAY"] < bounder_data]
    test = df[(df["DAY"] > bounder_data) & (df["DAY"] > bounder_data + delta)]

    train = filter_standart_work_hours(train)
    test = filter_standart_work_hours(test)
    
    for category in ["SM", "CM"]:
        
        train_ = filter_category(train, category)
        test_ = filter_category(test, category)
        
        train_model = train_.groupby(["SUBBRANCH_ID", "HOUR"])["SIZE"].mean()
        test_model = test_.groupby(["SUBBRANCH_ID", "HOUR"])["SIZE"].mean()
        acc_ = ((train_model - test_model)**2).mean()
        acc[category].append(acc_)
    return acc

acc = {
    'SM' : [],
    'CM' : [],
}
for data in dates:
    make_train_test(df, data, acc)
print acc
print 'SM', np.array(acc['SM']).mean(), np.array(acc['SM']).std()
print 'CM', np.array(acc['CM']).mean(), np.array(acc['CM']).std()

{'CM': [170.2016107967591, 171.09478338916668, 147.26514349942343, 157.77679956832114, 166.43012971757324], 'SM': [28.927186323232775, 26.670184082414401, 12.396111273763523, 12.87045128304009, 11.701012612561552]}
SM 18.512989115 7.62433977218
CM 162.553693394 8.97804087735


# Baseline: predict average without new year holidays

In [162]:
newyear_2014 = [
    datetime.datetime.strptime("15.12.2014", "%d.%m.%Y").date(),
    datetime.datetime.strptime("15.01.2015", "%d.%m.%Y").date()
]
newyear_2015 =  [
    datetime.datetime.strptime("15.12.2015", "%d.%m.%Y").date(),
    datetime.datetime.strptime("12.01.2016", "%d.%m.%Y").date()
]

In [163]:
def remove_interval(df, holidays):
    return df[(df["DAY"] < holidays[0]) | (df["DAY"] > holidays[1])]

df_ordinal = remove_interval(df, newyear_2014)
df_ordinal = remove_interval(df_ordinal, newyear_2015)

acc = {
    'SM' : [],
    'CM' : [],
}
for data in dates:
    make_train_test(df_ordinal, data, acc)
print acc
print 'SM', np.array(acc['SM']).mean(), np.array(acc['SM']).std()
print 'CM', np.array(acc['CM']).mean(), np.array(acc['CM']).std()

{'CM': [168.50155010822337, 167.79878331851862, 140.89994313931282, 146.76465133036211, 143.43209413721539], 'SM': [30.569068920005886, 28.501864873757352, 13.820398619673984, 14.647363940410941, 13.899756295377438]}
SM 20.2876905298 7.58451388323
CM 153.479404407 12.1242622225


# Linear solution

In [191]:
from sklearn import linear_model

def test_linear_solution(df, bounder_data, acc, delta=datetime.timedelta(days=31)):    
    train = df[df["DAY"] < bounder_data]
    test = df[(df["DAY"] > bounder_data) & (df["DAY"] > bounder_data + delta)]

    train = filter_standart_work_hours(train)
    test = filter_standart_work_hours(test)

    for category in ["SM", "CM"]:
        train_ = filter_category(train, category)
        test_ = filter_category(test, category)
        
        train_model = train_.groupby(["SUBBRANCH_ID", "HOUR"])
        test_model = test_.groupby(["SUBBRANCH_ID", "HOUR"])["SIZE"].mean()
        
        errors = []
        
        for group in train_model.groups.keys():
            x = train_model.get_group(group)["DAY"]
            y = train_model.get_group(group)["SIZE"]
            timestamps = np.array([int(x_.strftime("%s")) for x_ in x.values])
            reg = linear_model.LinearRegression(copy_X=True)
            reg.fit(timestamps.reshape((len(timestamps)), 1), y.values)
            prediction = reg.predict(int((bounder_data+delta/2).strftime("%s")))[0]
            errors.append(prediction - test_model[group[0]][group[1]])
            
        acc_ = (np.array(errors)**2).mean()
        acc[category].append(acc_)
    return acc

In [192]:
df_ordinal = remove_interval(df, newyear_2014)
df_ordinal = remove_interval(df_ordinal, newyear_2015)

acc = {
    'SM' : [],
    'CM' : [],
}
for data in dates:
    test_linear_solution(df_ordinal, data, acc)
print acc
print 'SM', np.array(acc['SM']).mean(), np.array(acc['SM']).std()
print 'CM', np.array(acc['CM']).mean(), np.array(acc['CM']).std()

{'CM': [326.19279588293244, 371.88835099204567, 186.19976973307223, 167.54088522361718, 181.07338765633079], 'SM': [17.866294335031018, 11.532559194670961, 5.2217357432548601, 8.2885844373642303, 8.7684745105031912]}
SM 10.3355296442 4.26478208368
CM 246.579037898 85.1169198509
