In [1]:
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression
from sklearn import metrics
from sklearn.model_selection import train_test_split
import numpy as np
from datetime import date

In [2]:
from sklearn.metrics import mean_absolute_error

In [3]:
from sklearn.metrics import mean_squared_error

In [4]:
# `sklearn.externals.joblib` was deprecated in scikit-learn 0.21 and removed in
# 0.23; prefer the standalone package and fall back only on very old installs.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

In [5]:
# Read the normalised training set; the file is semicolon-delimited.
train = pd.read_csv(
    '../data/processed/train_norm.csv',
    sep=';',
)

In [6]:
# Display the (rows, columns) dimensions of the training frame as loaded.
train.shape

(34540, 36)

In [7]:
# Parse the sale-date column into pandas datetimes and store it back.
parsed_sale_dates = pd.to_datetime(train['fecha_venta_norm'])
train['fecha_venta_norm'] = parsed_sale_dates

In [8]:
# Reduce the timestamps to plain `datetime.date` objects so they can be matched
# against `date(...)` literals. The column must still be datetime-typed here.
train = train.assign(fecha_venta_norm=train['fecha_venta_norm'].dt.date)

In [9]:
# Keep only the months considered good for training (November and December).
# NOTE(review): November 2012 is not part of the selection - confirm intended.
training_months = [
    date(2012, 12, 1),
    date(2013, 11, 1),
    date(2013, 12, 1),
    date(2014, 11, 1),
]
train = train[train['fecha_venta_norm'].isin(training_months)]

In [10]:
# Display the (rows, columns) dimensions again, now that the month filter ran.
train.shape

(29168, 36)

In [11]:
# Preview the first rows of the filtered training frame.
train.head()

Unnamed: 0,id_pos,fecha_venta_norm,canal,unidades,competidores,ingreso_mediana,ingreso_promedio,densidad_poblacional,pct_0a5,pct_5a9,...,pct_bachelors,pct_doctorados,pct_secundario,pct_master,pct_bicicleta,pct_omnibus,pct_subtes,pct_taxi,pct_caminata,mediana_valor_hogar
0,3142,2012-12-01,ALMACEN,12,-0.426247,-0.351729,-0.226899,0.897103,0.829056,-0.003238,...,-0.05319,0.106553,-0.166587,-0.059822,-0.560318,2.549626,-0.072296,0.698023,0.187681,0.462005
1,3142,2013-12-01,ALMACEN,7,-0.426247,-0.351729,-0.226899,0.897103,0.829056,-0.003238,...,-0.05319,0.106553,-0.166587,-0.059822,-0.560318,2.549626,-0.072296,0.698023,0.187681,0.462005
2,3143,2013-11-01,ALMACEN,2,0.354214,0.05063,-0.051683,2.805211,0.324112,-0.225755,...,-0.364825,-0.41139,0.156037,-0.003583,-0.355545,2.549626,3.413813,0.899122,-0.130961,2.016187
3,3143,2013-12-01,ALMACEN,47,0.354214,0.05063,-0.051683,2.805211,0.324112,-0.225755,...,-0.364825,-0.41139,0.156037,-0.003583,-0.355545,2.549626,3.413813,0.899122,-0.130961,2.016187
5,3144,2012-12-01,ALMACEN,41,0.540038,-0.927639,-0.871341,-0.150302,1.326987,1.213427,...,-0.480503,0.770303,-0.080324,-0.352496,-0.707499,-0.314731,-0.471935,-0.542084,-0.733816,-0.812549


In [12]:
# Work on an independent copy so feature engineering never touches `train`.
X = train.copy(deep=True)

In [13]:
# Regression target: the `unidades` column.
y = train['unidades']

In [14]:
# Remove the target column from the feature frame.
X = X.drop(columns='unidades')

In [15]:
# Fill NaNs: impute every non-object column with that column's own median.
# The filled column is assigned back explicitly. The previous chained
# `X[c].fillna(..., inplace=True)` acts on a temporary selection: it triggers a
# FutureWarning on pandas 2.x and does nothing under Copy-on-Write, which would
# leave the NaNs in place.
for c in X.columns[X.dtypes != 'object']:
    X[c] = X[c].fillna(X[c].median())

In [16]:
# One-hot encode the `canal` column: one indicator column per distinct value.
canal_dummy = pd.get_dummies(data=X['canal'])

In [17]:
# Attach the indicator columns to the features, aligning rows on the index.
X = X.merge(canal_dummy, left_index=True, right_index=True, how='inner')

In [18]:
# Drop the original `canal` column from the feature frame.
X = X.drop(columns='canal')

In [19]:
# Columns fed to the regression, in the order the coefficients are reported.
# NOTE(review): `id_pos` looks like an identifier rather than a measurement -
# confirm it is meant to be used as a numeric feature.
predictors = [
    'id_pos',
    'ingreso_mediana',
    'densidad_poblacional',
    'pct_5a9',
    'pct_10a14',
    'pct_15a19',
    'pct_20a24',
    'pct_30a34',
    'pct_40a44',
    'pct_45a49',
    'pct_50a54',
    'pct_60a64',
    'pct_65a69',
    'pct_70a74',
    'pct_75a79',
    'pct_80a84',
    'pct_85ainf',
    'pct_master',
    'pct_subtes',
    'pct_caminata',
    'mediana_valor_hogar',
    'ALMACEN',
    'MAXIKIOSCO',
]

In [20]:
# Restrict the feature frame to the chosen predictor columns, in list order.
X = X.loc[:, predictors]

#### Training Linear Regression

In [21]:
# Ordinary least-squares fit of `y` on the selected predictors.
model = LinearRegression(n_jobs=4)
model.fit(X, y)


In [22]:
# R^2 is evaluated on the same X/y the model was fitted on (in-sample score).
r_sq = model.score(X, y)

# Print the fit summary: one "label value" line per quantity.
for label, value in (
    ('coefficient of determination:', r_sq),
    ('intercept:', model.intercept_),
    ('slope:', model.coef_),
):
    print(label, value)

coefficient of determination: 0.10164873485713799
intercept: 11.913686776427651
slope: [-1.32829555e-03 -1.57397480e-01  5.76721955e-02 -4.25621831e-01
  2.41147725e-01  2.66675155e-01 -3.75264097e-01  3.63261499e-01
 -2.76644207e-01  1.97844812e-01 -2.52153378e-01  3.29153181e-01
 -4.52864105e-01  6.01455201e-01  3.68379899e-01 -7.33143214e-01
  2.49743117e-01  4.41591124e-01  1.51233095e-01 -4.40789246e-01
 -3.37807901e-01  2.06133265e+01  6.52385488e+00]


In [23]:
# In-sample predictions: X is the same feature matrix the model was fitted on.
y_pred = model.predict(X)

In [24]:
# Mean absolute error of the raw predictions against the training target.
mean_absolute_error(y_true=y, y_pred=y_pred)

15.636888295004027

In [25]:
# Mean squared error of the raw predictions against the training target.
mean_squared_error(y_true=y, y_pred=y_pred)

473.36304057859724

In [26]:
# Median of the raw predictions.
np.median(y_pred)

21.668072344111337

In [27]:
# Replace negative predictions with zero, leaving all other values untouched.
y_pred_fixed = y_pred.copy()
y_pred_fixed[y_pred_fixed < 0] = 0

In [28]:
# Mean absolute error once negative predictions have been floored at zero.
mean_absolute_error(y_true=y, y_pred=y_pred_fixed)

15.387374068810436

In [29]:
# Median of the predictions after negative values were replaced with zero.
np.median(y_pred_fixed)

21.668072344111337

In [30]:
# Persist the fitted model to disk.
joblib.dump(value=model, filename='../models/mlr_002.pkl')

['../models/mlr_002.pkl']