In [90]:
import warnings
import datetime

import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
import matplotlib.pyplot as plt
import math

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import MaxAbsScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error

from scipy import stats
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.holtwinters import SimpleExpSmoothing

from xgboost import XGBRegressor
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.svm import LinearSVR

from pandas.plotting import lag_plot

import pyarrow
import fastparquet

import statsmodels

### Lectura del dataset 

In [224]:
df = pd.read_parquet('../notebooks_tp2/sin_codificar.parquet')
df

Unnamed: 0,dni_titular_movimiento,fecha_cupon_movimiento,moneda_movimiento,producto_naranja_movimiento,tipo_producto_tarjeta_movimiento,anio_mes_cupon,sexo_descripcion,monto_ajustado,cargo_sector_desc_hist,edad_cliente,antig_cliente,comercio_cat,estado_civil_cat,region
0,0001686b52949b5461ffcbc766687e45031,2020-08-25,0,PL,0,202008,Hombre,5.52,Sector_Empleado_Comercio,61.0,92,0,Otros,REGION PAMPEANA
1,000220fa96ec5af89817894033f8099c547,2020-08-25,0,PL,0,202008,Mujer,15.68,Sector_Sin_Datos,29.0,2,0,Sin_datos,REGION CUYO
2,0002be202de47dfae9cc2304d91161be595,2020-08-25,0,PL,0,202008,Mujer,5.46,Sector_Sin_Datos,28.0,95,0,Soltero,REGION PAMPEANA
3,000e137d0af42e193be1ff670c00d4d1506,2020-08-25,0,PL,0,202008,Hombre,2.50,Sector_Empleado_Comercio,40.0,151,0,Soltero,REGION PAMPEANA
4,0009d010e4faf69552a814a33832b185877,2020-08-25,0,PL,0,202008,Mujer,2.10,Sector_Empleado_Comercio,36.0,87,0,Soltero,REGION PAMPEANA
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
124309,0001f61dd2845a7e653ebfdaf22dab3b373,2021-05-14,0,PC,3,202105,Mujer,20329.24,Sector_Empleado_Comercio,48.0,98,0,Casado,REGION PATAGONIA
124310,0001f61dd2845a7e653ebfdaf22dab3b373,2021-05-14,0,PC,3,202105,Mujer,20329.24,Sector_Empleado_Comercio,48.0,98,0,Casado,REGION PATAGONIA
124311,0006c15ca823454b68c189da1344d9d7317,2021-05-14,0,PC,3,202105,Hombre,13210.67,Sector_No_Operativo,66.0,105,0,Casado,REGION NORDESTE
124312,0000ab27a0ed815f947df8bcb834ff97975,2021-05-03,0,PC,3,202105,Hombre,121.73,Sector_Empleado_Comercio,74.0,140,0,Soltero,REGION NOROESTE


In [225]:
df.loc[:,'fecha_cupon_movimiento'] = pd.to_datetime(df['fecha_cupon_movimiento']).astype(int)/10**11

In [226]:
df.loc[:, 'moneda_movimiento'] = df['moneda_movimiento'].astype(str)

In [227]:
df.dtypes

dni_titular_movimiento               object
fecha_cupon_movimiento              float64
moneda_movimiento                    object
producto_naranja_movimiento          object
tipo_producto_tarjeta_movimiento     object
anio_mes_cupon                        int64
sexo_descripcion                     object
monto_ajustado                      float64
cargo_sector_desc_hist               object
edad_cliente                        float64
antig_cliente                         int64
comercio_cat                         object
estado_civil_cat                     object
region                               object
dtype: object

In [228]:
ordinal_ft = 'dni_titular_movimiento'
target = 'monto_ajustado'

# Features numericas
num_features = ['anio_mes_cupon', 'edad_cliente', 'antig_cliente', 'fecha_cupon_movimiento']
# Features categoricas
cat_features = ['producto_naranja_movimiento', 'tipo_producto_tarjeta_movimiento', 'moneda_movimiento',
               'sexo_descripcion', 'cargo_sector_desc_hist', 'comercio_cat', 'estado_civil_cat', 'region']
# Features categoricas de la transaccion
trans_ft = ['producto_naranja_movimiento', 'tipo_producto_tarjeta_movimiento', 'moneda_movimiento', 'comercio_cat']
# Features categoricas del cliente
client_ft = ['sexo_descripcion', 'cargo_sector_desc_hist', 'estado_civil_cat', 'region']

In [180]:
df = df.sort_values(by = ['anio_mes_cupon'], ascending = True)
X = df.drop([ordinal_ft, target], axis=1)
y = df[target]

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3, random_state=0, shuffle=False)

In [196]:
#PIPELINE PARA REGRESION
# 1.codificar los features categoricos
# 2.Agrupar por fecha y sumar los features de la transaccion
# 3.Separar en train y validacion
# 4.Escalar los features numericos y los categoricos codificados de la transaccion

# el punto 2 sacrifica datos, usar si el modelo es pesado

df = df.sort_values(by = ['anio_mes_cupon'], ascending = True)
X = df.drop([ordinal_ft, target], axis=1)
y = df[target]

#cat_transformer = OneHotEncoder(handle_unknown='ignore')
cat_transformer = pd.get_dummies()

encoder = ColumnTransformer(
    transformers=[
        ('cat', cat_transformer, cat_features)])

X = encoder.fit_transform(X)

X.shape

In [241]:
df_cod = pd.get_dummies(df[num_features + trans_ft + client_ft])

In [243]:
df_cod[target] = df[target]
df_cod[ordinal_ft] = df[ordinal_ft]

In [240]:
mes_group = ['dni_titular_movimiento', 'anio_mes_cupon']
fecha_group = ['dni_titular_movimiento', 'fecha_cupon_movimiento']

df_mes = df_cod.groupby(cols_group).agg(aggr).reset_index() # edad y antiguedad

In [243]:
aggr = {'edad_cliente':'max', 'antig_cliente':'max'} 
aggr.update(dict.fromkeys(df.columns[5:26], 'max'))
aggr.update({'monto_ajustado':'sum'})
aggr.update(dict.fromkeys(df.columns[26:], 'sum'))
aggr

In [240]:
df_cod

Unnamed: 0,anio_mes_cupon,edad_cliente,antig_cliente,fecha_cupon_movimiento,producto_naranja_movimiento_AX,producto_naranja_movimiento_EX,producto_naranja_movimiento_MC,producto_naranja_movimiento_PC,producto_naranja_movimiento_PL,producto_naranja_movimiento_PN,...,estado_civil_cat_Sin_datos,estado_civil_cat_Soltero,region_REGION CUYO,region_REGION NORDESTE,region_REGION NOROESTE,region_REGION PAMPEANA,region_REGION PATAGONIA,region_SIN DATOS,dni_titular_movimiento,monto_ajustado
0,202008,61.0,92,15983136.0,0,0,0,0,1,0,...,0,0,0,0,0,1,0,0,0001686b52949b5461ffcbc766687e45031,5.52
1,202008,29.0,2,15983136.0,0,0,0,0,1,0,...,1,0,1,0,0,0,0,0,000220fa96ec5af89817894033f8099c547,15.68
2,202008,28.0,95,15983136.0,0,0,0,0,1,0,...,0,1,0,0,0,1,0,0,0002be202de47dfae9cc2304d91161be595,5.46
3,202008,40.0,151,15983136.0,0,0,0,0,1,0,...,0,1,0,0,0,1,0,0,000e137d0af42e193be1ff670c00d4d1506,2.50
4,202008,36.0,87,15983136.0,0,0,0,0,1,0,...,0,1,0,0,0,1,0,0,0009d010e4faf69552a814a33832b185877,2.10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
124309,202105,48.0,98,16209504.0,0,0,0,1,0,0,...,0,0,0,0,0,0,1,0,0001f61dd2845a7e653ebfdaf22dab3b373,20329.24
124310,202105,48.0,98,16209504.0,0,0,0,1,0,0,...,0,0,0,0,0,0,1,0,0001f61dd2845a7e653ebfdaf22dab3b373,20329.24
124311,202105,66.0,105,16209504.0,0,0,0,1,0,0,...,0,0,0,1,0,0,0,0,0006c15ca823454b68c189da1344d9d7317,13210.67
124312,202105,74.0,140,16200000.0,0,0,0,1,0,0,...,0,1,0,0,1,0,0,0,0000ab27a0ed815f947df8bcb834ff97975,121.73


In [231]:
df_cod.columns

TypeError: _vhstack_dispatcher() takes 1 positional argument but 2 were given

In [185]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3, random_state=0, shuffle=False)

X_train = encoder.fit_transform(X_train)
X_val = encoder.transform(X_val)

In [186]:
X_train.shape

(79948, 50)

In [187]:
X_val.shape

(34264, 50)

In [60]:
num_transformer = MinMaxScaler()
cat_transformer = OneHotEncoder(handle_unknown='ignore')

preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_transformer, num_features),
        ('cat', cat_transformer, cat_features)])

In [239]:
aggr = {'edad_cliente':'max', 'antig_cliente':'max'} 
aggr.update(dict.fromkeys(df.columns[5:26], 'max'))
aggr.update({'monto_ajustado':'sum'})
aggr.update(dict.fromkeys(df.columns[26:], 'sum'))
aggr

{'edad_cliente': 'max',
 'antig_cliente': 'max',
 'sexo_descripcion=Hombre': 'max',
 'sexo_descripcion=Mujer': 'max',
 'sexo_descripcion=Sin Datos': 'max',
 'cargo_sector_desc_hist=Sector_Educativo': 'max',
 'cargo_sector_desc_hist=Sector_Empleado_Comercio': 'max',
 'cargo_sector_desc_hist=Sector_Financiero': 'max',
 'cargo_sector_desc_hist=Sector_No_Operativo': 'max',
 'cargo_sector_desc_hist=Sector_Operativo': 'max',
 'cargo_sector_desc_hist=Sector_Salud': 'max',
 'cargo_sector_desc_hist=Sector_Seguridad': 'max',
 'cargo_sector_desc_hist=Sector_Sin_Datos': 'max',
 'estado_civil_cat=Casado': 'max',
 'estado_civil_cat=Otros': 'max',
 'estado_civil_cat=Sin_datos': 'max',
 'estado_civil_cat=Soltero': 'max',
 'region=REGION CUYO': 'max',
 'region=REGION NORDESTE': 'max',
 'region=REGION NOROESTE': 'max',
 'region=REGION PAMPEANA': 'max',
 'region=REGION PATAGONIA': 'max',
 'region=SIN DATOS': 'max',
 'monto_ajustado': 'sum',
 'comercio_cat=0': 'sum',
 'comercio_cat=1': 'sum',
 'comercio_c

In [240]:
cols_group = ['dni_titular_movimiento', 'fecha_cupon_movimiento']

df_final_rg = df_final_cod.groupby(cols_group).agg(aggr).reset_index() # edad y antiguedad

In [72]:
xgb = XGBRegressor(random_state=0)

In [91]:
model = Pipeline([
  ('preprocessor', preprocessor),
  ('feature_selection', SelectFromModel(LinearSVR(random_state=0))),
  ('regressor', xgb)
])

In [92]:
model.fit(X_train, y_train)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num', MinMaxScaler(),
                                                  ['anio_mes_cupon',
                                                   'edad_cliente',
                                                   'antig_cliente']),
                                                 ('cat',
                                                  OneHotEncoder(handle_unknown='ignore'),
                                                  ['producto_naranja_movimiento',
                                                   'tipo_producto_tarjeta_movimiento',
                                                   'sexo_descripcion',
                                                   'cargo_sector_desc_hist',
                                                   'comercio_cat',
                                                   'estado_civil_cat',
                                                   'region'])])),
                ('fe

In [93]:
# Predict
y_train_pred = model.predict(X_train)
y_valid_pred = model.predict(X_val)

In [94]:
# evaluate MSE:
train_error = mean_squared_error(y_train, y_train_pred)
val_error = mean_squared_error(y_val, y_valid_pred)
print(f'Train error {train_error.round(3)}, Valid error {val_error.round(3)}')  

Train error 13044569.069, Valid error 12966657.544


In [95]:
# evaluate RMSE:
train_error = np.sqrt(mean_squared_error(y_train, y_train_pred))
val_error = np.sqrt(mean_squared_error(y_val, y_valid_pred))
print(f'Train error RMSE {train_error.round(3)}, Valid error RMSE {val_error.round(3)}')  

Train error RMSE 3611.727, Valid error RMSE 3600.925


In [96]:
# evaluate MAE:
train_error = mean_absolute_error(y_train, y_train_pred)
val_error = mean_absolute_error(y_val, y_valid_pred)
print(f'Train error MAE {train_error.round(3)}, Valid error MAE {val_error.round(3)}')  

Train error MAE 1502.347, Valid error MAE 1547.518


In [153]:
import datetime
 
def week_number_of_month(date_value):
     return (date_value.isocalendar()[1] - date_value.replace(day=1).isocalendar()[1] + 1)

In [155]:
semana = df['fecha_cupon_movimiento'].apply(week_number_of_month)

In [139]:
df.groupby(pd.Grouper(key='fecha_cupon_movimiento',freq='W')).agg(sum)
#weeks = [g for n, g in df.set_index('timestamp').groupby(pd.TimeGrouper('W'))]

Unnamed: 0_level_0,moneda_movimiento,anio_mes_cupon,monto_ajustado,edad_cliente,antig_cliente
fecha_cupon_movimiento,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2020-07-05,2,1212042,-8993.26,370.0,570
2020-07-12,0,2424084,-6046.6,566.0,1222
2020-07-19,4,2020070,1061.66,452.0,1164
2020-07-26,339,88883104,777796.43,21453.0,47849
2020-08-02,1412,335130070,4219464.84,83478.0,193827
2020-08-09,963,290891520,4829752.95,73038.0,160495
2020-08-16,1213,333717216,3884872.48,81498.0,177631
2020-08-23,989,711876192,2561579.99,179404.0,428166
2020-08-30,1395,985597132,3947880.69,240565.0,522487
2020-09-06,1246,288872724,4092190.26,71855.0,155263
