In [90]:
import warnings
import datetime

import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
import matplotlib.pyplot as plt
import math

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import MaxAbsScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error

from scipy import stats
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.holtwinters import SimpleExpSmoothing

from xgboost import XGBRegressor
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.svm import LinearSVR

from pandas.plotting import lag_plot

import pyarrow
import fastparquet

import statsmodels

### Lectura del dataset 

In [224]:
df = pd.read_parquet('../notebooks_tp2/sin_codificar.parquet')
df

Unnamed: 0,dni_titular_movimiento,fecha_cupon_movimiento,moneda_movimiento,producto_naranja_movimiento,tipo_producto_tarjeta_movimiento,anio_mes_cupon,sexo_descripcion,monto_ajustado,cargo_sector_desc_hist,edad_cliente,antig_cliente,comercio_cat,estado_civil_cat,region
0,0001686b52949b5461ffcbc766687e45031,2020-08-25,0,PL,0,202008,Hombre,5.52,Sector_Empleado_Comercio,61.0,92,0,Otros,REGION PAMPEANA
1,000220fa96ec5af89817894033f8099c547,2020-08-25,0,PL,0,202008,Mujer,15.68,Sector_Sin_Datos,29.0,2,0,Sin_datos,REGION CUYO
2,0002be202de47dfae9cc2304d91161be595,2020-08-25,0,PL,0,202008,Mujer,5.46,Sector_Sin_Datos,28.0,95,0,Soltero,REGION PAMPEANA
3,000e137d0af42e193be1ff670c00d4d1506,2020-08-25,0,PL,0,202008,Hombre,2.50,Sector_Empleado_Comercio,40.0,151,0,Soltero,REGION PAMPEANA
4,0009d010e4faf69552a814a33832b185877,2020-08-25,0,PL,0,202008,Mujer,2.10,Sector_Empleado_Comercio,36.0,87,0,Soltero,REGION PAMPEANA
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
124309,0001f61dd2845a7e653ebfdaf22dab3b373,2021-05-14,0,PC,3,202105,Mujer,20329.24,Sector_Empleado_Comercio,48.0,98,0,Casado,REGION PATAGONIA
124310,0001f61dd2845a7e653ebfdaf22dab3b373,2021-05-14,0,PC,3,202105,Mujer,20329.24,Sector_Empleado_Comercio,48.0,98,0,Casado,REGION PATAGONIA
124311,0006c15ca823454b68c189da1344d9d7317,2021-05-14,0,PC,3,202105,Hombre,13210.67,Sector_No_Operativo,66.0,105,0,Casado,REGION NORDESTE
124312,0000ab27a0ed815f947df8bcb834ff97975,2021-05-03,0,PC,3,202105,Hombre,121.73,Sector_Empleado_Comercio,74.0,140,0,Soltero,REGION NOROESTE


In [225]:
df.loc[:,'fecha_cupon_movimiento'] = pd.to_datetime(df['fecha_cupon_movimiento']).astype(int)/10**11

In [226]:
df.loc[:, 'moneda_movimiento'] = df['moneda_movimiento'].astype(str)

In [227]:
df.dtypes

dni_titular_movimiento               object
fecha_cupon_movimiento              float64
moneda_movimiento                    object
producto_naranja_movimiento          object
tipo_producto_tarjeta_movimiento     object
anio_mes_cupon                        int64
sexo_descripcion                     object
monto_ajustado                      float64
cargo_sector_desc_hist               object
edad_cliente                        float64
antig_cliente                         int64
comercio_cat                         object
estado_civil_cat                     object
region                               object
dtype: object

In [228]:
ordinal_ft = 'dni_titular_movimiento'
target = 'monto_ajustado'

# Features numericas
num_features = ['anio_mes_cupon', 'edad_cliente', 'antig_cliente', 'fecha_cupon_movimiento']

# Features categoricas de la transaccion
trans_ft = ['producto_naranja_movimiento', 'tipo_producto_tarjeta_movimiento', 'moneda_movimiento', 'comercio_cat']

# Features categoricas del cliente
client_ft = ['sexo_descripcion', 'cargo_sector_desc_hist', 'estado_civil_cat', 'region']

In [312]:
df = df.sort_values(by = ['anio_mes_cupon'], ascending = True)

cat_transformer = OneHotEncoder(handle_unknown='ignore')

encoder = ColumnTransformer(
    transformers=[
        ('dni', 'drop', [ordinal_ft]),
        ('num', 'passthrough', num_features),
        ('trans', cat_transformer, trans_ft),
        ('client', cat_transformer, client_ft),
        ('target', 'passthrough', [target])])
df_enc = encoder.fit_transform(df)
cols = encoder.get_feature_names()

df_enc.shape

(114212, 55)

In [313]:
df_enc = pd.DataFrame.sparse.from_spmatrix(df_enc, columns=[cols]).sparse.to_dense()

In [333]:
# Agregar feature ordinal al principio del dataframe
cols_o = np.hstack([[ordinal_ft],cols])
df_stack = np.hstack([df[[ordinal_ft]],df_enc])
df_encode = pd.DataFrame(df_stack , columns=cols_o)

# Funciones de agregacion para cada columna
aggr = {} 
aggr.update(dict.fromkeys([x for x in cols if 'client' in x], 'max'))
aggr.update(dict.fromkeys([x for x in cols if 'trans' in x], 'sum'))
aggr.update({target:'sum'})

In [334]:
# Agrupamiento por mes
group = ['dni_titular_movimiento', 'anio_mes_cupon']

df_mes = df_encode.groupby(group).agg(aggr).reset_index() # edad y antiguedad

In [None]:
# Agrupamiento por fecha
group = ['dni_titular_movimiento', 'fecha_cupon_movimiento']

df_fecha = df_encode.groupby(group).agg(aggr).reset_index() # edad y antiguedad

In [352]:
df_encode.values

array([['000821b0a8bdb0450e97298899adc5a3652', 202007.0, 36.0, ..., 0.0,
        0.0, 28.99],
       ['000bebf8a8a59d314b49994ebd84b107414', 202007.0, 58.0, ..., 0.0,
        0.0, 1176.63],
       ['000821b0a8bdb0450e97298899adc5a3652', 202007.0, 36.0, ..., 0.0,
        0.0, 87.91],
       ...,
       ['0001b49c252907291ca16195773ef104079', 202105.0, 49.0, ..., 0.0,
        0.0, 82.64],
       ['0007d3573a085abc49ede83321981a7d760', 202106.0, 50.0, ..., 0.0,
        0.0, -9.0],
       ['0007d3573a085abc49ede83321981a7d760', 202106.0, 50.0, ..., 0.0,
        0.0, -9.0]], dtype=object)

In [340]:
df_mes.describe()

Unnamed: 0,anio_mes_cupon,edad_cliente,antig_cliente,client__x0_Hombre,client__x0_Mujer,client__x0_Sin Datos,client__x1_Sector_Educativo,client__x1_Sector_Empleado_Comercio,client__x1_Sector_Financiero,client__x1_Sector_No_Operativo,...,trans__x3_1,trans__x3_2,trans__x3_3,trans__x3_4,trans__x3_5,trans__x3_6,trans__x3_7,trans__x3_8,trans__x3_9,monto_ajustado
count,7443.0,7443.0,7443.0,7443.0,7443.0,7443.0,7443.0,7443.0,7443.0,7443.0,...,7443.0,7443.0,7443.0,7443.0,7443.0,7443.0,7443.0,7443.0,7443.0,7443.0
mean,202052.953245,49.149536,107.739755,0.460836,0.533118,0.006046,0.058444,0.193202,0.011017,0.084106,...,3.602177,0.547226,0.324735,0.022975,1.703345,0.235658,0.046621,0.099019,2.288728,24197.194262
std,46.437423,14.954749,76.457279,0.498497,0.498935,0.077526,0.234597,0.394836,0.104389,0.277565,...,6.045837,1.874314,1.924541,0.336978,3.562076,1.68361,0.628194,0.92967,5.939887,31009.2238
min,202007.0,24.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-27864.76
25%,202010.0,36.0,45.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2029.27
50%,202012.0,47.0,93.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,12700.54
75%,202103.0,61.0,156.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,5.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,2.0,35199.635
max,202106.0,85.0,360.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,72.0,36.0,34.0,14.0,112.0,81.0,28.0,25.0,126.0,300740.42


In [341]:
df_fecha.describe()

Unnamed: 0,fecha_cupon_movimiento,edad_cliente,antig_cliente,client__x0_Hombre,client__x0_Mujer,client__x0_Sin Datos,client__x1_Sector_Educativo,client__x1_Sector_Empleado_Comercio,client__x1_Sector_Financiero,client__x1_Sector_No_Operativo,...,trans__x3_1,trans__x3_2,trans__x3_3,trans__x3_4,trans__x3_5,trans__x3_6,trans__x3_7,trans__x3_8,trans__x3_9,monto_ajustado
count,31981.0,31981.0,31981.0,31981.0,31981.0,31981.0,31981.0,31981.0,31981.0,31981.0,...,31981.0,31981.0,31981.0,31981.0,31981.0,31981.0,31981.0,31981.0,31981.0,31981.0
mean,16081040.0,49.380664,112.753604,0.466527,0.527688,0.005785,0.073606,0.207592,0.016791,0.079766,...,0.838342,0.127357,0.075576,0.005347,0.396423,0.054845,0.01085,0.023045,0.53266,5631.459832
std,69416.02,14.749501,78.827104,0.498886,0.499241,0.075838,0.261133,0.405589,0.12849,0.270935,...,1.886742,0.621691,0.512762,0.115156,1.036756,0.415228,0.176587,0.255766,1.164827,11325.153935
min,15937340.0,24.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-39645.54
25%,16021150.0,37.0,46.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,566.06
50%,16085090.0,47.0,98.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1670.84
75%,16142110.0,61.0,170.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5502.68
max,16219010.0,85.0,360.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,35.0,16.0,14.0,5.0,18.0,16.0,10.0,10.0,14.0,180632.22


In [386]:
dff = df_encode.copy()

In [387]:
X = dff.drop([ordinal_ft, target], axis=1)
y = dff[target]

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3, random_state=0, shuffle=False)

num_transformer = MinMaxScaler()
cat_transformer = 'passthrough'

num_features = [g for g in X.columns if 'client' not in g]
cat_features = [g for g in X.columns if 'client' in g]

preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_transformer, num_features),
        ('cat', cat_transformer, cat_features)])

xgb = XGBRegressor(random_state=0)
model = Pipeline([
  ('preprocessor', preprocessor),
  ('feature_selection', SelectFromModel(LinearSVR(random_state=0))),
  ('regressor', xgb)
])

model.fit(X_train, y_train)

# Predict
y_train_pred = model.predict(X_train)
y_valid_pred = model.predict(X_val)



In [388]:
# evaluate MSE:
train_error = mean_squared_error(y_train, y_train_pred)
val_error = mean_squared_error(y_val, y_valid_pred)
print(f'Train error {train_error.round(3)}, Valid error {val_error.round(3)}')  

Train error 12956791.647, Valid error 12446406.983


In [389]:
# evaluate RMSE:
train_error = np.sqrt(mean_squared_error(y_train, y_train_pred))
val_error = np.sqrt(mean_squared_error(y_val, y_valid_pred))
print(f'Train error RMSE {train_error.round(3)}, Valid error RMSE {val_error.round(3)}')  

Train error RMSE 3599.554, Valid error RMSE 3527.947


In [390]:
# evaluate MAE:
train_error = mean_absolute_error(y_train, y_train_pred)
val_error = mean_absolute_error(y_val, y_valid_pred)
print(f'Train error MAE {train_error.round(3)}, Valid error MAE {val_error.round(3)}')  

Train error MAE 1459.115, Valid error MAE 1485.406


In [196]:
#PIPELINE PARA REGRESION
# 1.codificar los features categoricos
# 2.Agrupar por fecha y sumar los features de la transaccion
# 3.Separar en train y validacion
# 4.Escalar los features numericos y los categoricos codificados de la transaccion

# el punto 2 sacrifica datos, usar si el modelo es pesado

In [153]:
import datetime
 
def week_number_of_month(date_value):
     return (date_value.isocalendar()[1] - date_value.replace(day=1).isocalendar()[1] + 1)

In [155]:
semana = df['fecha_cupon_movimiento'].apply(week_number_of_month)

In [139]:
df.groupby(pd.Grouper(key='fecha_cupon_movimiento',freq='W')).agg(sum)
#weeks = [g for n, g in df.set_index('timestamp').groupby(pd.TimeGrouper('W'))]

Unnamed: 0_level_0,moneda_movimiento,anio_mes_cupon,monto_ajustado,edad_cliente,antig_cliente
fecha_cupon_movimiento,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2020-07-05,2,1212042,-8993.26,370.0,570
2020-07-12,0,2424084,-6046.6,566.0,1222
2020-07-19,4,2020070,1061.66,452.0,1164
2020-07-26,339,88883104,777796.43,21453.0,47849
2020-08-02,1412,335130070,4219464.84,83478.0,193827
2020-08-09,963,290891520,4829752.95,73038.0,160495
2020-08-16,1213,333717216,3884872.48,81498.0,177631
2020-08-23,989,711876192,2561579.99,179404.0,428166
2020-08-30,1395,985597132,3947880.69,240565.0,522487
2020-09-06,1246,288872724,4092190.26,71855.0,155263
