In [44]:
import pandas as pd
import numpy as np

In [60]:
df_resto = pd.read_csv('data/casc-resto.csv', sep=';', decimal=',')

In [46]:
df_dem = pd.read_csv('data/CASC_Constant.csv')

In [65]:
df_date = df_resto.groupby('CustomerID').apply(lambda x : pd.Series(dict(Y = x.RKDate.max() >= '2017-07-01')))

In [66]:
df_date

Unnamed: 0_level_0,Y
CustomerID,Unnamed: 1_level_1
2728046,False
2728088,False
2728089,True
2728095,False
2728107,True
...,...
2913103,False
2913106,False
2913114,False
2913123,True


In [82]:
df_agg = df_resto[df_resto['RKDate'] < '2017-07-01']

In [87]:
df_agg = df_agg.groupby('CustomerID').apply(get_agg)

In [86]:
from datetime import datetime

def get_agg(tmp):
    date = datetime.strptime('2017-07-01', '%Y-%m-%d')
    tmp['Date'] = pd.to_datetime(tmp['RKDate'], format='%Y-%m-%d')
    recency = (date - tmp['Date'].max()).days
    tmp['Day'] = tmp['Date'].dt.day
    tmp['Month'] = pd.to_datetime(tmp['Date']).dt.to_period('M')
    frequency = tmp['Day'].count() / tmp['Month'].nunique()
    monetary_value = tmp['SummAfterPointsUsage'].mean()
    tmp['DayOfWeek'] = tmp['Date'].dt.dayofweek
    dayofweek = tmp['DayOfWeek'].value_counts().idxmax()
    regions = tmp.groupby('RegionName')['RegionName'].count()
    high_region = regions[regions >= 10000]
    low_region = regions[regions < 1000]
    tmp['Region'] = list(map(lambda x : 0 if x in high_region else (1 if x in low_region else 2), tmp['RegionName']))
    region = tmp['Region'].max()
    return pd.Series(dict(Recency = recency, Frequency = frequency, Monetary_Value = monetary_value, DayOfWeek = dayofweek, Region = region))

In [77]:
df_agg.head()

Unnamed: 0_level_0,Recency,Frequency,Monetary_Value
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2728046,160.0,8.142857,251.789474
2728088,720.0,17.0,279.647059
2728089,15.0,6.6,398.734848
2728095,177.0,9.0,262.737374
2728107,115.0,5.25,393.714286


In [88]:
df_all = pd.merge(left=df_date, right=df_agg, left_on='CustomerID', right_on='CustomerID')

In [92]:
df_all = pd.merge(left=df_all, right=df_dem, left_on='CustomerID', right_on='CustomerId')
df_all['Sex'] = 0 if 'Male' else 1
df_all['Sex'] = df_all['Sex'].fillna(2)
df_all['Age'] = df_all['Age'].fillna(df_all['Age'].mean())

In [93]:
df_all.head()

Unnamed: 0,Y,Recency,Frequency,Monetary_Value,DayOfWeek,Region,CustomerId,ActivationDate,Age,Sex,SubscribedEmail,SubscribedPush
0,False,160.0,8.142857,251.789474,3.0,1.0,2728046,2015-01-01,24.0,0,False,True
1,False,720.0,17.0,279.647059,3.0,1.0,2728088,2015-01-01,46.0,0,True,True
2,True,15.0,6.6,398.734848,3.0,1.0,2728089,2015-01-01,27.0,0,True,True
3,False,177.0,9.0,262.737374,3.0,1.0,2728095,2015-01-01,54.0,0,True,True
4,True,115.0,5.25,393.714286,3.0,1.0,2728107,2015-01-01,48.0,0,False,False


In [95]:
X = df_all[['Recency', 'Frequency', 'Monetary_Value', 'DayOfWeek', 'Region', 'Age', 'Sex']]
y = df_all['Y']

In [96]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
reg = LogisticRegression()
reg.fit(X_train, y_train)
y_pred = reg.predict(X_test)



In [97]:
from sklearn.metrics import recall_score, precision_score

print('Score: ', reg.score(X_test, y_test))
print('Precision: ', precision_score(y_pred, y_test))
print('Recall: ', recall_score(y_pred, y_test))

Score:  0.7379275653923542
Precision:  0.8099352051835853
Recall:  0.684931506849315
