# Лабораторная 1

## Загружаем данные

In [8]:
import pandas as pd
import numpy as np

In [9]:
df_resto = pd.read_csv('data/casc-resto.csv', sep=';', decimal=',')

In [10]:
df_dem = pd.read_csv('data/CASC_Constant.csv')

In [11]:
df_resto.describe()

Unnamed: 0,CustomerID,Restaurant,Quantity,SummBasic,SummAfterPointsUsage
count,882222.0,882222.0,882222.0,882222.0,882222.0
mean,2809199.0,468.268804,1.1549,287.725795,260.255589
std,61784.2,279.170967,0.650287,264.555291,251.289517
min,2728046.0,40.0,0.0,0.5,-2593.0
25%,2754886.0,333.0,1.0,120.0,110.0
50%,2785104.0,434.0,1.0,225.0,199.0
75%,2878900.0,712.0,1.0,379.0,348.94
max,2913132.0,980.0,107.0,29450.0,29450.0


In [12]:
df_dem.describe()

Unnamed: 0,CustomerId,Age
count,10000.0,9953.0
mean,2812482.0,32.894203
std,62276.32,11.954687
min,2728046.0,16.0
25%,2756356.0,26.0
50%,2794998.0,31.0
75%,2880618.0,37.0
max,2913132.0,247.0


## Чистим данные

In [16]:
df_resto = df_resto[df_resto['SummAfterPointsUsage'] > 0]

In [17]:
df_dem['Age'] = list(map(lambda x : df_dem['Age'].mean() if x > 100 else x, df_dem['Age'])) 

## Обрабатываем данные

### Формируем целевую переменную

In [18]:
df_date = df_resto.groupby('CustomerID').apply(lambda x : pd.Series(dict(Y = x.RKDate.max() >= '2017-07-01')))

### Фильтруем записи

In [19]:
df_agg = df_resto[df_resto['RKDate'] < '2017-07-01']

### Получаем агрегированные данные

In [20]:
from datetime import datetime

def get_agg(tmp):
    date = datetime.strptime('2017-07-01', '%Y-%m-%d')
    tmp['Date'] = pd.to_datetime(tmp['RKDate'], format='%Y-%m-%d')
    recency = (date - tmp['Date'].max()).days
    tmp['Day'] = tmp['Date'].dt.day
    tmp['Month'] = pd.to_datetime(tmp['Date']).dt.to_period('M')
    frequency = tmp['Day'].count() / tmp['Month'].nunique()
    monetary_value = tmp['SummAfterPointsUsage'].mean()
    # Выделяем наиболее часто посещаемый день недели.
    tmp['DayOfWeek'] = tmp['Date'].dt.dayofweek
    dayofweek = tmp['DayOfWeek'].value_counts().idxmax()
    # Кодируем регионы.
    tmp['Region'] = list(map(lambda x : 0 if x == 'Москва и Московская область' else 1, tmp['RegionName']))
    region = tmp['Region'].max()
    return pd.Series(dict(Recency = recency, Frequency = frequency, Monetary_Value = monetary_value, DayOfWeek = dayofweek, Region = region))

In [21]:
df_agg = df_agg.groupby('CustomerID').apply(get_agg)

In [22]:
df_agg.describe()

Unnamed: 0,Recency,Frequency,Monetary_Value,DayOfWeek,Region
count,9938.0,9938.0,9938.0,9938.0,9938.0
mean,259.387201,8.389211,292.692648,3.63876,1.0
std,254.477647,5.79738,100.690789,1.904195,0.0
min,1.0,1.0,69.794811,0.0,1.0
25%,42.0,5.285714,231.172078,2.0,1.0
50%,158.0,7.0,289.312627,4.0,1.0
75%,438.0,9.577068,347.028846,5.0,1.0
max,912.0,84.962963,1429.63121,6.0,1.0


## Объединяем данные

In [23]:
df_all = pd.merge(left=df_date, right=df_agg, left_on='CustomerID', right_on='CustomerID')

In [24]:
df_all = pd.merge(left=df_all, right=df_dem, left_on='CustomerID', right_on='CustomerId')
df_all['Sex'] = list(map(lambda x : 0 if x == 'Male' else (1 if x == 'Female' else 2), df_all['Sex']))
df_all['Age'] = df_all['Age'].fillna(df_all['Age'].mean())

In [25]:
df_all.describe()

Unnamed: 0,Recency,Frequency,Monetary_Value,DayOfWeek,Region,CustomerId,Age,Sex
count,9938.0,9938.0,9938.0,9938.0,9938.0,9938.0,9938.0,9938.0
mean,259.387201,8.389211,292.692648,3.63876,1.0,2812421.0,32.091205,0.739787
std,254.477647,5.79738,100.690789,1.904195,0.0,62315.1,8.050908,0.647107
min,1.0,1.0,69.794811,0.0,1.0,2728046.0,16.0,0.0
25%,42.0,5.285714,231.172078,2.0,1.0,2756314.0,26.0,0.0
50%,158.0,7.0,289.312627,4.0,1.0,2794642.0,31.0,1.0
75%,438.0,9.577068,347.028846,5.0,1.0,2880642.0,36.0,1.0
max,912.0,84.962963,1429.63121,6.0,1.0,2913132.0,96.0,2.0


## Обучаем модель

### Выделяем нужные признаки

In [26]:
X = df_all[['Recency', 'Frequency', 'Monetary_Value', 'DayOfWeek', 'Region', 'Age', 'Sex']]
y = df_all['Y']

### Обучаем логистическую модель

In [27]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
reg = LogisticRegression()
reg.fit(X_train, y_train)
y_pred = reg.predict(X_test)



### Оцениваем полученную модель

In [28]:
from sklearn.metrics import recall_score, precision_score

print('Score: ', reg.score(X_test, y_test))
print('Precision: ', precision_score(y_pred, y_test))
print('Recall: ', recall_score(y_pred, y_test))

Score:  0.7298792756539235
Precision:  0.829004329004329
Recall:  0.6689956331877729
