In [1]:
import re

import pandas as pd

In [2]:
# Load the train and test CSVs (paths are relative to the notebook's working directory).
train, test = map(
    pd.read_csv,
    ('data/hackaton2023_train.csv', 'data/hackaton2023_test.csv'),
)

In [3]:
# Build a restaurant key for every row by joining the format name and the
# area value with a single space. The formatter is applied row by row
# (axis=1), identically for both frames.
restoran_key = '{0[format_name]} {0[ownareaall_sqm]}'.format
for frame in (train, test):
    frame['id_restoran'] = frame.agg(restoran_key, axis=1)

In [6]:
# Columns kept from both splits for the receipt-level table.
receipt_cols = ['customer_id', 'startdatetime', 'dish_name', 'revenue', 'id_restoran']
tr_df = train[receipt_cols]
ts_df = test[receipt_cols]

# Stack train and test rows, parse the timestamp column, and derive a
# calendar-date column from it (each later keyword sees the earlier result).
data_cheki = pd.concat([tr_df, ts_df]).assign(
    startdatetime=lambda df: pd.to_datetime(df['startdatetime']),
    startdate=lambda df: df['startdatetime'].dt.date,
)

In [7]:
# One row per (date, restaurant, customer) with the median, sum and count of
# `revenue` over the rows in that group.
groupby_data_cheki = (
    data_cheki
    .groupby(['startdate', 'id_restoran', 'customer_id'])['revenue']
    .agg(['median', 'sum', 'count'])
    .reset_index()
    .rename(columns={'startdate': 'date'})
)

In [8]:
# Quartile cut points (25%, 50%, 75%) for each aggregated revenue metric.
metric_cols = ['median', 'sum', 'count']
quantiles = groupby_data_cheki.loc[:, metric_cols].quantile([0.25, 0.5, 0.75])
quantiles.head()

Unnamed: 0,median,sum,count
0.25,59.99,250.99,2.0
0.5,90.485,405.99,4.0
0.75,129.99,689.95,7.0


In [9]:
# Convert the quantile table to a nested dict ({column: {quantile: value}})
# so the scoring functions can do plain scalar lookups.
# `quantiles` is rebound from a DataFrame to a dict here; the isinstance guard
# keeps this cell re-runnable, since a second run would otherwise fail with
# AttributeError ('dict' object has no attribute 'to_dict').
if isinstance(quantiles, pd.DataFrame):
    quantiles = quantiles.to_dict()

##  RFM Segmentation ----
# Score a copy so the aggregated frame itself is left unmodified.
RFM_Segment = groupby_data_cheki.copy()


def R_Class(x, p, d):
    """Return a descending quartile score (4..1) for ``x``.

    Args:
        x: Value to score.
        p: Key of the metric in ``d`` (e.g. a column name).
        d: Nested mapping ``{metric: {0.25: q1, 0.50: q2, 0.75: q3}}``.

    Returns:
        4 if ``x <= q1``, 3 if ``x <= q2``, 2 if ``x <= q3``, otherwise 1.
        A value that compares false against every cut point (e.g. NaN)
        therefore scores 1.
    """
    cutoffs = d[p]
    # Walk the cut points from lowest to highest; the first one that bounds x wins.
    for score, level in ((4, 0.25), (3, 0.50), (2, 0.75)):
        if x <= cutoffs[level]:
            return score
    return 1


def FM_Class(x, p, d):
    """Return an ascending quartile score (1..4) for ``x``.

    Args:
        x: Value to score.
        p: Key of the metric in ``d`` (e.g. a column name).
        d: Nested mapping ``{metric: {0.25: q1, 0.50: q2, 0.75: q3}}``.

    Returns:
        1 if ``x <= q1``, 2 if ``x <= q2``, 3 if ``x <= q3``, otherwise 4.
        A value that compares false against every cut point (e.g. NaN)
        therefore scores 4.
    """
    thresholds = d[p]
    # Score equals the 1-based position of the first cut point that bounds x.
    for score, level in enumerate((0.25, 0.50, 0.75), start=1):
        if x <= thresholds[level]:
            return score
    return 4


# Score each metric against its own quartile cut points.
# NOTE(review): R_Quartile is computed from the median revenue (not a recency
# measure), and F_Quartile / M_Quartile are fed 'sum' / 'count' respectively,
# which is the reverse of the usual Frequency=count / Monetary=sum convention.
# Behaviour is left as-is here; confirm this mapping is intended.
quartile_specs = {
    'R_Quartile': ('median', R_Class),
    'F_Quartile': ('sum', FM_Class),
    'M_Quartile': ('count', FM_Class),
}
for score_col, (metric, scorer) in quartile_specs.items():
    RFM_Segment[score_col] = RFM_Segment[metric].apply(scorer, args=(metric, quantiles))

# Concatenate the three scores into a three-character class label, e.g. '444'.
RFM_Segment['RFMClass'] = (
    RFM_Segment['R_Quartile'].map(str)
    + RFM_Segment['F_Quartile'].map(str)
    + RFM_Segment['M_Quartile'].map(str)
)

In [10]:
# Spot-check the scored rows for a single customer.
RFM_Segment.loc[RFM_Segment['customer_id'].eq(22449558)]

Unnamed: 0,date,id_restoran,customer_id,median,sum,count,R_Quartile,F_Quartile,M_Quartile,RFMClass
2812120,2023-07-02,Фудкорт без туалета 112.0,22449558,19.99,7729.6,190,4,4,4,444
2850376,2023-07-10,Отдельно стоящий без внешней зоны 300.0,22449558,225.0,1249.96,4,1,4,2,142
2880766,2023-07-16,Отдельно стоящий без внешней зоны 300.0,22449558,239.98,239.98,1,1,1,1,111
2935600,2023-07-27,Отдельно стоящий без внешней зоны 300.0,22449558,225.0,889.96,4,1,4,2,142


In [11]:
# Persist the scored table next to the notebook, without the positional index.
RFM_Segment.to_csv('RFM_Segment.csv', index=False)