In [118]:
import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

In [2]:
import time
import datetime
from sklearn import preprocessing
from sklearn import cluster, metrics

In [17]:
rscaler = preprocessing.RobustScaler()

In [4]:
df = pd.read_csv('df.csv')
df.shape

(5929, 7)

In [47]:
def feat_eng(x):
    #build the model x
    y = pd.DataFrame(index = x['customer_unique_id'].unique())

    #feature0  = number of order
    y['n_order'] = x.groupby('customer_unique_id').order_id.count()

    #feature1  = total value of orders
    #y['payment_total'] = x.groupby('customer_unique_id').order_total.agg('sum')

    #feature2-3  = mean and std of orders
    y['payment_mean'] = x.groupby('customer_unique_id').order_total.agg('mean')
    y['payment_std'] = x.groupby('customer_unique_id').order_total.agg('std')

    #feature frequency4-10 of order on the distrib
    x['order_dis'] = pd.cut(x['order_total'],30, duplicates = 'drop')
    temp = pd.crosstab(x['customer_unique_id'], x['order_dis'], values = x['order_dis'], aggfunc= 'count', margins = True)
    for i in range(temp.shape[1]):
        temp.iloc[:,i] = round((temp.iloc[:,i]/temp.iloc[:,-1])*100,2)

    temp = temp.drop(labels = 'All')
    temp = temp.drop(labels = 'All', axis = 1)
    y = y.join(temp)
    #feature frequency11-135 of category
    temp = pd.crosstab(x['customer_unique_id'], x['product_category_name'], values = x['product_category_name'], aggfunc= 'count', margins = True).fillna(0)
    for i in range(temp.shape[1]):
        temp.iloc[:,i] = round((temp.iloc[:,i]/temp.iloc[:,-1])*100,2)
    temp = temp.drop(labels = 'All')
    temp = temp.drop(labels = 'All', axis = 1)
    y = y.join(temp)

    #feature136-137 review score mean and std
    y['rev_mean'] = x.groupby('customer_unique_id').review_score.agg('mean')
    y['rev_std'] = x.groupby('customer_unique_id').review_score.agg('std')
    #feature138  = days since last order
    y['days_since_last_order'] = x.groupby('customer_unique_id').days_since_order.agg('min')
    #feature139 = "age" of the custumer
    y['age'] = x.groupby('customer_unique_id').days_since_order.agg('max')
    y = y.loc[y['n_order'] > 1]
    return y

In [119]:
def simulate(x):
    start = x['days_since_order'].max()
    end = x['days_since_order'].min()
    year = start - 365
    t0 = x.loc[x['days_since_order'] < year ]
    t0 = feat_eng(t0)
    t0s = rscaler.fit_transform(t0.values)
    clus = cluster.KMeans(n_clusters = 30,random_state=38)
    clus.fit(t0s)
    t0['true_labels'] = clus.labels_
    ari_l = []
    year_n = year
    for i in range(int((year-end)/7)):
        tn = x.loc[x['days_since_order'] < year_n]
        tn = feat_eng(tn)
        tns = rscaler.fit_transform(tn.values)
        clus_n = cluster.KMeans(n_clusters = 30,random_state=38)
        clus_n.fit(tns)
        tn['pred_labels'] = clus_n.labels_
        temp = pd.DataFrame(t0['true_labels']).join(tn['pred_labels'])
        ari_l.append(metrics.adjusted_rand_score(np.array(temp['true_labels']), np.array(temp['pred_labels'])))
        year_n = year_n+7
    return ari_l

In [120]:
test = simulate(df)

In [121]:
x = df
start = x['days_since_order'].max()
end = x['days_since_order'].min()
year = start - 365
t0 = x.loc[x['days_since_order'] < year ]
t0 = feat_eng(t0)
t0_s = rscaler.fit_transform(t0.values)
clus = cluster.KMeans(n_clusters = 30,random_state=38)
clus.fit(t0_s)
true_labels = clus.labels_
t0['true_labels'] = true_labels
ari_l = []
year_n = year
t1 = x.loc[x['days_since_order'] < (year_n+1)]
t1 = feat_eng(t1)
t1_s = rscaler.fit_transform(t1.values)
#clus_n = cluster.KMeans(n_clusters = 30,random_state=38)
clus.fit(t1_s)
pred_labels = clus.labels_
t1['pred_labels'] = pred_labels
temp = pd.DataFrame(t0['true_labels']).join(t1['pred_labels'])
ari_l.append(metrics.adjusted_rand_score(np.array(temp['true_labels']), np.array(temp['pred_labels'])))


In [124]:
t1.shape

(1612, 144)

In [103]:
prout = pd.DataFrame(t0['labels']).join(t1['labels_p'])
np.array(prout['labels'])

array([ 2, 29,  7, ..., 16, 23, 20])

In [127]:
def expl(md_df,name):
    n_order_m = md_df.describe().iloc[1,0]
    payment_m = md_df.describe().iloc[1,1]
    rev_m_m = md_df.describe().iloc[1,-6]
    age_m = md_df.describe().iloc[1,-3]
    #build the explanatory df
    expl_df = pd.DataFrame()
    #first the frequence of the clusters
    expl_df['freq'] = md_df.groupby(name).n_order.count()
    expl_df['freq'] = round((expl_df['freq']/md_df.shape[0])*100,2)
    #the mean ,ratio and max of the number of order
    expl_df['n_order_mean'] = md_df.groupby(name).n_order.agg('mean')
    expl_df['n_order_ratio'] = round(expl_df['n_order_mean']/n_order_m,2)
    expl_df['n_order_max'] = md_df.groupby(name).n_order.agg('max')
    #the mean and the ratio of the mean of payment
    expl_df['payment_m_m'] = md_df.groupby(name).payment_mean.agg('mean')
    expl_df['payment_m_r'] = round(expl_df['payment_m_m']/payment_m,2)
    #the mean and ratio of the mean of review score
    expl_df['rev_m_m'] = md_df.groupby(name).rev_mean.agg('mean')
    expl_df['rev_ratio'] =round(expl_df['rev_m_m']/rev_m_m,2)
    #the mean and ratio of the age of each cluster
    expl_df['age_m_m'] = md_df.groupby(name).age.agg('mean')
    expl_df['age_ratio'] =round(expl_df['age_m_m']/age_m,2)
    #the top2 payment bucket for each cluster
    temp = pd.DataFrame(md_df.groupby(name).mean())
    temp = temp.iloc[:,3:18]
    temp2 = temp.T

    top1=[]
    top2=[]
    for i in range(temp2.shape[1]):    
        val = list(round(temp2.iloc[:,i].nlargest(2),2))
        cat = list(temp2.iloc[:,i].nlargest(2).index)
        cat_top2 = dict(zip(cat, val))
        cat_list = []
        for y in cat_top2:
            cat_list.append(str(y)+':'+str(cat_top2[y]))
        top1.append(cat_list[0])
        top2.append(cat_list[1])
   
       
    expl_df = expl_df.join(pd.DataFrame({'top1_pay' : top1, 'top2_pay' : top2}))
    #the top2 categories for each cluster
    temp = pd.DataFrame(md_df.groupby(name).mean())
    temp = temp.iloc[:,18:-5]
    temp2 = temp.T

    top1=[]
    top2=[]
    for i in range(temp2.shape[1]):    
        val = list(round(temp2.iloc[:,i].nlargest(2),2))
        cat = list(temp2.iloc[:,i].nlargest(2).index)
        cat_top2 = dict(zip(cat, val))
        cat_list = []
        for y in cat_top2:
            cat_list.append(str(y)+':'+str(cat_top2[y]))
        top1.append(cat_list[0])
        top2.append(cat_list[1])
   
       
    expl_df = expl_df.join(pd.DataFrame({'top1_cat' : top1, 'top2_cat' : top2}))

    return expl_df

In [128]:
expl(t0,'true_labels')

NameError: name 'n_order_m' is not defined

In [41]:

df['order_dis'] = pd.cut(x['order_total'],30, duplicates = 'drop')

In [40]:
start = df['days_since_order'].max()
end = df['days_since_order'].min()
print(start, end, int(((start-365)-end)/7))


2254 1554 47


In [6]:
test = df.loc[df['days_since_order'] < (start-365) ]
test.shape

(3896, 7)

In [12]:
md_df = feat_eng(test)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x['order_dis'] = pd.cut(x['order_total'],30, duplicates = 'drop')


In [13]:
md_df.shape


(1605, 139)

In [14]:
md_df['n_order'].describe()

count    1605.000000
mean        2.088474
std         0.440639
min         2.000000
25%         2.000000
50%         2.000000
75%         2.000000
max        10.000000
Name: n_order, dtype: float64