In [67]:
from datetime import datetime
import pandas as pd
from sklearn.metrics import classification_report
from __future__ import division
from sklearn.cluster import KMeans
import xgboost as xgb
from sklearn.model_selection import KFold, train_test_split

In [2]:
import warnings
warnings.filterwarnings("ignore")

In [5]:
tx_data = pd.read_csv('data.csv')

In [6]:
tx_data['InvoiceDate'] = pd.to_datetime(tx_data['InvoiceDate'])

In [8]:
tx_uk = tx_data.query("Country=='United Kingdom'").reset_index(drop=True)

In [10]:
tx_6m = tx_uk[(tx_uk.InvoiceDate < datetime(2011,9,1)) & (tx_uk.InvoiceDate >= datetime(2011,3,1))].reset_index(drop=True)
tx_next = tx_uk[(tx_uk.InvoiceDate >= datetime(2011,9,1)) & (tx_uk.InvoiceDate < datetime(2011,12,1))].reset_index(drop=True)

In [11]:
tx_user = pd.DataFrame(tx_6m['CustomerID'].unique())
tx_user.columns = ['CustomerID']

# Adding label

In [12]:
tx_next_first_purchase = tx_next.groupby('CustomerID').InvoiceDate.min().reset_index()

In [13]:
tx_next_first_purchase.columns = ['CustomerID','MinPurchaseDate']

In [14]:
tx_last_purchase = tx_6m.groupby('CustomerID').InvoiceDate.max().reset_index()

In [15]:
tx_last_purchase.columns = ['CustomerID','MaxPurchaseDate']

In [16]:
tx_purchase_dates = pd.merge(tx_last_purchase,tx_next_first_purchase,on='CustomerID',how='left')

In [17]:
tx_purchase_dates['NextPurchaseDay'] = (tx_purchase_dates['MinPurchaseDate'] - tx_purchase_dates['MaxPurchaseDate']).dt.days



In [18]:
tx_user = pd.merge(tx_user, tx_purchase_dates[['CustomerID','NextPurchaseDay']],on='CustomerID',how='left')

In [19]:
tx_user = tx_user.fillna(999)

# Recency

In [20]:
tx_max_purchase = tx_6m.groupby('CustomerID').InvoiceDate.max().reset_index()

In [21]:
tx_max_purchase.columns = ['CustomerID','MaxPurchaseDate']

In [22]:
tx_max_purchase['Recency'] = (tx_max_purchase['MaxPurchaseDate'].max() - tx_max_purchase['MaxPurchaseDate']).dt.days

In [23]:
tx_user = pd.merge(tx_user, tx_max_purchase[['CustomerID','Recency']], on='CustomerID')

In [25]:
kmeans = KMeans(n_clusters=4)
kmeans.fit(tx_user[['Recency']])
tx_user['RecencyCluster'] = kmeans.predict(tx_user[['Recency']])

In [26]:
def order_cluster(cluster_field_name, target_field_name,df,ascending):
    new_cluster_field_name = 'new_' + cluster_field_name
    df_new = df.groupby(cluster_field_name)[target_field_name].mean().reset_index()
    df_new = df_new.sort_values(by=target_field_name,ascending=ascending).reset_index(drop=True)
    df_new['index'] = df_new.index
    df_final = pd.merge(df,df_new[[cluster_field_name,'index']], on=cluster_field_name)
    df_final = df_final.drop([cluster_field_name],axis=1)
    df_final = df_final.rename(columns={"index":cluster_field_name})
    return df_final


In [27]:
tx_user = order_cluster('RecencyCluster', 'Recency',tx_user,False)

# Frequency

In [29]:
tx_frequency = tx_6m.groupby('CustomerID').InvoiceDate.count().reset_index()

In [30]:
tx_frequency.columns = ['CustomerID','Frequency']

In [31]:
tx_user = pd.merge(tx_user, tx_frequency, on='CustomerID')

In [32]:
kmeans = KMeans(n_clusters=4)
kmeans.fit(tx_user[['Frequency']])
tx_user['FrequencyCluster'] = kmeans.predict(tx_user[['Frequency']])

In [33]:
tx_user = order_cluster('FrequencyCluster', 'Frequency',tx_user,True)

# Monetary Value

In [34]:
tx_6m['Revenue'] = tx_6m['UnitPrice'] * tx_6m['Quantity']

In [35]:
tx_revenue = tx_6m.groupby('CustomerID').Revenue.sum().reset_index()

In [36]:
tx_user = pd.merge(tx_user, tx_revenue, on='CustomerID')

In [37]:
kmeans = KMeans(n_clusters=4)
kmeans.fit(tx_user[['Revenue']])
tx_user['RevenueCluster'] = kmeans.predict(tx_user[['Revenue']])

In [38]:
tx_user = order_cluster('RevenueCluster', 'Revenue',tx_user,True)

# Overall Segmentation

In [40]:
tx_user['OverallScore'] = tx_user['RecencyCluster'] + tx_user['FrequencyCluster'] + tx_user['RevenueCluster']

In [41]:
tx_user['Segment'] = 'Low-Value'
tx_user.loc[tx_user['OverallScore']>2,'Segment'] = 'Mid-Value' 
tx_user.loc[tx_user['OverallScore']>4,'Segment'] = 'High-Value' 

# Adding new features

In [44]:
#create a dataframe with CustomerID and Invoice Date
tx_day_order = tx_6m[['CustomerID','InvoiceDate']]

In [45]:
#Convert Invoice Datetime to day
tx_day_order['InvoiceDay'] = tx_6m['InvoiceDate'].dt.date

In [46]:
tx_day_order = tx_day_order.sort_values(['CustomerID','InvoiceDate'])

In [47]:
#Drop duplicates
tx_day_order = tx_day_order.drop_duplicates(subset=['CustomerID','InvoiceDay'],keep='first')

In [48]:
#shifting last 3 purchase dates
tx_day_order['PrevInvoiceDate'] = tx_day_order.groupby('CustomerID')['InvoiceDay'].shift(1)
tx_day_order['T2InvoiceDate'] = tx_day_order.groupby('CustomerID')['InvoiceDay'].shift(2)
tx_day_order['T3InvoiceDate'] = tx_day_order.groupby('CustomerID')['InvoiceDay'].shift(3)

In [49]:
tx_day_order['InvoiceDay'] = pd.to_datetime(tx_day_order['InvoiceDay'], errors='coerce')
tx_day_order['PrevInvoiceDate'] = pd.to_datetime(tx_day_order['PrevInvoiceDate'], errors='coerce')
tx_day_order['T2InvoiceDate'] = pd.to_datetime(tx_day_order['T2InvoiceDate'], errors='coerce')
tx_day_order['T3InvoiceDate'] = pd.to_datetime(tx_day_order['T3InvoiceDate'], errors='coerce')
tx_day_order['DayDiff'] = (tx_day_order['InvoiceDay'] - tx_day_order['PrevInvoiceDate']).dt.days
tx_day_order['DayDiff2'] = (tx_day_order['InvoiceDay'] - tx_day_order['T2InvoiceDate']).dt.days
tx_day_order['DayDiff3'] = (tx_day_order['InvoiceDay'] - tx_day_order['T3InvoiceDate']).dt.days

In [50]:
tx_day_diff = tx_day_order.groupby('CustomerID').agg({'DayDiff': ['mean','std']}).reset_index()



In [51]:
tx_day_diff.columns = ['CustomerID', 'DayDiffMean','DayDiffStd']

In [52]:
tx_day_order_last = tx_day_order.drop_duplicates(subset=['CustomerID'],keep='last')

In [53]:
tx_day_order_last = tx_day_order_last.dropna()

In [54]:
tx_day_order_last = pd.merge(tx_day_order_last, tx_day_diff, on='CustomerID')

In [55]:
tx_user = pd.merge(tx_user, tx_day_order_last[['CustomerID','DayDiff','DayDiff2','DayDiff3','DayDiffMean','DayDiffStd']], on='CustomerID')


# Grouping the label

In [56]:
tx_class = tx_user.copy()

In [57]:
tx_class = pd.get_dummies(tx_class)

In [58]:
tx_class['NextPurchaseDayRange'] = 2
tx_class.loc[tx_class.NextPurchaseDay>20,'NextPurchaseDayRange'] = 1
tx_class.loc[tx_class.NextPurchaseDay>50,'NextPurchaseDayRange'] = 0


In [60]:
tx_class = tx_class.drop('NextPurchaseDay',axis=1)

In [61]:
X, y = tx_class.drop('NextPurchaseDayRange',axis=1), tx_class.NextPurchaseDayRange
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=44)

In [64]:
xgb_model = xgb.XGBClassifier().fit(X_train, y_train)

print('Accuracy of XGB classifier on training set: {:.2f}'
       .format(xgb_model.score(X_train, y_train)))
print('Accuracy of XGB classifier on test set: {:.2f}'
       .format(xgb_model.score(X_test[X_train.columns], y_test)))



Accuracy of XGB classifier on training set: 1.00
Accuracy of XGB classifier on test set: 0.58


In [65]:
y_pred = xgb_model.predict(X_test)

In [69]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.73      0.79      0.76        48
           1       0.51      0.38      0.43        48
           2       0.42      0.55      0.48        29

    accuracy                           0.58       125
   macro avg       0.56      0.57      0.56       125
weighted avg       0.58      0.58      0.57       125



In [71]:
from sklearn.model_selection import GridSearchCV

param_test1 = {
 'max_depth':range(3,10,2),
 'min_child_weight':range(1,6,2)
}
gsearch1 = GridSearchCV(estimator = xgb.XGBClassifier(), param_grid = param_test1, scoring='accuracy',n_jobs=-1, cv=2)
gsearch1.fit(X_train,y_train)
gsearch1.best_params_, gsearch1.best_score_

({'max_depth': 3, 'min_child_weight': 3}, 0.5983935742971888)

In [72]:
xgb_model = xgb.XGBClassifier(max_depth=3, min_child_weight=5).fit(X_train, y_train)

print('Accuracy of XGB classifier on training set: {:.2f}'
       .format(xgb_model.score(X_train, y_train)))
print('Accuracy of XGB classifier on test set: {:.2f}'
       .format(xgb_model.score(X_test[X_train.columns], y_test)))



Accuracy of XGB classifier on training set: 1.00
Accuracy of XGB classifier on test set: 0.50


In [73]:
y_pred = xgb_model.predict(X_test)

In [74]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.65      0.73      0.69        48
           1       0.40      0.33      0.36        48
           2       0.39      0.41      0.40        29

    accuracy                           0.50       125
   macro avg       0.48      0.49      0.48       125
weighted avg       0.49      0.50      0.50       125



In [78]:
len(y_train)

498

In [88]:
import session_info
session_info.show()