In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score

In [6]:
df = pd.read_csv('C:/Users/muzna/anaconda_projects/ecommerce_cleaned.csv', parse_dates=['InvoiceDate'])
snapshot = df['InvoiceDate'].max() + pd.Timedelta(days=1)
last_purchase = df.groupby('CustomerID')['InvoiceDate'].max().reset_index()
last_purchase['Churn'] = ((snapshot - last_purchase['InvoiceDate']).dt.days > 180).astype(int)

df.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,TotalPrice
0,INV100000,NOTE350,Notebook A4,1,2020-02-21 10:00:48,17.6,C2309,USA,17.6
1,INV100001,NOTE189,Notebook A4,2,2023-10-17 19:51:22,26.51,C2508,Spain,53.02
2,INV100002,PEN338,"Pen, Blue Ink",1,2020-07-10 07:57:37,13.11,C1065,Germany,13.11
3,INV100003,PHONEC818,Phone Charger,1,2021-02-11 23:39:41,20.79,C1054,Germany,20.79
4,INV100004,HEADPH928,Headphones,1,2023-04-21 10:07:43,17.36,C1451,Germany,17.36


In [7]:
# Build RFM features
rfm = df.groupby('CustomerID').agg({
    'InvoiceDate': lambda x: (snapshot - x.max()).days,
    'InvoiceNo': 'nunique',
    'TotalPrice': 'sum'
}).reset_index().rename(columns={'InvoiceDate':'Recency','InvoiceNo':'Frequency','TotalPrice':'Monetary'})


In [None]:
data = last_purchase.merge(rfm, on='CustomerID', how='left').fillna(0)
data = data.rename(columns={'InvoiceDate':'LastPurchase'})

In [11]:
# Simple model
X = data[['Recency','Frequency','Monetary']]
y = data['Churn']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)
pred = clf.predict(X_test)
print(classification_report(y_test, pred))
print("ROC AUC:", roc_auc_score(y_test, clf.predict_proba(X_test)[:,1]))


              precision    recall  f1-score   support

           0       1.00      1.00      1.00       280
           1       1.00      1.00      1.00       120

    accuracy                           1.00       400
   macro avg       1.00      1.00      1.00       400
weighted avg       1.00      1.00      1.00       400

ROC AUC: 1.0
