In [34]:
import pandas as pd
import io
df = pd.read_csv('/content/data.csv',encoding="ISO-8859-1",
                         dtype={'CustomerID': str,'InvoiceID': str})
print('Dataframe dimensions:', df.shape)

Dataframe dimensions: (541909, 8)


In [35]:

import pandas as pd
data = pd.read_csv('/content/data.csv', encoding='latin-1')
data = data.dropna(subset=['CustomerID'])
data = data[data['Quantity'] > 0]
print(f"Data after cleaning: {data.shape}")

Data after cleaning: (397924, 8)


In [36]:

data = data[data['Quantity'] > 0]
print("Data after removing negative quantities:", data.info())


<class 'pandas.core.frame.DataFrame'>
Index: 397924 entries, 0 to 541908
Data columns (total 8 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   InvoiceNo    397924 non-null  object 
 1   StockCode    397924 non-null  object 
 2   Description  397924 non-null  object 
 3   Quantity     397924 non-null  int64  
 4   InvoiceDate  397924 non-null  object 
 5   UnitPrice    397924 non-null  float64
 6   CustomerID   397924 non-null  float64
 7   Country      397924 non-null  object 
dtypes: float64(2), int64(1), object(5)
memory usage: 27.3+ MB
Data after removing negative quantities: None


In [38]:

data['InvoiceDate'] = pd.to_datetime(data['InvoiceDate'])

if 'BasketPrice' not in data.columns:
  data['BasketPrice'] = data['UnitPrice'] * data['Quantity']

reference_date = data['InvoiceDate'].max()

# Calculate RFM features
rfm = data.groupby('CustomerID').agg({
    'InvoiceDate': lambda x: (reference_date - x.max()).days,  # Recency
    'InvoiceNo': 'nunique',  # Frequency
    'BasketPrice': 'sum'  # Monetary Value
}).reset_index()


rfm.columns = ['CustomerID', 'Recency', 'Frequency', 'Monetary']
print(rfm.head())

   CustomerID  Recency  Frequency  Monetary
0     12346.0      325          1  77183.60
1     12347.0        1          7   4310.00
2     12348.0       74          4   1797.24
3     12349.0       18          1   1757.55
4     12350.0      309          1    334.40


In [39]:
from sklearn.model_selection import train_test_split

X = rfm[['Recency', 'Frequency', 'Monetary']]
y = (rfm['Monetary'] > rfm['Monetary'].median()).astype(int)

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [40]:
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


model_results = {}

# Support Vector Machine Classifier (SVC)
svc_model = SVC()
svc_model.fit(X_train, y_train)
svc_y_pred = svc_model.predict(X_test)
model_results['SVC'] = {
    'Accuracy': accuracy_score(y_test, svc_y_pred),
    'Precision': precision_score(y_test, svc_y_pred),
    'Recall': recall_score(y_test, svc_y_pred),
    'F1-Score': f1_score(y_test, svc_y_pred)
}

# Logistic Regression
lr_model = LogisticRegression()
lr_model.fit(X_train, y_train)
lr_y_pred = lr_model.predict(X_test)
model_results['Logistic Regression'] = {
    'Accuracy': accuracy_score(y_test, lr_y_pred),
    'Precision': precision_score(y_test, lr_y_pred),
    'Recall': recall_score(y_test, lr_y_pred),
    'F1-Score': f1_score(y_test, lr_y_pred)
}

# k-Nearest Neighbors
knn_model = KNeighborsClassifier()
knn_model.fit(X_train, y_train)
knn_y_pred = knn_model.predict(X_test)
model_results['k-NN'] = {
    'Accuracy': accuracy_score(y_test, knn_y_pred),
    'Precision': precision_score(y_test, knn_y_pred),
    'Recall': recall_score(y_test, knn_y_pred),
    'F1-Score': f1_score(y_test, knn_y_pred)
}

#  Decision Tree
dt_model = DecisionTreeClassifier()
dt_model.fit(X_train, y_train)
dt_y_pred = dt_model.predict(X_test)
model_results['Decision Tree'] = {
    'Accuracy': accuracy_score(y_test, dt_y_pred),
    'Precision': precision_score(y_test, dt_y_pred),
    'Recall': recall_score(y_test, dt_y_pred),
    'F1-Score': f1_score(y_test, dt_y_pred)
}

# Random Forest
rf_model = RandomForestClassifier()
rf_model.fit(X_train, y_train)
rf_y_pred = rf_model.predict(X_test)
model_results['Random Forest'] = {
    'Accuracy': accuracy_score(y_test, rf_y_pred),
    'Precision': precision_score(y_test, rf_y_pred),
    'Recall': recall_score(y_test, rf_y_pred),
    'F1-Score': f1_score(y_test, rf_y_pred)
}

# AdaBoost
ada_model = AdaBoostClassifier()
ada_model.fit(X_train, y_train)
ada_y_pred = ada_model.predict(X_test)
model_results['AdaBoost'] = {
    'Accuracy': accuracy_score(y_test, ada_y_pred),
    'Precision': precision_score(y_test, ada_y_pred),
    'Recall': recall_score(y_test, ada_y_pred),
    'F1-Score': f1_score(y_test, ada_y_pred)
}

# Gradient Boosting Classifier
gb_model = GradientBoostingClassifier()
gb_model.fit(X_train, y_train)
gb_y_pred = gb_model.predict(X_test)
model_results['Gradient Boosting'] = {
    'Accuracy': accuracy_score(y_test, gb_y_pred),
    'Precision': precision_score(y_test, gb_y_pred),
    'Recall': recall_score(y_test, gb_y_pred),
    'F1-Score': f1_score(y_test, gb_y_pred)
}

# Compare model results
results_df = pd.DataFrame(model_results).transpose()
results_df



Unnamed: 0,Accuracy,Precision,Recall,F1-Score
SVC,0.971198,1.0,0.945295,0.971879
Logistic Regression,1.0,1.0,1.0,1.0
k-NN,0.997696,0.997812,0.997812,0.997812
Decision Tree,0.998848,1.0,0.997812,0.998905
Random Forest,0.998848,1.0,0.997812,0.998905
AdaBoost,0.998848,1.0,0.997812,0.998905
Gradient Boosting,0.998848,1.0,0.997812,0.998905


In [33]:
results_df

Unnamed: 0,Accuracy,Precision,Recall,F1-Score
SVC,0.971198,1.0,0.945295,0.971879
Logistic Regression,1.0,1.0,1.0,1.0
k-NN,0.997696,0.997812,0.997812,0.997812
Decision Tree,0.998848,1.0,0.997812,0.998905
Random Forest,0.998848,1.0,0.997812,0.998905
AdaBoost,0.998848,1.0,0.997812,0.998905
Gradient Boosting,0.998848,1.0,0.997812,0.998905
