In [2]:
import pandas as pd
from datetime import date, datetime

import matplotlib.pyplot as plt
import numpy as np
import statsmodels.api as sm

import warnings
warnings.filterwarnings('ignore')

pd.set_option('display.max_rows',50)
pd.set_option('display.max_columns',50)
pd.set_option('display.width',100)

In [41]:
df = pd.read_csv(r'D:\Jes\Education\2024 DSI\Week 8 - Team Project\Team Git\team_project\data\raw\new_retail_data.csv')
# Select columns I need
df = df[['Transaction_ID','Customer_ID','City', 'State','Country', 'Age', 'Gender', 'Income',
       'Customer_Segment', 'Date', 'Year', 'Month', 'Time', 'Total_Purchases',
       'Amount', 'Total_Amount', 'Product_Category', 'Product_Brand',
       'Product_Type', 'Feedback', 'Shipping_Method', 'Payment_Method',
       'Order_Status', 'Ratings', 'products']]

# Drop null rows
df = df.dropna()

# Data type conversions
df['Date'] = pd.to_datetime(df['Date'], format='%m/%d/%Y')
# Create a new column to store yearmonth as integer
df['yearmonth'] = df['Date'].dt.strftime('%Y%m').astype(int)

df.head(3)

Unnamed: 0,Transaction_ID,Customer_ID,City,State,Country,Age,Gender,Income,Customer_Segment,Date,Year,Month,Time,Total_Purchases,Amount,Total_Amount,Product_Category,Product_Brand,Product_Type,Feedback,Shipping_Method,Payment_Method,Order_Status,Ratings,products,yearmonth
0,8691788.0,37249.0,Dortmund,Berlin,Germany,21.0,Male,Low,Regular,2023-09-18,2023.0,September,22:03:55,3.0,108.028757,324.08627,Clothing,Nike,Shorts,Excellent,Same-Day,Debit Card,Shipped,5.0,Cycling shorts,202309
1,2174773.0,69749.0,Nottingham,England,UK,19.0,Female,Low,Premium,2023-12-31,2023.0,December,8:42:04,2.0,403.353907,806.707815,Electronics,Samsung,Tablet,Excellent,Standard,Credit Card,Processing,4.0,Lenovo Tab,202312
2,6679610.0,30192.0,Geelong,New South Wales,Australia,48.0,Male,Low,Regular,2023-04-26,2023.0,April,4:06:29,3.0,354.4776,1063.432799,Books,Penguin Books,Children's,Average,Same-Day,Credit Card,Processing,2.0,Sports equipment,202304


K Means Clustering

In [8]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder

In [44]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 295617 entries, 0 to 302009
Data columns (total 26 columns):
 #   Column            Non-Null Count   Dtype         
---  ------            --------------   -----         
 0   Transaction_ID    295617 non-null  float64       
 1   Customer_ID       295617 non-null  float64       
 2   City              295617 non-null  int64         
 3   State             295617 non-null  object        
 4   Country           295617 non-null  object        
 5   Age               295617 non-null  float64       
 6   Gender            295617 non-null  int64         
 7   Income            295617 non-null  int64         
 8   Customer_Segment  295617 non-null  object        
 9   Date              295617 non-null  datetime64[ns]
 10  Year              295617 non-null  float64       
 11  Month             295617 non-null  object        
 12  Time              295617 non-null  object        
 13  Total_Purchases   295617 non-null  float64       
 14  Amount   

In [38]:
df['Product_Type'].unique()

array(['Shorts', 'Tablet', "Children's", 'Tools', 'Chocolate',
       'Television', 'Shirt', 'Decorations', 'Non-Fiction', 'Water',
       'Snacks', 'T-shirt', 'Literature', 'Juice', 'Furniture', 'Coffee',
       'Bathroom', 'Kitchen', 'Smartphone', 'Shoes', 'Thriller',
       'Soft Drink', 'Laptop', 'Dress', 'Headphones', 'Lighting',
       'Bedding', 'Jacket', 'Fiction', 'Jeans', 'Fridge',
       'Mitsubishi 1.5 Ton 3 Star Split AC', 'BlueStar AC'], dtype=object)

In [46]:
# Encode categorical variables
label_encoders = {}
for column in ['Gender', 'Income', 'Product_Category', 'Product_Brand','City','Product_Type']:
    le = LabelEncoder()
    df[column] = le.fit_transform(df[column])
    label_encoders[column] = le

# Standardize numerical features
scaler = StandardScaler()
df[['Age', 'Total_Purchases', 'Total_Amount','Ratings']] = scaler.fit_transform(df[['Age', 'Total_Purchases', 'Total_Amount','Ratings']])


In [48]:
from sklearn.cluster import KMeans

# Select features for clustering
features = ['Age', 'Gender', 'Income', 'Total_Purchases', 'Total_Amount', 'Product_Category', 'Product_Brand','City','Product_Type','Ratings']

# Apply K-Means clustering
kmeans = KMeans(n_clusters=3, random_state=42)
df['Cluster'] = kmeans.fit_predict(df[features])

df.head(2)


Unnamed: 0,Transaction_ID,Customer_ID,City,State,Country,Age,Gender,Income,Customer_Segment,Date,Year,Month,Time,Total_Purchases,Amount,Total_Amount,Product_Category,Product_Brand,Product_Type,Feedback,Shipping_Method,Payment_Method,Order_Status,Ratings,products,yearmonth,Cluster
0,8691788.0,37249.0,35,Berlin,Germany,-0.963242,1,1,Regular,2023-09-18,2023.0,September,22:03:55,-0.822819,108.028757,-0.924469,1,10,23,Excellent,Same-Day,Debit Card,Shipped,1.391396,Cycling shorts,202309,1
1,2174773.0,69749.0,87,England,UK,-1.096402,0,1,Premium,2023-12-31,2023.0,December,8:42:04,-1.171458,403.353907,-0.496912,2,14,28,Excellent,Standard,Credit Card,Processing,0.634218,Lenovo Tab,202312,2


In [49]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

# Features and target
X = df[features]
y = df['Cluster']

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


# Train KNN classifier
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train, y_train)

# Predict and evaluate
y_pred = knn.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print(f'Accuracy: {accuracy}')


Accuracy: 0.9973276503619511


In [56]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a pipeline with standard scaler and KNN
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('knn', KNeighborsClassifier())
])

# Define hyperparameters for grid search
param_grid = {
    'knn__n_neighbors': [3,4, 5,6,7, 9],
    'knn__weights': ['uniform', 'distance']
}

# Grid search with cross-validation
grid_search = GridSearchCV(pipeline, param_grid, cv=5)
grid_search.fit(X_train, y_train)

# Best model
best_knn = grid_search.best_estimator_

# Predict and evaluate
y_pred = best_knn.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print(f'Best parameters: {grid_search.best_params_}')
print(f'Accuracy: {accuracy}')

Best parameters: {'knn__n_neighbors': 9, 'knn__weights': 'distance'}
Accuracy: 0.924971246870983


Test Model

In [55]:
# New customer data
new_customer = {
    'Age': 27,
    'Gender': 'Male',
    'Income': 'High',
    'Total_Purchases': 4,
    'Total_Amount': 1200.00,
    'Product_Category': 'Electronics',
    'Product_Brand': 'Samsung'
}
new_customer_df = pd.DataFrame([new_customer])

# Encode and scale new customer data
for column in ['Gender', 'Income', 'Product_Category', 'Product_Brand']:
    new_customer_df[column] = label_encoders[column].transform(new_customer_df[column])

new_customer_df[['Age', 'Total_Purchases', 'Total_Amount']] = pipeline.named_steps['scaler'].transform(new_customer_df[['Age', 'Total_Purchases', 'Total_Amount']])

# Predict cluster for the new customer
predicted_cluster = best_knn.predict(new_customer_df[features])
print(f'Predicted Customer Segment: {predicted_cluster[0]}')

ValueError: invalid literal for int() with base 10: 'Male'