In [27]:
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
import pandas as pd
from sklearn.preprocessing import StandardScaler
from datetime import datetime
 
# Load the tab-separated campaign export; the first row holds the column names.
df = pd.read_csv("marketing_campaign.csv", sep='\t', header=0)
df = df.dropna()  # income contains nulls
print(df.nunique())

# Feature engineering
df['Dt_Customer'] = pd.to_datetime(df['Dt_Customer'], format='%d-%m-%Y')
# NOTE: tenure is measured against the date the cell is run, so
# Days_Since_Joining changes from one day to the next.
df['Days_Since_Joining'] = (pd.to_datetime('today') - df['Dt_Customer']).dt.days
df['Year_Enrolled'] = df['Dt_Customer'].dt.year
# Dt_Customer is replaced by the two derived columns above. Z_CostContact and
# Z_Revenue each hold a single distinct value and ID is distinct on every row
# (see the nunique() report), so none of them carries usable signal.
df = df.drop(['Dt_Customer', 'Z_Revenue', 'Z_CostContact', 'ID'], axis=1)
# One-hot encode the two categorical columns, then cast every column to int.
# NOTE(review): the cast applies to the whole frame, so any fractional value
# (e.g. in Income) would be truncated - confirm that is intended.
df = pd.get_dummies(df, columns=['Education', 'Marital_Status']).astype(int)

# DataFrame.info() writes its report itself and returns None; wrapping it in
# print() would append a stray "None" line to the output.
df.info()

ID                     2216
Year_Birth               59
Education                 5
Marital_Status            8
Income                 1974
Kidhome                   3
Teenhome                  3
Dt_Customer             662
Recency                 100
MntWines                776
MntFruits               158
MntMeatProducts         554
MntFishProducts         182
MntSweetProducts        176
MntGoldProds            212
NumDealsPurchases        15
NumWebPurchases          15
NumCatalogPurchases      14
NumStorePurchases        14
NumWebVisitsMonth        16
AcceptedCmp3              2
AcceptedCmp4              2
AcceptedCmp5              2
AcceptedCmp1              2
AcceptedCmp2              2
Complain                  2
Z_CostContact             1
Z_Revenue                 1
Response                  2
dtype: int64
<class 'pandas.core.frame.DataFrame'>
Index: 2216 entries, 0 to 2239
Data columns (total 39 columns):
 #   Column                   Non-Null Count  Dtype
---  ------          

In [None]:
# Collapse the data to one row per birth year, summing product spending.
df_agg = df.groupby('Year_Birth').agg({
    'MntFruits': 'sum',
    'MntMeatProducts': 'sum',
    'MntFishProducts': 'sum'
}).reset_index()

# Only the meat and fish totals drive the clustering; MntFruits is aggregated
# solely so it can be shown in the report at the end of the cell.
X = df_agg[['MntMeatProducts', 'MntFishProducts']]

# Standardise both features so neither dominates the Euclidean distance.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# 5 clusters, change if needed. n_init is pinned because its default differs
# between scikit-learn releases (10 restarts vs. 'auto'), which would otherwise
# make the labels version-dependent and trigger a FutureWarning on some versions.
kmeans = KMeans(n_clusters=5, n_init=10, random_state=42)
df_agg['Cluster'] = kmeans.fit_predict(X_scaled)

# Each point is one birth-year cohort, coloured by its assigned cluster.
plt.scatter(X_scaled[:, 0], X_scaled[:, 1], c=df_agg['Cluster'], cmap='viridis')
plt.xlabel('Scaled meat')
plt.ylabel('Scaled fish')
plt.title('Birth-Year Cohorts Clustered by Meat and Fish Spending')
plt.show()

# View the clustered data (one row per birth year).
print(df_agg[['Cluster', 'Year_Birth', 'MntFruits', 'MntMeatProducts', 'MntFishProducts']])