In [None]:
!pip install pandas matplotlib seaborn scikit-learn

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
%matplotlib inline

In [None]:
import os

# Build a synthetic customer table and write it to 'data/customers_data.csv'.
# A fixed seed keeps the generated rows identical across
# Restart Kernel -> Run All, which the unseeded np.random calls did not.
SEED = 42
N_CUSTOMERS = 100
rng = np.random.default_rng(SEED)

os.makedirs('data', exist_ok=True)

# NOTE(review): this cell unconditionally overwrites 'data/customers_data.csv';
# skip it if a real dataset has already been placed at that path.
data = {
    # Generator.integers, like np.random.randint, excludes the upper bound.
    'age': rng.integers(18, 80, N_CUSTOMERS),
    'income': rng.integers(20000, 120000, N_CUSTOMERS),
    'purchase_frequency': rng.integers(1, 50, N_CUSTOMERS),
    'segment': rng.choice(['A', 'B', 'C'], N_CUSTOMERS),
}
df = pd.DataFrame(data)
df.to_csv('data/customers_data.csv', index=False)
print("✅ Sample data created and saved to 'data/customers_data.csv'")

In [None]:
# Read the customer CSV into `df`, report the row count and preview the first
# rows. A missing file is reported with a message instead of a traceback.
customers_csv = 'data/customers_data.csv'
try:
    df = pd.read_csv(customers_csv)
    print(f"✅ Data Loaded. Total Records: {len(df)}")
    preview = df.head()
    display(preview)  # rich HTML table in Jupyter rather than plain text
except FileNotFoundError:
    print("❌ Error: 'data/customers_data.csv' not found. Please provide the correct file path or use the sample data above.")

In [None]:
# Column dtypes and non-null counts.
print("📊 Dataset Info:")
df.info()

# Number of missing entries in each column.
missing_per_column = df.isnull().sum()
print("\n🔍 Missing Values:")
print(missing_per_column)

# Bar chart of the existing 'segment' labels, only when that column is present.
if 'segment' not in df.columns:
    print("⚠️ No 'segment' column found. Skipping segment distribution plot.")
else:
    print("\n🎯 Customer Segments Distribution:")
    fig, ax = plt.subplots(figsize=(8, 5))
    sns.countplot(data=df, x='segment', ax=ax)
    ax.set_title("Existing Segments Distribution")
    plt.show()

In [None]:
# Columns fed to the clustering step.
features = ['age', 'income', 'purchase_frequency']

# Standardise the selected columns into `X_scaled`; if any column is absent,
# report it and leave `X_scaled` undefined.
has_all_features = all(col in df.columns for col in features)
if not has_all_features:
    print(f"❌ Error: One or more features {features} not found in dataset.")
else:
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(df[features])
    print("✅ Features Scaled")

In [None]:
# Elbow method: fit K-Means for k = 1..10 and record each model's inertia
# so the bend in the curve can guide the choice of k.
inertia = []
k_range = range(1, 11)
for k in k_range:
    # n_init is pinned: scikit-learn changed its default from 10 to 'auto' in
    # 1.4 (and warns about it in 1.2-1.3), so leaving it implicit makes the
    # curve depend on the installed version.
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
    kmeans.fit(X_scaled)
    inertia.append(kmeans.inertia_)

plt.figure(figsize=(8,5))
plt.plot(k_range, inertia, marker='o')
plt.title('Elbow Method For Optimal k')
plt.xlabel('Number of clusters')
plt.ylabel('Inertia')
plt.show()

In [None]:
# Silhouette score for k = 2..6 as a second opinion on the number of clusters.
for k in range(2, 7):
    # n_init is pinned: scikit-learn changed its default from 10 to 'auto' in
    # 1.4 (and warns about it in 1.2-1.3), so leaving it implicit makes the
    # scores depend on the installed version.
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
    kmeans.fit(X_scaled)
    score = silhouette_score(X_scaled, kmeans.labels_)
    print(f"Silhouette Score for k={k}: {score:.4f}")

In [None]:
# Fit the final K-Means model and store each row's cluster label in df['cluster'].
optimal_k = 3  # Adjust based on elbow plot or silhouette scores
# n_init is pinned: scikit-learn changed its default from 10 to 'auto' in 1.4
# (and warns about it in 1.2-1.3), so leaving it implicit makes the final
# assignment depend on the installed version.
kmeans = KMeans(n_clusters=optimal_k, n_init=10, random_state=42)
df['cluster'] = kmeans.fit_predict(X_scaled)
print("✅ Clustering Completed. Sample Data:")
display(df.head())

In [None]:
# Scatter of income against purchase frequency, coloured by cluster label.
# Drawn only when all three columns exist in df.
required_columns = {'income', 'purchase_frequency', 'cluster'}
if required_columns.issubset(df.columns):
    fig, ax = plt.subplots(figsize=(8, 5))
    sns.scatterplot(
        data=df,
        x='income',
        y='purchase_frequency',
        hue='cluster',
        palette='viridis',
        ax=ax,
    )
    ax.set_title("Customer Segments by Income and Purchase Frequency")
    plt.show()
else:
    print("⚠️ Required columns for scatter plot are missing.")

In [None]:
# Write the frame to CSV without the index column. Any failure is reported
# with a message rather than raised.
segments_csv = 'data/customers_with_segments.csv'
try:
    df.to_csv(segments_csv, index=False)
    print("✅ Segmented customer data saved to 'data/customers_with_segments.csv'")
except Exception as save_error:
    print(f"❌ Error saving file: {save_error}")