<a href="https://colab.research.google.com/github/Govindsanthosh0/Automated-Garage-Door-System/blob/main/ML%20project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# === STEP 1: Upload + Quick Clean ===
from google.colab import files
up = files.upload()  # choose Walmart_customer_purchases.csv

import io, pandas as pd, numpy as np
name = next(iter(up))
df = pd.read_csv(io.BytesIO(up[name]))

# Standardize expected columns
expected = ['Customer_ID','Age','Gender','City','Category','Product_Name',
            'Purchase_Date','Purchase_Amount','Payment_Method',
            'Discount_Applied','Rating','Repeat_Customer']
print("Missing (ok if empty):", [c for c in expected if c not in df.columns])

# Basic cleaning
df['Gender'] = df['Gender'].astype(str).str.title().replace({'Nan':'Other'})
df['Discount_Applied'] = df['Discount_Applied'].astype(str).str.title().map({'Yes':1,'No':0})
df['Repeat_Customer']  = df['Repeat_Customer'].astype(str).str.title().map({'Yes':1,'No':0})
df['Purchase_Date'] = pd.to_datetime(df['Purchase_Date'], dayfirst=True, errors='coerce')

for col in ['Age','Purchase_Amount','Rating']:
    df[col] = pd.to_numeric(df[col], errors='coerce')
    df[col] = df[col].fillna(df[col].median())

df = df.dropna(subset=['Repeat_Customer'])  # ensure target present

print("Shape:", df.shape)
print("Nulls per column:\n", df.isna().sum())
print("Target balance:\n", df['Repeat_Customer'].value_counts())

# Save for next steps
df.to_csv('/content/walmart_clean.csv', index=False)
print("Saved -> /content/walmart_clean.csv")


Saving Walmart_customer_purchases.csv.csv to Walmart_customer_purchases.csv (1).csv
Missing (ok if empty): []


  df['Purchase_Date'] = pd.to_datetime(df['Purchase_Date'], dayfirst=True, errors='coerce')


Shape: (50000, 12)
Nulls per column:
 Customer_ID         0
Age                 0
Gender              0
City                0
Category            0
Product_Name        0
Purchase_Date       0
Purchase_Amount     0
Payment_Method      0
Discount_Applied    0
Rating              0
Repeat_Customer     0
dtype: int64
Target balance:
 Repeat_Customer
1    25244
0    24756
Name: count, dtype: int64
Saved -> /content/walmart_clean.csv


In [3]:
# === STEP 2: Feature Engineering + Feature Reduction ===

df = pd.read_csv('/content/walmart_clean.csv', parse_dates=['Purchase_Date'])

# Create new features
df['age_group'] = pd.cut(df['Age'], bins=[0,25,45,120], labels=['Youth','Adult','Senior'])
df['purchase_month'] = df['Purchase_Date'].dt.month
df['purchase_day'] = df['Purchase_Date'].dt.day
df['is_weekend'] = df['Purchase_Date'].dt.weekday.isin([5,6]).astype(int)

# Reduce category cardinality
top_categories = df['Category'].value_counts().index[:5]
df['Category'] = df['Category'].where(df['Category'].isin(top_categories), 'Other')

# Reduce city cardinality
top_cities = df['City'].value_counts().index[:10]
df['City'] = df['City'].where(df['City'].isin(top_cities), 'Other')

# Drop unneeded columns
df_model = df.drop(columns=['Customer_ID','Product_Name','Purchase_Date'])

# One-hot encoding
df_model = pd.get_dummies(df_model, drop_first=True)

# Split X and y
y = df_model['Repeat_Customer']
X = df_model.drop(columns=['Repeat_Customer'])

print("âœ… New Shape:", X.shape)
df_model.head()

# Save for ML step
X.to_csv('/content/X_ready.csv', index=False)
y.to_csv('/content/y_ready.csv', index=False)
print("Saved -> X_ready.csv & y_ready.csv")


âœ… New Shape: (50000, 27)
Saved -> X_ready.csv & y_ready.csv


In [4]:
# === STEP 3: Random Forest ML Model ===

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Load data
import pandas as pd
X = pd.read_csv('/content/X_ready.csv')
y = pd.read_csv('/content/y_ready.csv').values.ravel()

# Split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y)

# Train model
rf = RandomForestClassifier(
    n_estimators=120,
    random_state=42,
    n_jobs=-1
)
rf.fit(X_train, y_train)

# Predict & Score
y_pred = rf.predict(X_test)
acc = accuracy_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("âœ… Random Forest Accuracy:", round(acc*100, 2), "%\n")
print("Confusion Matrix:\n", cm, "\n")
print("Classification Report:\n", report)


âœ… Random Forest Accuracy: 49.79 %

Confusion Matrix:
 [[2445 2506]
 [2515 2534]] 

Classification Report:
               precision    recall  f1-score   support

           0       0.49      0.49      0.49      4951
           1       0.50      0.50      0.50      5049

    accuracy                           0.50     10000
   macro avg       0.50      0.50      0.50     10000
weighted avg       0.50      0.50      0.50     10000



In [5]:
# === Improved Feature Engineering ===

df = pd.read_csv('/content/walmart_clean.csv', parse_dates=['Purchase_Date'])

df['age_group'] = pd.cut(df['Age'], bins=[0,25,45,120], labels=['Youth','Adult','Senior'])
df['purchase_month'] = df['Purchase_Date'].dt.month
df['purchase_day'] = df['Purchase_Date'].dt.day
df['is_weekend'] = df['Purchase_Date'].dt.weekday.isin([5,6]).astype(int)

# Keep top 20 cities
top_cities = df['City'].value_counts().index[:20]
df['City'] = df['City'].where(df['City'].isin(top_cities), 'Other')

# Keep top 8 categories
top_categories = df['Category'].value_counts().index[:8]
df['Category'] = df['Category'].where(df['Category'].isin(top_categories), 'Other')

df_model = df.drop(columns=['Customer_ID','Product_Name','Purchase_Date'])
df_model = pd.get_dummies(df_model, drop_first=True)

y = df_model['Repeat_Customer']
X = df_model.drop(columns=['Repeat_Customer'])

print("âœ… New Shape:", X.shape)

X.to_csv('/content/X_ready_v2.csv', index=False)
y.to_csv('/content/y_ready_v2.csv', index=False)
print("Saved new version âœ…")


âœ… New Shape: (50000, 37)
Saved new version âœ…


In [6]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import pandas as pd

X = pd.read_csv('/content/X_ready_v2.csv')
y = pd.read_csv('/content/y_ready_v2.csv').values.ravel()

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y)

rf = RandomForestClassifier(n_estimators=150, random_state=42, n_jobs=-1)
rf.fit(X_train, y_train)

y_pred = rf.predict(X_test)
acc = accuracy_score(y_test, y_pred)

cm = confusion_matrix(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("âœ… Improved Accuracy:", round(acc*100, 2), "%")
print("\nConfusion Matrix:\n", cm)
print("\nClassification Report:\n", report)


âœ… Improved Accuracy: 49.87 %

Confusion Matrix:
 [[2431 2520]
 [2493 2556]]

Classification Report:
               precision    recall  f1-score   support

           0       0.49      0.49      0.49      4951
           1       0.50      0.51      0.50      5049

    accuracy                           0.50     10000
   macro avg       0.50      0.50      0.50     10000
weighted avg       0.50      0.50      0.50     10000



In [7]:
# === K-Means Customer Segmentation ===

from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score

# Select numeric features
features = ['Age', 'Purchase_Amount', 'Rating']
df_cluster = df[features]

# Standardize
scaler = StandardScaler()
X_scaled = scaler.fit_transform(df_cluster)

# Train K-Means
kmeans = KMeans(n_clusters=4, random_state=42)
clusters = kmeans.fit_predict(X_scaled)

# Add back into dataset
df['Cluster'] = clusters

# Evaluate
score = silhouette_score(X_scaled, clusters)

print("âœ… K-Means Clusters Created")
print("Silhouette Score:", round(score,3))
df[['Age','Purchase_Amount','Rating','Cluster']].head()


âœ… K-Means Clusters Created
Silhouette Score: 0.27


Unnamed: 0,Age,Purchase_Amount,Rating,Cluster
0,49,253.26,1,2
1,36,73.19,1,1
2,52,125.62,1,1
3,47,450.32,2,2
4,43,369.28,2,2


In [8]:
# === STEP 5: Detailed Cluster Summary ===

cluster_summary = df.groupby('Cluster')[['Age','Purchase_Amount','Rating']].mean().round(2)
cluster_counts = df['Cluster'].value_counts().sort_index()

cluster_summary['Count'] = cluster_counts.values
cluster_summary['% of Customers'] = round((cluster_summary['Count'] / len(df)) * 100, 2)

print("ðŸ“Œ Detailed Cluster Summary:\n")
print(cluster_summary)

# Export for dashboard
cluster_summary.to_csv('/content/cluster_summary.csv')
df.to_csv('/content/clustered_customers.csv', index=False)


ðŸ“Œ Detailed Cluster Summary:

           Age  Purchase_Amount  Rating  Count  % of Customers
Cluster                                                       
0        27.93           311.28    4.14  12533           25.07
1        33.68           132.24    1.87  12363           24.73
2        44.24           382.22    1.93  12862           25.72
3        49.97           189.86    4.09  12242           24.48


In [9]:
# Assign business-friendly cluster names
cluster_labels = {
    0: 'Young High Spenders',
    1: 'Deal Seekers (Low Spend, Happy)',
    2: 'Unhappy Premium Customers',
    3: 'Loyal Budget Buyers'
}

df['Segment'] = df['Cluster'].map(cluster_labels)

# Show sample
df[['Age','Purchase_Amount','Rating','Cluster','Segment']].head(10)


Unnamed: 0,Age,Purchase_Amount,Rating,Cluster,Segment
0,49,253.26,1,2,Unhappy Premium Customers
1,36,73.19,1,1,"Deal Seekers (Low Spend, Happy)"
2,52,125.62,1,1,"Deal Seekers (Low Spend, Happy)"
3,47,450.32,2,2,Unhappy Premium Customers
4,43,369.28,2,2,Unhappy Premium Customers
5,59,435.48,3,2,Unhappy Premium Customers
6,47,231.04,4,3,Loyal Budget Buyers
7,23,240.91,2,1,"Deal Seekers (Low Spend, Happy)"
8,60,179.76,5,3,Loyal Budget Buyers
9,52,463.43,3,2,Unhappy Premium Customers


In [10]:
df.to_csv('/content/customer_segments_final.csv', index=False)
print("âœ… Exported: customer_segments_final.csv")


âœ… Exported: customer_segments_final.csv
