In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

#1. Load the Data
df=pd.read_csv("/content/drive/MyDrive/END TO END DATA ANALYST PROJECT/customer_shopping_behavior.csv")
df

# 2. Normalize frequency categories
df["Frequency of Purchases"] = df["Frequency of Purchases"].replace({
    "Fortnightly": "Biweekly",
    "Bi-Weekly": "Biweekly",
    "Every 3 Months": "Quarterly"
})

# 3. Create churn Column
df["Churn"] = df["Frequency of Purchases"].isin(["Quarterly", "Annually"]).astype(int)


# 4. Check churn distribution
print(df["Churn"].value_counts())   #value_counts() returns the number of observations in each class.
print()
print(df["Churn"].value_counts(normalize=True)) #value_counts(normalize=True) returns the proportion of observations in each class.
print()
print()
print()


#7. Behavioral features Engineering

df["Low_History"] = (df["Previous Purchases"] <= 13).astype(int)
df["High_History"] = (df["Previous Purchases"] >= 38).astype(int)


#8. New_Churn Implementation

median_amt = df["Purchase Amount (USD)"].median()

risk_frequency = df["Frequency of Purchases"].isin(["Quarterly", "Annually"]).astype(int)
risk_purchase_history = (df["Previous Purchases"] <= 13).astype(int)
risk_subscription_status=(df["Subscription Status"] == "No").astype(int)
risk_purchase_amount= (df["Purchase Amount (USD)"] <= median_amt).astype(int)

risk_sum = risk_frequency + risk_purchase_history + risk_subscription_status + risk_purchase_amount

df["Churn_New"] = (risk_sum >= 2).astype(int)


#9. New_Churn_Distribution

print(df["Churn_New"].value_counts())
print()
print(df["Churn_New"].value_counts(normalize=True))


# 5. Feature selection

drop_cols = [
    "Customer ID",
    "Frequency of Purchases",
    "Item Purchased",
    "Color",
    "Size",
    "Season",
    "Shipping Type"
]

X = df.drop(columns=drop_cols + ["Churn","Churn_New"])
y = df["Churn_New"]

print("Feature shape:", X.shape)
print("Target shape:", y.shape)
print()
print()
# print(X.dtypes)


#6.Encoding
categorical_cols=["Gender","Category","Location","Subscription Status","Discount Applied","Promo Code Used","Payment Method"]
X_encoded = pd.get_dummies(X, columns=categorical_cols, drop_first=True)

X_encoded.shape
print()
print()
print(X_encoded.isnull().sum().sum())


X_encoded.isnull().sum().sort_values(ascending=False).head(10)


RR_median = X_encoded["Review Rating"].median()
X_encoded["Review Rating"] = X_encoded["Review Rating"].fillna(RR_median)

print()
print()
print("After median imputation the null value are :-",X_encoded.isnull().sum().sum())

df

Churn
0    2181
1    1719
Name: count, dtype: int64

Churn
0    0.559231
1    0.440769
Name: proportion, dtype: float64



Churn_New
1    2634
0    1266
Name: count, dtype: int64

Churn_New
1    0.675385
0    0.324615
Name: proportion, dtype: float64
Feature shape: (3900, 13)
Target shape: (3900,)




37


After median imputation the null value are :- 0


Unnamed: 0,Customer ID,Age,Gender,Item Purchased,Category,Purchase Amount (USD),Location,Size,Color,Season,...,Shipping Type,Discount Applied,Promo Code Used,Previous Purchases,Payment Method,Frequency of Purchases,Churn,Low_History,High_History,Churn_New
0,1,55,Male,Blouse,Clothing,53,Kentucky,L,Gray,Winter,...,Express,Yes,Yes,14,Venmo,Biweekly,0,0,0,0
1,2,19,Male,Sweater,Clothing,64,Maine,L,Maroon,Winter,...,Express,Yes,Yes,2,Cash,Biweekly,0,1,0,0
2,3,50,Male,Jeans,Clothing,73,Massachusetts,S,Maroon,Spring,...,Free Shipping,Yes,Yes,23,Credit Card,Weekly,0,0,0,0
3,4,21,Male,Sandals,Footwear,90,Rhode Island,M,Maroon,Spring,...,Next Day Air,Yes,Yes,49,PayPal,Weekly,0,0,1,0
4,5,45,Male,Blouse,Clothing,49,Oregon,M,Turquoise,Spring,...,Free Shipping,Yes,Yes,31,PayPal,Annually,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3895,3896,40,Female,Hoodie,Clothing,28,Virginia,L,Turquoise,Summer,...,2-Day Shipping,No,No,32,Venmo,Weekly,0,0,0,1
3896,3897,52,Female,Backpack,Accessories,49,Iowa,L,White,Spring,...,Store Pickup,No,No,41,Bank Transfer,Biweekly,0,0,1,1
3897,3898,46,Female,Belt,Accessories,33,New Jersey,L,Green,Spring,...,Standard,No,No,24,Venmo,Quarterly,1,0,0,1
3898,3899,44,Female,Shoes,Footwear,77,Minnesota,S,Brown,Summer,...,Express,No,No,24,Venmo,Weekly,0,0,0,0


# **Model Selection**




In [2]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

model=LogisticRegression(max_iter=1000)

model.fit(X_train,y_train)

y_pred=model.predict(X_test)

accuracy = accuracy_score(y_pred,y_test)
print(accuracy)
print()
print()

from sklearn.metrics import roc_auc_score

y_prob = model.predict_proba(X_test)[:, 1]
auc = roc_auc_score(y_test, y_prob)

print("ROC-AUC:", auc)


0.8051282051282052


ROC-AUC: 0.9077355687606112


In [3]:
from sklearn.ensemble import RandomForestClassifier

RF=RandomForestClassifier()

RF.fit(X_train,y_train)

y_pred=RF.predict(X_test)

accuracy = accuracy_score(y_pred,y_test)
print(accuracy)

from sklearn.metrics import roc_auc_score

y_prob = model.predict_proba(X_test)[:, 1]
auc = roc_auc_score(y_test, y_prob)

print("ROC-AUC:", auc)


0.8051282051282052
ROC-AUC: 0.9077355687606112


In [4]:
df["Previous Purchases"].describe()
# Questions:-

#what was the problem why our both model was weak why rf model also score 0.52 ?
#Answer:- the model does not finds any clear signals till now

# what was the reason behind using the roc-auc in both the model
#Answer:-

# what do you meant by behavioural feature
#Answer:- The features or the columns that shows the behavioural information about the customer like Subscription Status, Previous Purchases, Discount Applied ,Frequency of Purchases etc

Unnamed: 0,Previous Purchases
count,3900.0
mean,25.351538
std,14.447125
min,1.0
25%,13.0
50%,25.0
75%,38.0
max,50.0


# **Strongest Churn Drivers**

In [5]:
import pandas as pd

feature_importance = pd.Series(model.coef_[0], index=X_encoded.columns)
feature_importance = feature_importance.sort_values(key=abs, ascending=False)

print(feature_importance.head(10))


Low_History                3.483292
Subscription Status_Yes   -3.387732
Location_Missouri          0.749565
Location_North Carolina   -0.695176
Location_Washington        0.636200
Location_Hawaii           -0.619889
Location_New Jersey        0.567081
Location_Massachusetts    -0.510925
Location_Pennsylvania     -0.478274
Location_Idaho             0.448044
dtype: float64


In [7]:
print(feature_importance.head(5))
print()
print(feature_importance.tail(5))


Low_History                3.483292
Subscription Status_Yes   -3.387732
Location_Missouri          0.749565
Location_North Carolina   -0.695176
Location_Washington        0.636200
dtype: float64

Location_Mississippi    0.009962
Age                    -0.001808
Location_Arizona        0.001067
Location_Kansas         0.000818
Location_Utah           0.000043
dtype: float64


# **Model Performance**

The enhanced churn prediction system was evaluated on a retail customer dataset containing **3,900** users and **65 engineered behavioral features**. Models were assessed using accuracy and ROC-AUC to measure classification performance and class separability.

1. **Logistic Regression**

   *  Accuracy: 80.5%

   *  ROC-AUC: 0.91


2. **Random Forest**

   * Accuracy: 79.0%

   * ROC-AUC: 0.91

The high ROC-AUC demonstrates strong separability between churn-risk and active customers based on engineered behavioral engagement signals.


**Target Engineering Impact**

The multi-factor churn definition (frequency, purchase history, subscription status, and spending) **improved ROC-AUC from ~0.53 (frequency-only baseline) to ~0.91, highlighting the effectiveness of behavioral target engineering in improving predictive performance.**
