# **Use case: CLASSIFICATION**  
dataset ➡ https://www.kaggle.com/datasets/undersc0re/predict-the-churn-risk-rate  
*Churn rate is a marketing metric that describes the number of customers who leave a business over a specific time period. Every user is assigned a prediction value that estimates their state of churn at any given time. This value is based on:*
1. User demographic information
2. Browsing behavior
3. Historical purchase data, among other information

> # Data cleaning _ Classification

> importing dataset

In [1]:
# Libraries used by the notebook, followed by the initial dataset load.
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Read the churn data from the CSV file (path is relative to the working directory).
churn_df = pd.read_csv("churn.csv")

ModuleNotFoundError: No module named 'pandas'

> datasets overview

In [None]:
# Preview only the first rows; rendering the whole DataFrame as cell output
# bloats the notebook without adding information.
churn_df.head()

> handling missing values

In [None]:

# Clean the loaded churn data and derive the extra columns used further down:
# type fixes -> imputation -> de-duplication -> outlier filter -> new features
# -> one-hot encoding. The result is re-bound to `churn_df`.

# Correct numeric types FIRST so that the mean imputation below also covers
# these columns. Any currency symbol / thousands separator stored as text is
# stripped; values that are already numeric pass through unchanged.
# (Raw string: '\$' in a plain string literal is an invalid escape sequence.)
numeric_columns = ['avg_transaction_value', 'points_in_wallet']
for col in numeric_columns:
    churn_df[col] = churn_df[col].replace({r'\$': '', ',': ''}, regex=True).astype(float)

# Handling missing values: numeric columns receive their mean, whatever is
# still missing afterwards receives the column's most frequent value.
# numeric_only=True is required: DataFrame.mean() raises TypeError on text
# columns in pandas >= 2.0.
churn_df = churn_df.fillna(churn_df.mean(numeric_only=True))
churn_df = churn_df.fillna(churn_df.mode().iloc[0])

# Correct Data Types
churn_df['joining_date'] = pd.to_datetime(churn_df['joining_date'])
churn_df['last_visit_time'] = pd.to_datetime(churn_df['last_visit_time'])

categorical_columns = ['region_category', 'membership_category', 'joined_through_referral', 'preferred_offer_types', 'medium_of_operation', 'internet_option', 'used_special_discount', 'offer_application_preference', 'past_complaint', 'complaint_status', 'feedback']
churn_df = churn_df.astype({col: 'category' for col in categorical_columns})

# Remove Duplicates
churn_df = churn_df.drop_duplicates()

# Handle Outliers: keep rows whose age lies within three standard deviations
# of the mean. The explicit .copy() makes the filtered frame independent, so
# the column assignments below do not trigger SettingWithCopyWarning.
age_mean = churn_df['age'].mean()
age_std = churn_df['age'].std()
churn_df = churn_df[(churn_df['age'] - age_mean).abs() <= 3 * age_std].copy()

# Feature Engineering
# NOTE: membership_duration is measured against the current time, so its
# values shift by the elapsed days every time the notebook is re-run.
churn_df['membership_duration'] = (pd.Timestamp.now() - churn_df['joining_date']).dt.days
# pd.cut bins are right-closed by default, e.g. age 18 lands in '<18';
# ages outside (0, 100] become NaN.
churn_df['age_group'] = pd.cut(churn_df['age'], bins=[0, 18, 35, 60, 100], labels=['<18', '18-35', '35-60', '>60'])

# One-Hot Encoding
churn_df = pd.get_dummies(churn_df, columns=['region_category', 'membership_category'])


> # Data visualisation _ Classification

> Histogram:

In [None]:
# Histogram of customer ages, drawn on an explicitly created axes
age_fig, age_ax = plt.subplots()
churn_df['age'].hist(ax=age_ax)
age_ax.set_title('Distribution of Customer Age')
age_ax.set_xlabel('Age')
age_ax.set_ylabel('Frequency')
plt.show()

> Box Plots:

In [None]:
# Box plot summarising the spread of the average transaction value
txn_fig, txn_ax = plt.subplots()
sns.boxplot(y=churn_df['avg_transaction_value'], ax=txn_ax)
txn_ax.set_title('Box Plot of Average Transaction Value')
txn_ax.set_ylabel('Average Transaction Value')
plt.show()

# **The Classification model**

In [None]:
# Libraries needed for the classification section (AdaBoost is new here),
# followed by a fresh load of the dataset.
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import AdaBoostClassifier  # Added AdaBoost
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Re-read the CSV so the modelling section starts from the data as stored on disk.
churn_df = pd.read_csv("churn.csv")

In [None]:
# Clean the freshly loaded churn data and derive the model features:
# type fixes -> imputation -> de-duplication -> outlier filter -> new features
# -> one-hot encoding. The result is re-bound to `churn_df`.

# Correct numeric types FIRST so that the mean imputation below also covers
# these columns. Any currency symbol / thousands separator stored as text is
# stripped; values that are already numeric pass through unchanged.
# (Raw string: '\$' in a plain string literal is an invalid escape sequence.)
numeric_columns = ['avg_transaction_value', 'points_in_wallet']
for col in numeric_columns:
    churn_df[col] = churn_df[col].replace({r'\$': '', ',': ''}, regex=True).astype(float)

# Handling missing values: numeric columns receive their mean, whatever is
# still missing afterwards receives the column's most frequent value.
# numeric_only=True is required: DataFrame.mean() raises TypeError on text
# columns in pandas >= 2.0.
churn_df = churn_df.fillna(churn_df.mean(numeric_only=True))
churn_df = churn_df.fillna(churn_df.mode().iloc[0])

# Correct Data Types
churn_df['joining_date'] = pd.to_datetime(churn_df['joining_date'])
churn_df['last_visit_time'] = pd.to_datetime(churn_df['last_visit_time'])

categorical_columns = ['region_category', 'membership_category', 'joined_through_referral', 'preferred_offer_types', 'medium_of_operation', 'internet_option', 'used_special_discount', 'offer_application_preference', 'past_complaint', 'complaint_status', 'feedback']
churn_df = churn_df.astype({col: 'category' for col in categorical_columns})

# Remove Duplicates
churn_df = churn_df.drop_duplicates()

# Handle Outliers: keep rows whose age lies within three standard deviations
# of the mean. The explicit .copy() makes the filtered frame independent, so
# the column assignments below do not trigger SettingWithCopyWarning.
age_mean = churn_df['age'].mean()
age_std = churn_df['age'].std()
churn_df = churn_df[(churn_df['age'] - age_mean).abs() <= 3 * age_std].copy()

# Feature Engineering
# NOTE: membership_duration is measured against the current time, so its
# values shift by the elapsed days every time the notebook is re-run.
churn_df['membership_duration'] = (pd.Timestamp.now() - churn_df['joining_date']).dt.days
# pd.cut bins are right-closed by default, e.g. age 18 lands in '<18';
# ages outside (0, 100] become NaN.
churn_df['age_group'] = pd.cut(churn_df['age'], bins=[0, 18, 35, 60, 100], labels=['<18', '18-35', '35-60', '>60'])

# One-Hot Encoding
churn_df = pd.get_dummies(churn_df, columns=['region_category', 'membership_category'])


In [None]:
# Data visualisation - Classification
# Each figure gets its own explicitly created axes.

# Histogram of customer ages
age_fig, age_ax = plt.subplots()
churn_df['age'].hist(ax=age_ax)
age_ax.set_title('Distribution of Customer Age')
age_ax.set_xlabel('Age')
age_ax.set_ylabel('Frequency')
plt.show()

# Box plot summarising the spread of the average transaction value
txn_fig, txn_ax = plt.subplots()
sns.boxplot(y=churn_df['avg_transaction_value'], ax=txn_ax)
txn_ax.set_title('Box Plot of Average Transaction Value')
txn_ax.set_ylabel('Average Transaction Value')
plt.show()


In [None]:
# AdaBoost Model: choose the inputs, split the data, build the classifier.

# Feature Selection
# The region_category_* names are expected to be dummy columns created by the
# one-hot encoding of 'region_category' during cleaning.
features = [
    'age',
    'avg_transaction_value',
    'points_in_wallet',
    'membership_duration',
    'region_category_City',
    'region_category_Town',
    'region_category_Village',
]
X, y = churn_df[features], churn_df['churn_risk_score']

# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Create AdaBoost classifier
adaboost_model = AdaBoostClassifier(
    n_estimators=50,  # You can adjust the number of estimators as needed
    random_state=42,
)


In [None]:

# Fit AdaBoost on the training split, then predict labels for the test split.
# fit() returns the fitted estimator itself, so the two steps can be chained.
y_pred_adaboost = adaboost_model.fit(X_train, y_train).predict(X_test)

The goal here is to build a classification model that predicts each customer's churn risk score.

In [None]:
# Score the AdaBoost predictions against the held-out labels
accuracy_adaboost = accuracy_score(y_test, y_pred_adaboost)
classification_report_output_adaboost = classification_report(y_test, y_pred_adaboost)
conf_matrix_adaboost = confusion_matrix(y_test, y_pred_adaboost)

# Print results for AdaBoost: each heading is followed by its metric
adaboost_summary = (
    '\nAdaBoost Model:',
    f'Accuracy: {accuracy_adaboost}',
    'Classification Report:',
    classification_report_output_adaboost,
    'Confusion Matrix:',
    conf_matrix_adaboost,
)
for summary_item in adaboost_summary:
    print(summary_item)
