**Task 1 - Data Preprocessing**

In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Load dataset
telco = pd.read_csv('/content/Telco_Customer_Churn_Dataset.csv')

# Missing values per column.
# isnull() alone only detects real NaN values. Text columns may instead hold
# blank / whitespace-only strings (TotalCharges is parsed with errors='coerce'
# in the preprocessing cell for exactly this kind of entry), so those are
# counted as missing here as well. `telco` itself is not modified.
is_blank = telco.astype(str).apply(lambda col: col.str.strip().eq(''))
print("\nMissing value in each column: ")
(telco.isnull() | is_blank).sum()



In [None]:
# Preprocessing: numeric coercion, target encoding, one-hot encoding

# Parse TotalCharges as numbers; entries that cannot be parsed become NaN and
# are then replaced by the column median.
telco['TotalCharges'] = pd.to_numeric(telco['TotalCharges'], errors='coerce')
telco['TotalCharges'] = telco['TotalCharges'].fillna(telco['TotalCharges'].median())

# Encode the target column as integers (LabelEncoder is imported in the first cell).
le = LabelEncoder()
telco['Churn'] = le.fit_transform(telco['Churn'])

# Drop the identifier column before encoding. errors='ignore' keeps the cell
# re-runnable: on a second run the column is already gone.
telco = telco.drop(columns=['customerID'], errors='ignore')

# One-Hot Encode to all categorical variables
telco = pd.get_dummies(telco, drop_first=True)

# info() prints its own report and returns None, so wrapping it in print()
# would only add a stray "None" line.
telco.info()

# Cast every column to float64 so the frame is uniformly numeric.
telco = telco.astype('float64')

print(telco.dtypes)

print(telco.head())


**Task 2 - Split Data for Training and Testing**

In [None]:
from sklearn.model_selection import train_test_split

# Separate the predictors from the target, then hold out 20% of the rows.
X = telco.drop('Churn', axis=1)  # Features
y = telco['Churn']  # Target variable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

# Report the size of each partition rather than printing the full frames,
# which floods the output without showing whether the split is correct.
for name, part in [('X_train', X_train), ('X_test', X_test),
                   ('y_train', y_train), ('y_test', y_test)]:
    print(f"{name}: {part.shape}")

**Task 3 - Feature Selection**

In [None]:
# Correlation matrix of all columns in telco, drawn as an annotated heatmap
import matplotlib.pyplot as plt
import seaborn as sns

correlation_matrix = telco.corr()

# Explicit figure/axes pair; the heatmap is drawn onto `ax`.
fig, ax = plt.subplots(figsize=(15, 10))
sns.heatmap(
    correlation_matrix,
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    linewidths=.5,
    annot_kws={'size': 8},
    ax=ax,
)
ax.set_title('Correlation Matrix')
plt.show()

The correlation heatmap shows the following features as the most strongly correlated with `Churn`:

- `Contract_Two year` (-0.30): customers on two-year contracts are less likely to churn.
- `Contract_One year` (-0.20): similar effect, but weaker.
- `tenure` (-0.35): higher tenure = more loyal = lower churn.
- `OnlineSecurity_Yes` (-0.28): customers with online security are less likely to leave.
- `TechSupport_Yes` (-0.27): support availability reduces churn.
- `MonthlyCharges` (+0.19): higher charges are associated with slightly more churn (a weak effect).


In [5]:
# Feature subset chosen from the correlation analysis; used as the model inputs.
Selected_features = [
    'Contract_Two year',
    'Contract_One year',
    'tenure',
    'OnlineSecurity_Yes',
    'TechSupport_Yes',
    'MonthlyCharges',
]
y = telco['Churn']
X = telco.loc[:, Selected_features]

In [6]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows of X / y as the test set (fixed seed for repeatability).
split_parts = train_test_split(X, y, test_size=0.2, random_state=0)
X_train, X_test, y_train, y_test = split_parts

In [7]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling parameters from the training split only, then apply that
# same fitted scaler to both the training and the test split.
sc = StandardScaler()
sc.fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

***Task 4 - Model Selection***


- **Logistic Regression**: Chosen for interpretability and suitability for binary classification with scaled features.
- **Decision Tree**: Selected to capture non-linear relationships in features like `tenure` and `Contract`.
- **Random Forest**: Used for robustness and handling categorical features post one-hot encoding.
- **Gradient Boosting**: Included for its typically strong predictive performance on tabular data; note that it does not by itself correct for class imbalance.


**Task 5 - Model Training**

In [None]:
# Logistic Regression

from sklearn.linear_model import LogisticRegression

# fit() returns the estimator itself, so construction and training are chained.
Classifier = LogisticRegression(random_state=0).fit(X_train, y_train)
Classifier

LogisticRegression(random_state=0)

In [9]:
# Test-set outputs of the fitted `Classifier`: class probabilities and hard labels.
y_pred_proba = Classifier.predict_proba(X_test)  # one probability column per class
y_pred = Classifier.predict(X_test)


In [None]:
# Decision Tree
from sklearn.tree import DecisionTreeClassifier

# Bound to its own name so the logistic regression model held in `Classifier`
# is not overwritten; otherwise re-running the prediction cell would silently
# score this tree under the "Logistic Regression" label.
dt_classifier = DecisionTreeClassifier(criterion='entropy', random_state=0)
dt_classifier.fit(X_train, y_train)

In [None]:
# Random Forest
from sklearn.ensemble import RandomForestClassifier

# Bound to its own name so the logistic regression model held in `Classifier`
# is not overwritten; otherwise re-running the prediction cell would silently
# score this forest under the "Logistic Regression" label.
rf_classifier = RandomForestClassifier(n_estimators=10, criterion='entropy', random_state=0)
rf_classifier.fit(X_train, y_train)

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Gradient Boosting: build and train in one step (fit() returns the estimator).
gb_model = GradientBoostingClassifier(random_state=0).fit(X_train, y_train)
gb_model


**Task 6 - Model Evaluation**

In [None]:
# Logistic Regression

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score

# y_pred / y_pred_proba are the logistic regression outputs computed in the
# prediction cell. All metrics are computed first, then reported.
lr_accuracy = accuracy_score(y_test, y_pred)
lr_confusion = confusion_matrix(y_test, y_pred)
lr_report = classification_report(y_test, y_pred)
auc_score = roc_auc_score(y_test, y_pred_proba[:, 1])  # column 1 = second class

print("Accuracy:", lr_accuracy)

print("Confusion Matrix:\n", lr_confusion)

print("Classification Report:\n", lr_report)

print("AUC Score:\n", auc_score)


In [None]:
# Decision Tree

from sklearn.tree import DecisionTreeClassifier

# Train the tree, then score it on the held-out test split.
dt_classifier = DecisionTreeClassifier(criterion='entropy', random_state=0).fit(X_train, y_train)

dt_y_pred_proba = dt_classifier.predict_proba(X_test)
dt_y_pred = dt_classifier.predict(X_test)
auc_score_dt = roc_auc_score(y_test, dt_y_pred_proba[:, 1])  # column 1 = second class

# Each (label, value) pair is printed in the listed order.
dt_metrics = (
    ("Decision Tree - Accuracy:", accuracy_score(y_test, dt_y_pred)),
    ("Confusion Matrix:\n", confusion_matrix(y_test, dt_y_pred)),
    ("Classification Report:\n", classification_report(y_test, dt_y_pred)),
    ("Decision Tree AUC Score:", auc_score_dt),
)
for label, value in dt_metrics:
    print(label, value)


In [None]:
# Random Forest
from sklearn.ensemble import RandomForestClassifier

# Train a 10-tree forest, then score it on the held-out test split.
rf_classifier = RandomForestClassifier(
    n_estimators=10,
    criterion='entropy',
    random_state=0,
).fit(X_train, y_train)

rf_y_pred_proba = rf_classifier.predict_proba(X_test)
rf_y_pred = rf_classifier.predict(X_test)
auc_score_rf = roc_auc_score(y_test, rf_y_pred_proba[:, 1])  # column 1 = second class

# Each (label, value) pair is printed in the listed order.
rf_metrics = (
    ("Random Forest - Accuracy:", accuracy_score(y_test, rf_y_pred)),
    ("Confusion Matrix:\n", confusion_matrix(y_test, rf_y_pred)),
    ("Classification Report:\n", classification_report(y_test, rf_y_pred)),
    ("Random Forest AUC Score:", auc_score_rf),
)
for label, value in rf_metrics:
    print(label, value)


In [None]:
# Gradient Boosting

# Score the already-trained gb_model on the held-out test split.
y_pred_gb_proba = gb_model.predict_proba(X_test)  # Predicted probabilities
y_pred_gb = gb_model.predict(X_test)

gb_accuracy = accuracy_score(y_test, y_pred_gb)
gb_confusion = confusion_matrix(y_test, y_pred_gb)
gb_report = classification_report(y_test, y_pred_gb)
auc_score_gb = roc_auc_score(y_test, y_pred_gb_proba[:, 1])  # column 1 = second class

print("Gradient Boosting Classifier:")
print("Accuracy:", gb_accuracy)
print("Confusion Matrix:\n", gb_confusion)
print("Classification Report:\n", gb_report)
print("Gradient Boosting AUC Score:", auc_score_gb)

The table below summarises the evaluation metrics for all models used in this notebook.


### Model Evaluation Summary
| Model               | Accuracy | Class 1 Recall | Class 1 Precision | F1 (Class 1) | ROC-AUC |
|---------------------|----------|----------------|-------------------|--------------|---------|
| Logistic Regression | 0.791    | 0.49           | 0.63              | 0.55         | 0.818   |
| Decision Tree       | 0.717    | 0.41           | 0.45              | 0.43         | 0.631   |
| Random Forest       | 0.752    | 0.44           | 0.53              | 0.48         | 0.761   |
| Gradient Boosting   | 0.785    | 0.48           | 0.61              | 0.54         | 0.825   |


### Model Evaluation Insights
Logistic Regression (0.791 accuracy, 0.818 ROC-AUC) and Gradient Boosting (0.785 accuracy, 0.825 ROC-AUC) perform best, but their low Class 1 recall (0.49 and 0.48) shows that both struggle to identify churners, most likely because of class imbalance (~26% churn). The Decision Tree (0.717 accuracy, 0.631 ROC-AUC) performs worst, which suggests that the unpruned tree overfits the training data. Random Forest (0.752 accuracy, 0.761 ROC-AUC) falls in between. Resampling techniques such as SMOTE could improve recall on the churn class.
Thank You