# Task 1: Data Preparation

In [2]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder


In [5]:
df = pd.read_csv("Telco_Customer_Churn_Dataset  (3).csv")

FileNotFoundError: [Errno 2] No such file or directory: 'Telco_Customer_Churn_Dataset  (3).csv'

In [None]:
df.head()


In [None]:
df.shape


In [None]:
df.columns


In [None]:
df.info()


In [None]:
df.isnull().sum()


In [None]:
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')
df['TotalCharges'] = df['TotalCharges'].fillna(df['TotalCharges'].median())


In [None]:
df.isnull().sum()


In [None]:
df.drop('customerID', axis=1, inplace=True)


In [None]:
df['Churn'] = df['Churn'].map({'Yes': 1, 'No': 0})


In [None]:
df['Churn'].value_counts()


In [None]:
categorical_cols = df.select_dtypes(include=['object']).columns
categorical_cols


In [None]:
le = LabelEncoder()

for col in categorical_cols:
    df[col] = le.fit_transform(df[col])


In [None]:
X = df.drop('Churn', axis=1)
y = df['Churn']


In [None]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)


In [None]:
X_train.shape, X_test.shape


# Task 2: Exploratory Data Analysis (EDA)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

sns.set(style="whitegrid")


In [None]:
churn_rate = df['Churn'].mean()
churn_rate


In [None]:
plt.figure(figsize=(5,4))
sns.countplot(x='Churn', data=df)
plt.title("Overall Churn Distribution")
plt.xticks([0,1], ['No Churn', 'Churn'])
plt.show()


In [None]:
plt.figure(figsize=(5,4))
sns.countplot(x='gender', hue='Churn', data=df)
plt.title("Churn by Gender")
plt.xlabel("Gender (0 = Female, 1 = Male)")
plt.show()


In [None]:
plt.figure(figsize=(5,4))
sns.countplot(x='Partner', hue='Churn', data=df)
plt.title("Churn by Partner Status")
plt.xlabel("Partner (0 = No, 1 = Yes)")
plt.show()


In [None]:
plt.figure(figsize=(5,4))
sns.countplot(x='Dependents', hue='Churn', data=df)
plt.title("Churn by Dependents")
plt.xlabel("Dependents (0 = No, 1 = Yes)")
plt.show()


In [None]:
plt.figure(figsize=(6,4))
sns.histplot(df['tenure'], bins=30, kde=True)
plt.title("Customer Tenure Distribution")
plt.xlabel("Tenure (Months)")
plt.show()


In [None]:
plt.figure(figsize=(6,4))
sns.boxplot(x='Churn', y='tenure', data=df)
plt.title("Tenure vs Churn")
plt.xticks([0,1], ['No Churn', 'Churn'])
plt.show()


In [None]:
plt.figure(figsize=(6,4))
sns.countplot(x='Contract', hue='Churn', data=df)
plt.title("Churn by Contract Type")
plt.xlabel("Contract Type")
plt.show()


In [None]:
plt.figure(figsize=(8,4))
sns.countplot(x='PaymentMethod', hue='Churn', data=df)
plt.title("Churn by Payment Method")
plt.xticks(rotation=30)
plt.show()


In [None]:
df.groupby('Churn').mean()[['tenure', 'MonthlyCharges', 'TotalCharges']]


# Task 3: Customer Segmentation

In [None]:
segmentation_df = df[['tenure', 'MonthlyCharges', 'Contract', 'Churn']]
segmentation_df.head()


In [None]:
def tenure_segment(tenure):
    if tenure <= 12:
        return 'New Customer'
    elif tenure <= 48:
        return 'Mid-Term Customer'
    else:
        return 'Long-Term Customer'

df['TenureSegment'] = df['tenure'].apply(tenure_segment)


In [None]:
def charges_segment(charges):
    if charges <= 35:
        return 'Low Value'
    elif charges <= 70:
        return 'Medium Value'
    else:
        return 'High Value'

df['ChargesSegment'] = df['MonthlyCharges'].apply(charges_segment)


In [None]:
contract_map = {
    0: 'Month-to-Month',
    1: 'One Year',
    2: 'Two Year'
}

df['ContractType'] = df['Contract'].map(contract_map)


In [None]:
tenure_churn = df.groupby('TenureSegment')['Churn'].mean().reset_index()
tenure_churn


In [None]:
plt.figure(figsize=(6,4))
sns.barplot(x='TenureSegment', y='Churn', data=tenure_churn)
plt.title("Churn Rate by Tenure Segment")
plt.ylabel("Churn Rate")
plt.show()


In [None]:
charges_churn = df.groupby('ChargesSegment')['Churn'].mean().reset_index()
charges_churn


In [None]:
plt.figure(figsize=(6,4))
sns.barplot(x='ChargesSegment', y='Churn', data=charges_churn)
plt.title("Churn Rate by Customer Value Segment")
plt.ylabel("Churn Rate")
plt.show()


In [None]:
contract_churn = df.groupby('ContractType')['Churn'].mean().reset_index()
contract_churn


In [None]:
plt.figure(figsize=(6,4))
sns.barplot(x='ContractType', y='Churn', data=contract_churn)
plt.title("Churn Rate by Contract Type")
plt.ylabel("Churn Rate")
plt.show()


In [None]:
high_risk_customers = df[
    (df['ChargesSegment'] == 'High Value') &
    (df['ContractType'] == 'Month-to-Month') &
    (df['Churn'] == 1)
]

high_risk_customers.head()


In [None]:
high_risk_customers.shape[0]


In [None]:
segment_summary = df.groupby(
    ['TenureSegment', 'ChargesSegment', 'ContractType']
)['Churn'].mean().reset_index()

segment_summary.head(10)


# Task 4: Churn Prediction Model

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV


In [None]:
# Drop segmentation columns before modeling
X = df.drop(
    ['Churn', 'TenureSegment', 'ChargesSegment', 'ContractType'],
    axis=1
)

y = df['Churn']



In [None]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)


In [None]:
log_model = LogisticRegression(max_iter=1000)
log_model.fit(X_train, y_train)


In [None]:
y_pred_log = log_model.predict(X_test)

print("Logistic Regression Results")
print("Accuracy :", accuracy_score(y_test, y_pred_log))
print("Precision:", precision_score(y_test, y_pred_log))
print("Recall   :", recall_score(y_test, y_pred_log))
print("F1 Score :", f1_score(y_test, y_pred_log))


In [None]:
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(X_train, y_train)


In [None]:
y_pred_dt = dt_model.predict(X_test)

print("Decision Tree Results")
print(classification_report(y_test, y_pred_dt))


In [None]:
rf_model = RandomForestClassifier(
    n_estimators=100,
    random_state=42
)
rf_model.fit(X_train, y_train)


In [None]:
y_pred_rf = rf_model.predict(X_test)

print("Random Forest Results")
print(classification_report(y_test, y_pred_rf))


In [None]:
model_results = pd.DataFrame({
    'Model': ['Logistic Regression', 'Decision Tree', 'Random Forest'],
    'Accuracy': [
        accuracy_score(y_test, y_pred_log),
        accuracy_score(y_test, y_pred_dt),
        accuracy_score(y_test, y_pred_rf)
    ],
    'F1 Score': [
        f1_score(y_test, y_pred_log),
        f1_score(y_test, y_pred_dt),
        f1_score(y_test, y_pred_rf)
    ]
})

model_results


In [None]:
feature_importance = pd.DataFrame({
    'Feature': X.columns,
    'Importance': rf_model.feature_importances_
}).sort_values(by='Importance', ascending=False)

feature_importance.head(10)


In [None]:
top_features = feature_importance['Feature'].head(10)

X_train_top = X_train[top_features]
X_test_top = X_test[top_features]


In [None]:
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5]
}

grid_search = GridSearchCV(
    RandomForestClassifier(random_state=42),
    param_grid,
    cv=3,
    scoring='f1',
    n_jobs=-1
)

grid_search.fit(X_train_top, y_train)


In [None]:
best_rf = grid_search.best_estimator_

y_pred_best = best_rf.predict(X_test_top)

print("Tuned Random Forest Results")
print(classification_report(y_test, y_pred_best))


In [None]:
cm = confusion_matrix(y_test, y_pred_best)

sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.title("Confusion Matrix")
plt.show()


# Task 5: Model Evaluation and Interpretation

In [None]:
y_pred = best_rf.predict(X_test_top)


In [None]:
print("Final Model Evaluation")
print(classification_report(y_test, y_pred))


In [None]:
from sklearn.metrics import ConfusionMatrixDisplay

ConfusionMatrixDisplay.from_estimator(
    best_rf,
    X_test_top,
    y_test,
    cmap='Blues'
)
plt.title("Confusion Matrix – Random Forest")
plt.show()


In [None]:
feature_importance = pd.DataFrame({
    'Feature': X_train_top.columns,
    'Importance': best_rf.feature_importances_
}).sort_values(by='Importance', ascending=False)

feature_importance


In [None]:
plt.figure(figsize=(8,5))
sns.barplot(
    x='Importance',
    y='Feature',
    data=feature_importance
)
plt.title("Feature Importance – Churn Prediction")
plt.show()


In [None]:
coef_df = pd.DataFrame({
    'Feature': X.columns,
    'Coefficient': log_model.coef_[0]
}).sort_values(by='Coefficient', ascending=False)

coef_df


In [None]:
y_prob = best_rf.predict_proba(X_test_top)[:, 1]


In [None]:
from sklearn.metrics import roc_curve, roc_auc_score

fpr, tpr, thresholds = roc_curve(y_test, y_prob)
auc_score = roc_auc_score(y_test, y_prob)

plt.figure(figsize=(6,5))
plt.plot(fpr, tpr, label=f"AUC = {auc_score:.3f}")
plt.plot([0, 1], [0, 1], linestyle='--')
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve – Churn Prediction Model")
plt.legend()
plt.show()
