In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score
import joblib

In [2]:

df = pd.read_csv("D:/New download/Telco-Customer-Churn.csv")
print("Initial shape:", df.shape)

Initial shape: (7043, 21)


In [3]:
duplicates = df.duplicated().sum()
print("Duplicate rows:", duplicates)
df.drop_duplicates(inplace=True)

Duplicate rows: 0


In [4]:
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')

In [5]:
missing_total = df['TotalCharges'].isnull().sum()
print("Missing TotalCharges:", missing_total)
df['TotalCharges'].fillna(df['TotalCharges'].median(), inplace=True)

Missing TotalCharges: 11


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['TotalCharges'].fillna(df['TotalCharges'].median(), inplace=True)


In [6]:
df.drop('customerID', axis=1, inplace=True)

In [7]:
df['Churn'] = df['Churn'].map({'Yes': 1, 'No': 0})

In [8]:
yes_no_cols = [
    'Partner','Dependents','PhoneService','PaperlessBilling',
    'OnlineSecurity','OnlineBackup','DeviceProtection',
    'TechSupport','StreamingTV','StreamingMovies'
]

for col in yes_no_cols:
    df[col] = df[col].map({'Yes': 1, 'No': 0})

df['gender'] = df['gender'].map({'Male': 1, 'Female': 0})

df['contract_risk'] = df['Contract'].map({'Month-to-month': 3, 'One year': 2, 'Two year': 1})

df['payment_risk'] = df['PaymentMethod'].apply(lambda x: 2 if x == 'Electronic check' else 1)

service_cols = [
    'OnlineSecurity','OnlineBackup','DeviceProtection',
    'TechSupport','StreamingTV','StreamingMovies'
]
df['total_services'] = df[service_cols].sum(axis=1)

df.drop(['Contract','PaymentMethod'], axis=1, inplace=True)

df = pd.get_dummies(df, drop_first=True)

In [9]:

X = df.drop('Churn', axis=1)
y = df['Churn']

feature_columns = X.columns.tolist()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

In [10]:

num_cols = ['tenure','MonthlyCharges','TotalCharges']
scaler = StandardScaler()

X_train[num_cols] = scaler.fit_transform(X_train[num_cols])
X_test[num_cols] = scaler.transform(X_test[num_cols])

In [11]:
model = RandomForestClassifier(n_estimators=200, max_depth=12, min_samples_leaf=5, class_weight='balanced', random_state=42, n_jobs=-1)

model.fit(X_train, y_train)

In [12]:
y_pred = model.predict(X_test)
y_prob = model.predict_proba(X_test)[:,1]

print("Classification Report:\n", classification_report(y_test, y_pred))
print("ROC-AUC Score:", roc_auc_score(y_test, y_prob))


Classification Report:
               precision    recall  f1-score   support

           0       0.89      0.78      0.83      1035
           1       0.55      0.75      0.63       374

    accuracy                           0.77      1409
   macro avg       0.72      0.76      0.73      1409
weighted avg       0.80      0.77      0.78      1409

ROC-AUC Score: 0.8413521403291224


In [13]:

joblib.dump(model, "churn_model.pkl")
joblib.dump(scaler, "scaler.pkl")
joblib.dump(feature_columns, "feature_columns.pkl")

print("All artifacts saved: churn_model.pkl, scaler.pkl, feature_columns.pkl")

All artifacts saved: churn_model.pkl, scaler.pkl, feature_columns.pkl


In [16]:
import pandas as pd
import joblib

model = joblib.load("churn_model.pkl")
scaler = joblib.load("scaler.pkl")
feature_columns = joblib.load("feature_columns.pkl")

def preprocess_input(data):
    df = pd.DataFrame([data])

    yes_no_cols = [
        'Partner','Dependents','PhoneService','PaperlessBilling',
        'OnlineSecurity','OnlineBackup','DeviceProtection',
        'TechSupport','StreamingTV','StreamingMovies'
    ]
    for col in yes_no_cols:
        df[col] = df[col].map({'Yes': 1, 'No': 0})

    df['gender'] = df['gender'].map({'Male': 1, 'Female': 0})

    contract_map = {'Month-to-month': 3, 'One year': 2, 'Two year': 1}
    df['contract_risk'] = df['Contract'].map(contract_map)

    df['payment_risk'] = df['PaymentMethod'].apply(
        lambda x: 2 if x == 'Electronic check' else 1
    )

    service_cols = [
        'OnlineSecurity','OnlineBackup','DeviceProtection',
        'TechSupport','StreamingTV','StreamingMovies'
    ]
    df['total_services'] = df[service_cols].sum(axis=1)

    df.drop(['Contract','PaymentMethod'], axis=1, inplace=True)

    df = pd.get_dummies(df)

    df = df.reindex(columns=feature_columns, fill_value=0)

    num_cols = ['tenure','MonthlyCharges','TotalCharges']
    df[num_cols] = scaler.transform(df[num_cols])

    return df

def risk_level(prob):
    if prob < 0.3:
        return "Low Risk", "No Action"
    elif prob < 0.6:
        return "Medium Risk", "Send Retention Email / Offer Discount"
    else:
        return "High Risk", "Call Retention Team / Premium Offer"

In [17]:
sample_customer = {
    "gender": "Female",
    "SeniorCitizen": 0,
    "Partner": "No",
    "Dependents": "No",
    "tenure": 12,
    "PhoneService": "Yes",
    "PaperlessBilling": "Yes",
    "Contract": "Month-to-month",
    "PaymentMethod": "Electronic check",
    "OnlineSecurity": "No",
    "OnlineBackup": "Yes",
    "DeviceProtection": "No",
    "TechSupport": "No",
    "StreamingTV": "Yes",
    "StreamingMovies": "No",
    "MonthlyCharges": 85.5,
    "TotalCharges": 1026.5
}

processed_input = preprocess_input(sample_customer)

churn_prob = model.predict_proba(processed_input)[0][1]
prediction = "Will Churn" if churn_prob > 0.5 else "Will Stay"

level, action = risk_level(churn_prob)

print(f"Prediction: {prediction}")
print(f"Churn Probability: {round(churn_prob,2)}")
print(f"Risk Level: {level}")
print(f"Suggested Action: {action}")

Prediction: Will Stay
Churn Probability: 0.47
Risk Level: Medium Risk
Suggested Action: Send Retention Email / Offer Discount


In [19]:
high_risk_customer = {
    "gender": "Male",
    "SeniorCitizen": 1,
    "Partner": "No",
    "Dependents": "No",
    "tenure": 3,
    "PhoneService": "Yes",
    "PaperlessBilling": "Yes",
    "Contract": "Month-to-month",
    "PaymentMethod": "Electronic check",
    "OnlineSecurity": "No",
    "OnlineBackup": "No",
    "DeviceProtection": "No",
    "TechSupport": "No",
    "StreamingTV": "Yes",
    "StreamingMovies": "Yes",
    "MonthlyCharges": 99.9,
    "TotalCharges": 299.7
}

processed_input2 = preprocess_input(high_risk_customer)
churn_prob2 = model.predict_proba(processed_input2)[0][1]
prediction2 = "Will Churn" if churn_prob2 > 0.5 else "Will Stay"
level2, action2 = risk_level(churn_prob2)

print(f"Prediction: {prediction2}")
print(f"Churn Probability: {round(churn_prob2,2)}")
print(f"Risk Level: {level2}")
print(f"Suggested Action: {action2}")

Prediction: Will Churn
Churn Probability: 0.69
Risk Level: High Risk
Suggested Action: Call Retention Team / Premium Offer
