In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [None]:
# Read the raw churn CSV; `df` is the working frame used by the cells below.
data = pd.read_csv(filepath_or_buffer="Telco-Customer-Churn.csv")
df = pd.DataFrame(data=data)

In [None]:
# Inspect the data before cleaning it for the model: first five rows.
df.head(n=5)

In [None]:
# Last five rows of the frame.
df.tail(n=5)

In [None]:
# Print the column summary, then show the number of missing values per column.
df.info()
df.isna().sum()


In [None]:
# Summary statistics produced by DataFrame.describe().
df.describe()

In [None]:
# Show the frame as the cell output.
df

In [None]:
# Parse TotalCharges as numbers; errors="coerce" turns unparseable entries into NaN.
df["TotalCharges"] = df["TotalCharges"].pipe(pd.to_numeric, errors="coerce")

In [None]:
# Missing values per column after the numeric conversion of TotalCharges.
df.isna().sum()

In [None]:
# Fill missing TotalCharges with MonthlyCharges * tenure taken from the same row.
df["TotalCharges"] = df["TotalCharges"].fillna(
    df["MonthlyCharges"].mul(df["tenure"])
)


In [None]:
# Show the frame after the TotalCharges fill.
df

In [None]:
# Missing values per column after filling TotalCharges.
df.isna().sum()

In [None]:
# First five rows of the cleaned frame.
df.head(n=5)

In [None]:

# Convert every Yes/No column to 1/0.
#
# Series.map returns NaN for any value missing from the mapping, so mapping a
# column that has already been encoded (e.g. when this cell is re-run) would wipe
# it out. Columns that are already numeric are therefore skipped, and any label
# that the mapping does not know raises instead of silently becoming NaN.
binary_cols = ['Partner', 'Dependents', 'PhoneService', 'PaperlessBilling', 'Churn']
service_cols = ['MultipleLines', 'OnlineSecurity', 'OnlineBackup',
                'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies']

yes_no_map = {'Yes': 1, 'No': 0}
# For the service columns "No phone service" / "No internet service" count as "No".
service_map = {**yes_no_map, 'No phone service': 0, 'No internet service': 0}

for cols, mapping in ((binary_cols, yes_no_map), (service_cols, service_map)):
    for col in cols:
        if pd.api.types.is_numeric_dtype(df[col]):
            continue  # already encoded by an earlier run of this cell
        encoded = df[col].map(mapping)
        # Values that were present but are not covered by the mapping.
        unmapped = encoded.isna() & df[col].notna()
        assert not unmapped.any(), (
            f"Unexpected values in {col}: {df.loc[unmapped, col].unique().tolist()}"
        )
        df[col] = encoded

# encoding categorical values after train test split
categorical_col = ['gender', 'Contract', 'InternetService', 'PaymentMethod']



In [None]:
# customerID is not used as a model feature, so remove it.
# errors="ignore" keeps the cell re-runnable: a second run no longer raises
# KeyError because the column is already gone.
df = df.drop(columns=["customerID"], errors="ignore")


In [None]:
#feature engineering
# Total charges spread over tenure; the +1 keeps the denominator non-zero when tenure is 0.
df['ChargesPerMonth'] = df['TotalCharges'] / (df['tenure'] + 1)

# Number of add-on services (0-3) among OnlineSecurity, DeviceProtection and TechSupport.
# These columns are already encoded as 1/0 at this point, so comparing them with
# the string 'Yes' always produced 0; add up the 1/0 flags instead.
addon_service_cols = ['OnlineSecurity', 'DeviceProtection', 'TechSupport']
df['HasMultipleServices'] = df[addon_service_cols].sum(axis=1).astype(int)



In [None]:
# Show the frame with the engineered columns added.
df

In [None]:
# Bar chart of the target's class counts over the whole frame.
# grid takes a boolean; the string "true" only worked because any non-empty
# string is truthy (so "false" would have enabled the grid as well).
ax = df["Churn"].value_counts().sort_index().plot.bar(grid=True)
ax.set_title("Churn distribution")
ax.set_xlabel("churn")
ax.set_ylabel("numbers of instances")
plt.show()


In [None]:
# Split into features/target, then hold out 20% for testing.
# The split is stratified on the target and uses a fixed seed.
x = df.drop(columns=['Churn'])
y = df['Churn']

x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)


In [None]:
# Bar chart of the target's class counts in the test split.
# grid takes a boolean; the string "true" only worked because any non-empty
# string is truthy.
ax = y_test.value_counts().sort_index().plot.bar(grid=True)
ax.set_title("Churn distribution")
ax.set_xlabel("churn")
ax.set_ylabel("numbers of instances")
plt.show()

In [None]:
# One-hot encode the categorical columns.
# The encoder is fitted on the training split only and then applied to both
# splits; drop='first' omits the first category of each column.
Encoder = OneHotEncoder(drop='first', sparse_output=False)
x_train_encoded = Encoder.fit_transform(x_train[categorical_col])
x_test_encoded = Encoder.transform(x_test[categorical_col])

# Wrap the encoded arrays in frames that keep the original row index.
encoded_col = Encoder.get_feature_names_out(categorical_col)
x_train_ohe = pd.DataFrame(x_train_encoded, index=x_train.index, columns=encoded_col)
x_test_ohe = pd.DataFrame(x_test_encoded, index=x_test.index, columns=encoded_col)

# Replace the raw categorical columns with their one-hot counterparts.
X_train = pd.concat([x_train.drop(columns=categorical_col), x_train_ohe], axis=1)
X_test = pd.concat([x_test.drop(columns=categorical_col), x_test_ohe], axis=1)



In [None]:
# Show the training features after one-hot encoding.
X_train

In [None]:
# Column names of the encoded training features.
list(X_train.columns)



In [None]:
# Standardise the continuous columns.
# The scaler is fitted on the training split only and reused for the test split.
scaler = StandardScaler()
numeric_cols = ['tenure', 'TotalCharges', 'MonthlyCharges', 'ChargesPerMonth']
X_train[numeric_cols] = scaler.fit_transform(X_train[numeric_cols])
X_test[numeric_cols] = scaler.transform(X_test[numeric_cols])



In [None]:
# Column names of the training features after scaling.
list(X_train.columns)

In [None]:
# Show the training features after scaling.
X_train

In [None]:
# Show the test features after scaling.
X_test

In [None]:
# Class counts of the target in the training split.
y_train.value_counts()

In [None]:
# Both splits must expose the same feature columns in the same order.
# Comparing only the number of columns would miss renamed or reordered features.
assert list(X_train.columns) == list(X_test.columns), "Feature mismatch!"
print(f"Training set: {X_train.shape}")
print(f"Test set: {X_test.shape}")

In [None]:
# now training model with data
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report,root_mean_squared_error, precision_score, recall_score, f1_score
from sklearn.metrics import roc_auc_score
from xgboost import XGBClassifier

# Four candidate classifiers. random_state is pinned on the estimators that
# accept one (42, the same seed used for train_test_split) so that re-running
# the notebook reproduces the same metrics.
model1 = LogisticRegression(max_iter=1000, class_weight='balanced')
model2 = RandomForestClassifier(n_estimators=300,
                                max_depth=10,
                                min_samples_split=5,
                                class_weight='balanced',
                                random_state=42)
model3 = DecisionTreeClassifier(random_state=42)
model4 = XGBClassifier(eval_metric='logloss', random_state=42)

# Order matters: it defines which model feeds pred1..pred4 and the metrics below.
candidate_models = (model1, model2, model3, model4)
for clf in candidate_models:
    clf.fit(X_train, y_train)

# Positive-class probabilities of the logistic regression, plus predictions at a
# 0.45 threshold. `pred` is not part of the metrics below; `probs` is used for
# the ROC curve.
probs = model1.predict_proba(X_test)[:, 1]
pred = (probs >= 0.45).astype(int)

# Default-threshold predictions of every model on the test split.
pred1, pred2, pred3, pred4 = (clf.predict(X_test) for clf in candidate_models)
all_preds = (pred1, pred2, pred3, pred4)

acc1, acc2, acc3, acc4 = (accuracy_score(y_test, p) for p in all_preds)
# The original format string had a stray "[" after dc_acc.
print(f"logi_acc{acc1},random_acc{acc2},dc_acc{acc3},xgb_acc{acc4}")

precision1, precision2, precision3, precision4 = (precision_score(y_test, p) for p in all_preds)
print(f"logi_precision{precision1},random_precision{precision2},dc_precision{precision3},xgb_precision{precision4}")

recall1, recall2, recall3, recall4 = (recall_score(y_test, p) for p in all_preds)
print(f"logi_recall{recall1},random_recall{recall2},dc_recall{recall3},xgb_recall{recall4}")

f11, f12, f13, f14 = (f1_score(y_test, p) for p in all_preds)
print(f"logi_f1{f11},random_f1{f12},dc_f1{f13},xgb_f1{f14}")

# ROC AUC is computed from the positive-class probabilities, not the hard labels.
roc1, roc2, roc3, roc4 = (
    roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1]) for clf in candidate_models
)
print("model1_roc",roc1)
print("model2_roc",roc2)
print("model3_roc",roc3)
print("model4_roc",roc4)

In [None]:

import matplotlib.pyplot as plt
from sklearn.metrics import ConfusionMatrixDisplay, RocCurveDisplay

# Confusion matrix of the logistic-regression predictions (pred1) on the test split.
fig, ax = plt.subplots(figsize=(6, 5))
ConfusionMatrixDisplay.from_predictions(y_test, pred1, cmap='Blues', ax=ax)
ax.set_title("Confusion Matrix")
plt.show()

# ROC curve built from the logistic-regression probabilities (probs).
# The dashed grey diagonal is drawn as a reference line.
fig, ax = plt.subplots(figsize=(6, 5))
RocCurveDisplay.from_predictions(y_test, probs, ax=ax)
ax.set_title("ROC Curve")
ax.plot([0, 1], [0, 1], '--', color='grey')
plt.show()


In [None]:
import joblib

# Persist the fitted artefacts: the logistic-regression model, the scaler and
# the one-hot encoder.
joblib.dump(value=model1, filename="model.pkl")
joblib.dump(value=scaler, filename="scaler.pkl")
joblib.dump(value=Encoder, filename="encoder.pkl")


In [None]:
# Persist the processed test features and their labels.
joblib.dump(value=X_test, filename="X_test.pkl")
joblib.dump(value=y_test, filename="y_test.pkl")
