In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [3]:
# Load the modelling input table; the path is relative to the notebook's
# working directory.
ML_READY_CSV = "../data/ml_ready_telco.csv"
df = pd.read_csv(ML_READY_CSV)

In [4]:
# Columns that must never be fed to the model: the target, the row identifier,
# and the two prediction columns that a later cell writes back onto `df`.
# Listing the prediction columns keeps this cell safe to re-run after scoring
# (otherwise the model's own outputs would leak into the feature matrix).
# On a fresh kernel those columns are absent and errors='ignore' skips them,
# so the resulting feature set is unchanged.
NON_FEATURE_COLUMNS = ['Churn', 'customerID', 'Churn_Predicted', 'Churn_Probability']

# Feature matrix: every remaining column of df.
X = df.drop(columns=NON_FEATURE_COLUMNS, errors='ignore')
# Target vector: the churn label column.
y = df['Churn']
print("Features shape:", X.shape)
print("Target shape:", y.shape)

Features shape: (7043, 20)
Target shape: (7043,)


In [5]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing (80% train). Stratifying on y keeps the
# class proportions of the target the same in both splits; the fixed seed
# makes the split reproducible.
TEST_FRACTION = 0.2
SPLIT_SEED = 42

split_parts = train_test_split(
    X, y, test_size=TEST_FRACTION, random_state=SPLIT_SEED, stratify=y
)
X_train, X_test, y_train, y_test = split_parts

print("✅ Data split complete!")

✅ Data split complete!


In [6]:
# Train a logistic regression classifier and evaluate it on the held-out test set.
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Initialize model.
# NOTE(review): max_iter is very large, presumably so the solver converges on
# features that are not standardized in this notebook — confirm, and consider
# scaling the inputs instead.
model = LogisticRegression(max_iter=200000)

# Train model on the training split only.
model.fit(X_train, y_train)

# Predict hard class labels for the test split.
y_pred = model.predict(X_test)

# Evaluate.
# accuracy_score returns a fraction in [0, 1]. The "%" presentation type
# multiplies by 100 and appends the percent sign, so 0.81 is reported as
# "81.00%" rather than the misleading "0.81%".
test_accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {test_accuracy:.2%}")
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Accuracy:0.81%

Confusion Matrix:
 [[935 100]
 [174 200]]

Classification Report:
               precision    recall  f1-score   support

           0       0.84      0.90      0.87      1035
           1       0.67      0.53      0.59       374

    accuracy                           0.81      1409
   macro avg       0.75      0.72      0.73      1409
weighted avg       0.80      0.81      0.80      1409



In [7]:
# Step 4️⃣: Feature Importance / Coefficients
import numpy as np

# Pair each input column with its fitted logistic-regression weight.
# model.coef_[0] is the single row of weights of the fitted model.
# NOTE(review): the features do not appear to be standardized in this
# notebook, so coefficient magnitudes may not be directly comparable across
# columns — confirm before reading them as importance.
coefficient_table = pd.DataFrame({
    'Feature': X.columns,
    'Coefficient': model.coef_[0]
})

# Largest positive weight first, rounded to two decimals for display.
coefficients = coefficient_table.sort_values(by='Coefficient', ascending=False).round(2)

coefficients


Unnamed: 0,Feature,Coefficient
15,PaperlessBilling,0.4
7,InternetService,0.25
1,SeniorCitizen,0.17
6,MultipleLines,0.14
2,Partner,0.07
16,PaymentMethod,0.07
17,MonthlyCharges,0.06
13,StreamingMovies,0.03
12,StreamingTV,0.03
0,gender,0.01


In [12]:
# Score every customer with the fitted model and export the result for Power BI.
# Note: X contains the rows the model was trained on as well as the test rows,
# so most of these predictions are in-sample, not held-out.
CHURN_LABELS = {1: 'Yes', 0: 'No'}

# Hard 0/1 prediction per customer (freshly computed on every run).
df['Churn_Predicted'] = model.predict(X)

# (Optional) Add churn probability too — more useful for dashboard.
# Column 1 of predict_proba is the probability of the positive class (label 1).
df['Churn_Probability'] = model.predict_proba(X)[:, 1]

# Convert the 0/1 labels to 'Yes'/'No' for the dashboard. The dtype guard makes
# this cell safe to re-run: once 'Churn' already holds strings, mapping it
# through CHURN_LABELS again would turn every value into NaN.
if pd.api.types.is_numeric_dtype(df['Churn']):
    df['Churn'] = df['Churn'].map(CHURN_LABELS)
df['Churn_Predicted'] = df['Churn_Predicted'].map(CHURN_LABELS)

# Save to CSV for Power BI
df.to_csv("../data/churn_predicted_full.csv", index=False)

print("✅ churn_predicted_full.csv saved successfully!")
print(df[['Churn', 'Churn_Predicted', 'Churn_Probability']].head(10))

✅ churn_predicted_full.csv saved successfully!
  Churn Churn_Predicted  Churn_Probability
0    No             Yes           0.609206
1    No              No           0.048117
2   Yes              No           0.353702
3    No              No           0.034834
4   Yes             Yes           0.685912
5   Yes             Yes           0.748794
6    No              No           0.481317
7    No              No           0.277322
8   Yes             Yes           0.538418
9    No              No           0.011963


In [11]:
# Show the first two rows of df as a quick sanity check.
df.head(n=2)

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn,AVGMonthlySpend,Churn_Predicted,Churn_Probability
0,7590-VHVEG,0,0,1,0,1,0,1,0,0,...,0,0,1,2,29.85,29.85,0,14.925,1,0.609206
1,5575-GNVDE,1,0,0,0,34,1,0,0,2,...,0,1,0,3,56.95,1889.5,0,53.985714,0,0.048117


In [28]:
# Attach the model outputs to the original (human-readable) Telco table and
# save the combined file.
df2 = pd.read_csv("../data/churn_predicted_full.csv")   # scored, encoded table
df3 = pd.read_csv("../data/telco_customer_churn.csv")   # original table
print(df2.shape)
print(df3.shape)

# Columns carried over from the scored table, in the order they are appended.
PREDICTION_COLUMNS = ['Churn_Predicted', 'Churn_Probability', 'AVGMonthlySpend']

# Join on customerID instead of copying columns by row position: positional
# assignment silently attaches predictions to the wrong customers if the two
# files differ in row order or row count. how='left' keeps df3's rows and
# order; validate='one_to_one' raises if customerID is duplicated on either
# side instead of silently multiplying rows.
df3 = df3.merge(
    df2[['customerID'] + PREDICTION_COLUMNS],
    on='customerID',
    how='left',
    validate='one_to_one',
)

# NOTE(review): this file is written to the working directory, unlike the
# other CSVs which live under ../data — confirm this is intended.
df3.to_csv('final_telco_churn.csv', index=False)

(7043, 24)
(7043, 21)


In [None]:
# Show the first two rows of the scored table that was read back from disk.
df2.head(n=2)


Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn,AVGMonthlySpend,Churn_Predicted,Churn_Probability
0,7590-VHVEG,0,0,1,0,1,0,1,0,0,...,0,0,1,2,29.85,29.85,No,14.925,Yes,0.609206
1,5575-GNVDE,1,0,0,0,34,1,0,0,2,...,0,1,0,3,56.95,1889.5,No,53.985714,No,0.048117


In [29]:
# Show the first two rows of the combined table to check the added columns.
df3.head(n=2)

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn,Churn_Predicted,Churn_Probability,AVGMonthlySpend
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,Month-to-month,Yes,Electronic check,29.85,29.85,No,Yes,0.609206,14.925
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,No,One year,No,Mailed check,56.95,1889.5,No,No,0.048117,53.985714
