In [8]:
import pandas as pd
df = pd.read_csv("cleaned_churn_data.csv")
df

Unnamed: 0,Age,Gender,Tenure,Usage Frequency,Support Calls,Payment Delay,Subscription Type,Contract Length,Total Spend,Last Interaction,Churn,Spend,Spend_zscore
0,22.0,0,25.0,14,4,27.0,0,1,598.0,9,1,Low,0.222105
1,41.0,0,28.0,28,7,13.0,2,1,584.0,20,0,Low,0.176779
2,47.0,1,27.0,10,2,29.0,1,0,757.0,21,0,Medium,0.830634
3,35.0,1,9.0,12,5,17.0,1,2,232.0,18,0,Low,-1.209320
4,53.0,0,58.0,24,9,2.0,2,0,533.0,18,0,Low,-0.020179
...,...,...,...,...,...,...,...,...,...,...,...,...,...
64369,45.0,0,33.0,12,6,21.0,0,2,947.0,14,1,Medium,1.573050
64370,37.0,1,6.0,1,5,22.0,2,0,923.0,9,1,Medium,1.485970
64371,25.0,1,39.0,14,8,30.0,1,1,327.0,20,1,Low,-0.840185
64372,50.0,0,18.0,19,7,22.0,2,1,540.0,13,1,Low,0.006854


In [9]:
# training before poly
import statsmodels.api as sm

y = df['Churn']
X = df.drop(columns=['Churn', 'Spend'])
X = X.apply(pd.to_numeric, errors='coerce')
X = sm.add_constant(X)

logit_model = sm.Logit(y, X).fit()
print(logit_model.summary())


Optimization terminated successfully.
         Current function value: 0.400352
         Iterations 7
                           Logit Regression Results                           
Dep. Variable:                  Churn   No. Observations:                64374
Model:                          Logit   Df Residuals:                    64362
Method:                           MLE   Df Model:                           11
Date:                Sat, 02 Aug 2025   Pseudo R-squ.:                  0.4213
Time:                        17:04:49   Log-Likelihood:                -25772.
converged:                       True   LL-Null:                       -44531.
Covariance Type:            nonrobust   LLR p-value:                     0.000
                        coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------------
const                 3.1582      3.121      1.012      0.312      -2.960       9.276
Age     

In [10]:
from sklearn.metrics import accuracy_score

y_pred_probs = logit_model.predict(X)
y_pred_labels = (y_pred_probs > 0.5).astype(int)
accuracy = accuracy_score(y, y_pred_labels)
print(f"Accuracy: {accuracy * 100:.2f}%")

Accuracy: 82.24%


In [11]:
from sklearn.preprocessing import PolynomialFeatures

poly = PolynomialFeatures(degree=2, include_bias=False)
num_cols = ['Age', 'Tenure', 'Payment Delay']
poly_features = poly.fit_transform(df[num_cols])
poly_df = pd.DataFrame(poly_features, columns=poly.get_feature_names_out(num_cols))

df = pd.concat([df.reset_index(drop=True), poly_df], axis=1)
df

Unnamed: 0,Age,Gender,Tenure,Usage Frequency,Support Calls,Payment Delay,Subscription Type,Contract Length,Total Spend,Last Interaction,...,Spend_zscore,Age.1,Tenure.1,Payment Delay.1,Age^2,Age Tenure,Age Payment Delay,Tenure^2,Tenure Payment Delay,Payment Delay^2
0,22.0,0,25.0,14,4,27.0,0,1,598.0,9,...,0.222105,22.0,25.0,27.0,484.0,550.0,594.0,625.0,675.0,729.0
1,41.0,0,28.0,28,7,13.0,2,1,584.0,20,...,0.176779,41.0,28.0,13.0,1681.0,1148.0,533.0,784.0,364.0,169.0
2,47.0,1,27.0,10,2,29.0,1,0,757.0,21,...,0.830634,47.0,27.0,29.0,2209.0,1269.0,1363.0,729.0,783.0,841.0
3,35.0,1,9.0,12,5,17.0,1,2,232.0,18,...,-1.209320,35.0,9.0,17.0,1225.0,315.0,595.0,81.0,153.0,289.0
4,53.0,0,58.0,24,9,2.0,2,0,533.0,18,...,-0.020179,53.0,58.0,2.0,2809.0,3074.0,106.0,3364.0,116.0,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
64369,45.0,0,33.0,12,6,21.0,0,2,947.0,14,...,1.573050,45.0,33.0,21.0,2025.0,1485.0,945.0,1089.0,693.0,441.0
64370,37.0,1,6.0,1,5,22.0,2,0,923.0,9,...,1.485970,37.0,6.0,22.0,1369.0,222.0,814.0,36.0,132.0,484.0
64371,25.0,1,39.0,14,8,30.0,1,1,327.0,20,...,-0.840185,25.0,39.0,30.0,625.0,975.0,750.0,1521.0,1170.0,900.0
64372,50.0,0,18.0,19,7,22.0,2,1,540.0,13,...,0.006854,50.0,18.0,22.0,2500.0,900.0,1100.0,324.0,396.0,484.0


In [12]:
correlation_matrix = df.corr(numeric_only=True)
correlation_matrix['Churn'].sort_values(ascending=False)

Churn                   1.000000
Payment Delay           0.551988
Payment Delay           0.551988
Payment Delay^2         0.541521
Tenure Payment Delay    0.513790
Age Payment Delay       0.469073
Support Calls           0.304631
Tenure                  0.192083
Tenure                  0.192083
Age Tenure              0.185867
Tenure^2                0.171232
Age^2                   0.066217
Age                     0.062874
Age                     0.062874
Last Interaction       -0.002818
Subscription Type      -0.007738
Contract Length        -0.017098
Spend_zscore           -0.077340
Total Spend            -0.077400
Usage Frequency        -0.115098
Gender                 -0.164549
Name: Churn, dtype: float64

In [13]:
# training after adding poly
import statsmodels.api as sm

y = df['Churn']
X = df.drop(columns=['Churn', 'Spend'])
X = X.apply(pd.to_numeric, errors='coerce')
X = sm.add_constant(X)

logit_model = sm.Logit(y, X).fit()
print(logit_model.summary())

Optimization terminated successfully.
         Current function value: 0.383023
         Iterations 8
                           Logit Regression Results                           
Dep. Variable:                  Churn   No. Observations:                64374
Model:                          Logit   Df Residuals:                    64356
Method:                           MLE   Df Model:                           17
Date:                Sat, 02 Aug 2025   Pseudo R-squ.:                  0.4463
Time:                        17:04:58   Log-Likelihood:                -24657.
converged:                       True   LL-Null:                       -44531.
Covariance Type:            nonrobust   LLR p-value:                     0.000
                           coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------------------------
const                    2.3921      2.104      1.137      0.255      -1.731       6.515

In [14]:
from sklearn.metrics import accuracy_score

y_pred_probs = logit_model.predict(X)
y_pred_labels = (y_pred_probs > 0.5).astype(int)
accuracy = accuracy_score(y, y_pred_labels)
print(f"Accuracy: {accuracy * 100:.2f}%")

Accuracy: 82.69%
