In [41]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import warnings
# NOTE(review): this suppresses every warning for the rest of the session,
# including any raised later by pandas or statsmodels — confirm that is intended.
warnings.filterwarnings(action='ignore')

# Load the dataset; the first CSV column is used as the row index.
DATA_PATH = "Data Folder/bigml_59c28831336c6604c800002a.csv"
df = pd.read_csv(DATA_PATH, index_col=0, encoding='utf-8')


In [42]:
# Quick structural overview of the loaded frame: columns, dtypes, shape, sample rows.

# Check column names (features)
print("Column names:")
print(df.columns.tolist())

# Basic info about the dataset.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
print("\nDataset info:")
df.info()

# Shape of the dataset (rows, columns)
print(f"\nDataset shape: {df.shape}")

# First few rows
print("\nFirst 5 rows:")
print(df.head())

Column names:
['account length', 'area code', 'phone number', 'international plan', 'voice mail plan', 'number vmail messages', 'total day minutes', 'total day calls', 'total day charge', 'total eve minutes', 'total eve calls', 'total eve charge', 'total night minutes', 'total night calls', 'total night charge', 'total intl minutes', 'total intl calls', 'total intl charge', 'customer service calls', 'churn']

Dataset info:
<class 'pandas.core.frame.DataFrame'>
Index: 3333 entries, KS to TN
Data columns (total 20 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   account length          3333 non-null   int64  
 1   area code               3333 non-null   int64  
 2   phone number            3333 non-null   object 
 3   international plan      3333 non-null   object 
 4   voice mail plan         3333 non-null   object 
 5   number vmail messages   3333 non-null   int64  
 6   total day minutes       3333 non-null   flo

In [43]:
# Numeric columns considered as candidate churn predictors.
# Each usage period contributes the same three metrics, in this order.
usage_periods = ('day', 'eve', 'night', 'intl')
usage_metrics = ('minutes', 'calls', 'charge')

features = ['account length']
features += [
    f'total {period} {metric}'
    for period in usage_periods
    for metric in usage_metrics
]
features += ['customer service calls', 'number vmail messages']


# Create the modeling dataset and drop missing values

In [44]:
# Clean the data and derive the binary target used by the models below.
#
# Missing values are dropped BEFORE the cast: astype(int) raises on NaN, so
# converting first would fail on exactly the rows that are about to be removed.
# dropna()/assign() build a new frame instead of mutating in place, which keeps
# this cell safe to re-run.
df = df.dropna().assign(
    # Convert churn to binary (1 for True, 0 for False)
    churn_binary=lambda frame: frame['churn'].astype(int)
)

print(f"Dataset shape after cleaning: {df.shape}")
print("\nChurn distribution:")
print(df['churn'].value_counts())
print(f"Churn rate: {df['churn_binary'].mean():.3f}")

# Display first few rows
print("\nFirst 5 rows:")
print(df.head())

Dataset shape after cleaning: (3333, 21)

Churn distribution:
False    2850
True      483
Name: churn, dtype: int64
Churn rate: 0.145

First 5 rows:
       account length  area code phone number international plan  \
state                                                              
KS                128        415     382-4657                 no   
OH                107        415     371-7191                 no   
NJ                137        415     358-1921                 no   
OH                 84        408     375-9999                yes   
OK                 75        415     330-6626                yes   

      voice mail plan  number vmail messages  total day minutes  \
state                                                             
KS                yes                     25              265.1   
OH                yes                     26              161.6   
NJ                 no                      0              243.4   
OH                 no                  

In [47]:
# CORRELATION ANALYSIS
# Ranks every candidate feature by the absolute value of its correlation
# with churn_binary and picks the top one for the single-predictor model.

section_rule = "=" * 60
print(f"\n{section_rule}")
print("FINDING BEST PREDICTOR FOR SIMPLE MODEL")
print(section_rule)

# Correlation of each column with the target, strongest (by magnitude) first
corr_columns = [*features, 'churn_binary']
correlations = (
    df[corr_columns]
    .corr()['churn_binary']
    .sort_values(key=abs, ascending=False)
)

# The target's correlation with itself is not a candidate predictor
feature_corrs = correlations.drop('churn_binary')

print("Feature correlations with churn (sorted by absolute value):")
for feature, corr in feature_corrs.items():
    print(f"{feature:<25}: {corr:>8.4f}")

# Feature with the largest absolute correlation with churn
best_feature = feature_corrs.abs().idxmax()
print(f"\nBest single predictor: {best_feature} (correlation: {feature_corrs[best_feature]:.4f})")



FINDING BEST PREDICTOR FOR SIMPLE MODEL
Feature correlations with churn (sorted by absolute value):
customer service calls   :   0.2087
total day minutes        :   0.2052
total day charge         :   0.2052
total eve minutes        :   0.0928
total eve charge         :   0.0928
number vmail messages    :  -0.0897
total intl charge        :   0.0683
total intl minutes       :   0.0682
total intl calls         :  -0.0528
total night charge       :   0.0355
total night minutes      :   0.0355
total day calls          :   0.0185
account length           :   0.0165
total eve calls          :   0.0092
total night calls        :   0.0061

Best single predictor: customer service calls (correlation: 0.2087)


In [46]:
# 1. SIMPLE LOGISTIC REGRESSION MODEL
# Fits churn_binary on the single selected predictor (best_feature) plus a constant.

section_rule = "=" * 60
print(f"\n{section_rule}")
print("1. BUILD SIMPLE LOGISTIC REGRESSION MODEL")
print(section_rule)

print(f"Building simple logistic regression using '{best_feature}'")
print("The feature with highest correlation with churn will be used for simple logistic regression.")

# Response variable and design matrix (selected predictor with a constant column added)
endog = df['churn_binary']
exog_simple = sm.add_constant(df[[best_feature]])

# Specify the model, then fit it
simple_model = sm.Logit(endog, exog_simple)
simple_model_results = simple_model.fit()

print("\nSimple Logistic Regression Results:")
print("=" * 50)
print(simple_model_results.summary())


1. BUILD SIMPLE LOGISTIC REGRESSION MODEL
Building simple logistic regression using 'customer service calls'
The feature with highest correlation with churn will be used for simple logistic regression.
Optimization terminated successfully.
         Current function value: 0.394125
         Iterations 6

Simple Logistic Regression Results:
                           Logit Regression Results                           
Dep. Variable:           churn_binary   No. Observations:                 3333
Model:                          Logit   Df Residuals:                     3331
Method:                           MLE   Df Model:                            1
Date:                Mon, 21 Jul 2025   Pseudo R-squ.:                 0.04751
Time:                        13:59:23   Log-Likelihood:                -1313.6
converged:                       True   LL-Null:                       -1379.1
Covariance Type:            nonrobust   LLR p-value:                 2.404e-30
                          