In [11]:
import pandas as pd

# Load the dataset
data = pd.read_csv('customer_churn_data.csv')

# Display data types of each column
print(data.dtypes)


customerID           object
gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure                int64
PhoneService         object
MultipleLines        object
InternetService      object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
PaperlessBilling     object
PaymentMethod        object
MonthlyCharges      float64
TotalCharges         object
Churn                object
dtype: object


In [13]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# Load the dataset
data = pd.read_csv('customer_churn_data.csv')

# Display the first few rows of the dataset
print(data.head())


   customerID  gender  SeniorCitizen Partner Dependents  tenure PhoneService  \
0  7590-VHVEG  Female              0     Yes         No       1           No   
1  5575-GNVDE    Male              0      No         No      34          Yes   
2  3668-QPYBK    Male              0      No         No       2          Yes   
3  7795-CFOCW    Male              0      No         No      45           No   
4  9237-HQITU  Female              0      No         No       2          Yes   

      MultipleLines InternetService OnlineSecurity  ... DeviceProtection  \
0  No phone service             DSL             No  ...               No   
1                No             DSL            Yes  ...              Yes   
2                No             DSL            Yes  ...               No   
3  No phone service             DSL            Yes  ...              Yes   
4                No     Fiber optic             No  ...               No   

  TechSupport StreamingTV StreamingMovies        Contract Pape

In [14]:
print(data.isnull().sum())

# Check column names
print(data.columns)

# List of categorical columns to fill missing values and encode
categorical_columns = ['gender', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'InternetService',
                       'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV',
                       'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod']


customerID          0
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64
Index(['customerID', 'gender', 'SeniorCitizen', 'Partner', 'Dependents',
       'tenure', 'PhoneService', 'MultipleLines', 'InternetService',
       'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
       'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling',
       'PaymentMethod', 'MonthlyCharges', 'TotalCharges', 'Churn'],
      dtype='object')


In [15]:
for column in categorical_columns:
    if column in data.columns:
        data[column].fillna(data[column].mode()[0], inplace=True)
    else:
        print(f"Column '{column}' does not exist in the dataset")


In [16]:
data['TotalCharges'] = pd.to_numeric(data['TotalCharges'], errors='coerce')

# List of numerical columns to fill missing values
numerical_columns = ['SeniorCitizen', 'tenure', 'MonthlyCharges', 'TotalCharges']


In [17]:
for column in numerical_columns:
    if column in data.columns:
        data[column].fillna(data[column].mean(), inplace=True)
    else:
        print(f"Column '{column}' does not exist in the dataset")

In [18]:
label_encoders = {}
for column in categorical_columns:
    if column in data.columns:
        le = LabelEncoder()
        data[column] = le.fit_transform(data[column].astype(str))
        label_encoders[column] = le

In [19]:
data['Churn'] = data['Churn'].map({'Yes': 1, 'No': 0})


In [20]:
X = data.drop(columns=['customerID', 'Churn'])
y = data['Churn']

In [21]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [22]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


In [23]:
models = {
    'Logistic Regression': LogisticRegression(),
    'Random Forest': RandomForestClassifier(),
    'Gradient Boosting': GradientBoostingClassifier()
}

model_performance = {}

for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    model_performance[name] = {
        'Accuracy': accuracy_score(y_test, y_pred),
        'Precision': precision_score(y_test, y_pred),
        'Recall': recall_score(y_test, y_pred),
        'F1 Score': f1_score(y_test, y_pred),
        'ROC AUC': roc_auc_score(y_test, y_pred)
    }

In [24]:
performance_df = pd.DataFrame(model_performance).T
print(performance_df)


                     Accuracy  Precision    Recall  F1 Score   ROC AUC
Logistic Regression  0.816182   0.678125  0.581769  0.626263  0.741174
Random Forest        0.796309   0.664122  0.466488  0.548031  0.690773
Gradient Boosting    0.804116   0.667820  0.517426  0.583082  0.712381


In [25]:
best_model_name = performance_df['ROC AUC'].idxmax()
best_model = models[best_model_name]


In [26]:
new_data = X_test[:5]  # Replace this with your new data
predictions = best_model.predict(new_data)
print(f'Best model: {best_model_name}')
print('Predictions on new data:', predictions)


Best model: Logistic Regression
Predictions on new data: [1 0 0 1 0]
