In [16]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from xgboost import XGBClassifier
from imblearn.combine import SMOTEENN
from collections import Counter
import joblib

# Load the dataset
df = pd.read_csv('preprocessed_customer_churn_data.csv')

# Display the first few rows of the dataset
print(df.head())

# Display basic information about the dataset
print(df.info())

# Check for missing values
print(df.isnull().sum())

# Drop the 'customerID' column as it is not needed for training
df.drop(columns=['customerID'], inplace=True)

# Handle 'TotalCharges' column separately
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')

# Fill NaN values in 'TotalCharges' with the median
df['TotalCharges'].fillna(df['TotalCharges'].median(), inplace=True)

# Ensure the correct data type for 'SeniorCitizen'
df['SeniorCitizen'] = df['SeniorCitizen'].astype(int)

# Define feature columns and target column
X = df.drop('Churn', axis=1)
y = df['Churn']

# Convert the target variable to numeric
y = y.map({'Yes': 1, 'No': 0})

# Print original class distribution
print("Original class distribution:", Counter(y))

# Define the preprocessing steps for numerical and categorical features
numeric_features = ['tenure', 'MonthlyCharges', 'TotalCharges']
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

categorical_features = ['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines', 
                        'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 
                        'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod']
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

# Combine preprocessing steps into a ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)],
    remainder='passthrough')

# Preprocess the data
X_preprocessed = preprocessor.fit_transform(X)

# Apply SMOTEENN
smoteenn = SMOTEENN(random_state=42)
X_resampled, y_resampled = smoteenn.fit_resample(X_preprocessed, y)

# Print resampled class distribution
print("Resampled class distribution:", Counter(y_resampled))

# Split the resampled data
x_train, x_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# Initialize and train models
models = {
    "Random Forest": RandomForestClassifier(random_state=42),
    "Bagging": BaggingClassifier(base_estimator=DecisionTreeClassifier(), n_estimators=100, max_samples=0.25, bootstrap=False, random_state=42),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(random_state=42),
    "Logistic Regression": LogisticRegression(random_state=42),
    "XGBoost": XGBClassifier(random_state=42)
}

# Train models and get predictions
predictions = {}
for name, model in models.items():
    model.fit(x_train, y_train)
    predictions[name] = model.predict(x_test)

# Calculate accuracies
accuracies = {name: accuracy_score(y_test, pred) for name, pred in predictions.items()}

# Combine predictions
all_preds = np.array(list(predictions.values())).T
combined_pred = np.apply_along_axis(lambda x: np.bincount(x).argmax(), axis=1, arr=all_preds)
accuracies["Combined Model"] = accuracy_score(y_test, combined_pred)

# Print individual model accuracies
for name, acc in accuracies.items():
    print(f"{name} Accuracy: {acc:.4f}")

# Find the best model
best_model_name = max(accuracies, key=accuracies.get)
print(f"\nThe best performing model is: {best_model_name} with an accuracy of {accuracies[best_model_name]:.4f}")

# Print classification report for the best individual model
print("\nClassification Report for the Best Model:")
if best_model_name != "Combined Model":
    print(classification_report(y_test, predictions[best_model_name]))
else:
    print(classification_report(y_test, combined_pred))

# Print classification report for the combined model
print("\nClassification Report for the Combined Model:")
print(classification_report(y_test, combined_pred))

# Save the best individual model
if best_model_name != "Combined Model":
    best_model = models[best_model_name]
    joblib.dump(best_model, 'best_individual_model.pkl')
else:
    # Assuming you want to save the individual model with the best performance
    best_model = models[max(models, key=lambda k: accuracies[k])]
    joblib.dump(best_model, 'best_individual_model.pkl')

# Save the combined model pipeline
combined_model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42))])  # RandomForest is a placeholder; adapt as needed

combined_model_pipeline.named_steps['classifier'] = BaggingClassifier(base_estimator=best_model, n_estimators=100, random_state=42)
joblib.dump(combined_model_pipeline, 'combined_model.pkl')

   customerID  gender  SeniorCitizen  Partner  Dependents  tenure  \
0  7590-VHVEG       0              0        1           0       1   
1  5575-GNVDE       1              0        0           0      34   
2  3668-QPYBK       1              0        0           0       2   
3  7795-CFOCW       1              0        0           0      45   
4  9237-HQITU       0              0        0           0       2   

   PhoneService  MultipleLines  InternetService  OnlineSecurity  ...  \
0             0              1                0               0  ...   
1             1              0                0               2  ...   
2             1              0                0               2  ...   
3             0              1                0               2  ...   
4             1              0                1               0  ...   

   DeviceProtection  TechSupport  StreamingTV  StreamingMovies  Contract  \
0                 0            0            0                0         0   




Random Forest Accuracy: 0.9768
Bagging Accuracy: 0.9520
Decision Tree Accuracy: 0.9381
Gradient Boosting Accuracy: 0.9528
Logistic Regression Accuracy: 0.9195
XGBoost Accuracy: 0.9675
Combined Model Accuracy: 0.9667

The best performing model is: Random Forest with an accuracy of 0.9768

Classification Report for the Best Model:
              precision    recall  f1-score   support

           0       0.98      0.96      0.97       571
           1       0.97      0.99      0.98       721

    accuracy                           0.98      1292
   macro avg       0.98      0.98      0.98      1292
weighted avg       0.98      0.98      0.98      1292


Classification Report for the Combined Model:
              precision    recall  f1-score   support

           0       0.97      0.95      0.96       571
           1       0.96      0.98      0.97       721

    accuracy                           0.97      1292
   macro avg       0.97      0.97      0.97      1292
weighted avg       0.97

['combined_model.pkl']