# Import necessary libraries

In [None]:
# Imports
import os
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder

# Load the Telco churn dataset into `data`.
# The path ends in .xlsx but the content is parsed with pd.read_csv (the
# original author notes the environment could not read it any other way).
# NOTE(review): a genuine Excel workbook would need pd.read_excel -- confirm
# the real format of this file.
csv_file_path = '../Data/Telco_customer_churn.xlsx'

try:
    data = pd.read_csv(csv_file_path)
except FileNotFoundError:
    print(f"The file at {csv_file_path} was not found. Please check the file path and try again.")
    # Re-raise: without `data` every later cell would fail with a NameError
    # that hides the real cause.
    raise
except Exception as e:
    print(f"An unexpected error occurred while reading the file: {e}")
    raise
else:
    print("File read successfully as a CSV file.")

    # Check the shape of the data
    print("Shape of the data:", data.shape)

    # Display the first few rows of the data
    print(data.head(10))

In [None]:
# Show the column labels of the loaded frame (bare expression, rendered as the cell output).
data.columns

## Data Cleaning

In [None]:
# Number of missing entries in each column
print(data.isna().sum())


In [None]:
# Data types and basic info.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called bare; wrapping it in print() appends a stray "None" line.
data.info()

In [None]:
# Give the churn codes readable names.
# NOTE(review): this only changes rows where 'Churn Label' holds 0 or 1 --
# confirm the raw encoding of that column.
data['Churn Label'] = data['Churn Label'].replace(to_replace={0: 'No Churn', 1: 'Churn'})

# Bar chart of how many rows carry each churn label
ax = sns.countplot(data=data, x='Churn Label')
ax.set_title('Distribution of Churn')
plt.show()

In [None]:
# Summary statistics: describe() with default arguments, printed as plain text
print(data.describe())

In [None]:
# Pairplot restricted to a handful of key features, coloured by churn label.
key_features = ['Tenure Months', 'Monthly Charges', 'Total Charges', 'Churn Value']

# Work on a copy so this plotting cell does not modify `data`.
# 'Total Charges' is only converted to numeric in a later cleaning cell; it is
# coerced here (unparseable entries become NaN) so the column is not silently
# left out of the grid when this cell runs first.
pairplot_frame = data[key_features + ['Churn Label']].copy()
pairplot_frame['Total Charges'] = pd.to_numeric(pairplot_frame['Total Charges'], errors='coerce')

sns.pairplot(pairplot_frame, hue='Churn Label', diag_kind='kde')
plt.show()

In [None]:
# Data Cleaning and Preparation
# Coerce 'Total Charges' to numeric (unparseable entries become NaN), then
# fill the gaps with the column median.
data['Total Charges'] = pd.to_numeric(data['Total Charges'], errors='coerce')
data['Total Charges'] = data['Total Charges'].fillna(data['Total Charges'].median())

# Normalize numerical features -- on a plotting copy only.
# Writing standardised values back into `data` here would make the later
# feature-engineering cell compute Total Charges / Tenure Months from
# z-scores instead of real charges. StandardScaler comes from the import cell.
scaler = StandardScaler()
numerical_features = ['Monthly Charges', 'Total Charges']
charges_scaled = data[numerical_features + ['Churn Label']].copy()
charges_scaled[numerical_features] = scaler.fit_transform(charges_scaled[numerical_features])

# Scatter of the standardised charges, coloured by churn label
plt.figure(figsize=(10, 6))
sns.scatterplot(x='Monthly Charges', y='Total Charges', hue='Churn Label', data=charges_scaled)
plt.title('Monthly Charges vs. Total Charges with Churn Highlighted')
plt.xlabel('Monthly Charges')
plt.ylabel('Total Charges')
plt.legend(title='Churn Status', loc='upper left')
plt.show()


## Plot description for Tenure Months Distribution for Churn and Non-Churn Customers


This plot shows the distribution of the tenure (in months) of customers who churned (left the service) and those who did not.
 It helps to visualize the relationship between how long a customer has been with the service and their likelihood of leaving.

In [None]:
# Box plot of tenure (months) for each churn label.
plt.figure(figsize=(10, 6))
sns.boxplot(x='Churn Label', y='Tenure Months', data=data)
plt.title('Tenure Months Distribution for Churn and Non-Churn Customers')
plt.xlabel('Churn Label')
plt.ylabel('Tenure Months')
# The arrow targets x position 1 (the second box) at a tenure of 10.
# NOTE(review): which label sits at position 1 depends on category order --
# confirm the arrow points at the churn box.
plt.annotate('Low Tenure, High Churn', xy=(1, 10), xytext=(1.5, 20),
             arrowprops=dict(facecolor='blue', shrink=0.05))
plt.show()
# The relabelling of 'Churn Label' is not repeated here: the same mapping is
# already applied in the target-distribution cell earlier in the notebook,
# and re-applying it after the plot has no effect.

### Plot description for relationship between Tenure and Churn



This plot shows how long customers have been with the company (tenure months) and whether they have canceled (churn label).

 The boxes show the distribution of the length of stay for both groups: Customers who have churned (1), and customers who have not churned (0). 
 
 The note "Low tenure, high churn" indicates that customers who have been with the company for a shorter period of time are more likely to quit.


## Plot description for Contract Type vs. Churn
This plot compares the contract types (Month-to-month, One year, Two year) and their respective churn rates. It helps to identify which contract types are more prone to customer churn.

In [None]:
# Count of customers per contract type, split by churn label (blue palette).
# 'Churn Label' is used as-is: its 0/1 -> text mapping is already applied in
# the target-distribution cell earlier in the notebook, so it is not repeated.
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(x='Contract', hue='Churn Label', data=data, palette='Blues', ax=ax)
ax.set_title('Contract Type vs. Churn', fontsize=16, weight='bold')
ax.set_xlabel('Contract Type', fontsize=14)
ax.set_ylabel('Count', fontsize=14)

# Call-out arrow with hard-coded coordinates (x position 0, count 2100).
# NOTE(review): these assume the first x category is Month-to-month and that
# its bar reaches about 2100 -- confirm against the rendered figure.
ax.annotate('Month-to-month has higher churn', xy=(0, 2100), xytext=(0.5, 2500),
            arrowprops=dict(facecolor='green', shrink=0.05), fontsize=12)

# Legend in the upper-right corner
ax.legend(title='Churn Status', loc='upper right')

# Keep the x tick labels horizontal
plt.xticks(rotation=0, fontsize=12)

# Display the plot
plt.show()

## Plot description for Monthly Charges vs. Total Charges with Churn Highlighted

This scatter plot shows the relationship between monthly charges and total charges, with churn status highlighted.

 It helps to see if there is a pattern in the charges that correlates with customer churn.

In [None]:
# Normalize numerical features -- on a plotting copy only.
# Standardising `data` in place in a plotting cell would feed z-scores into
# the later Total Charges / Tenure Months feature. StandardScaler comes from
# the import cell at the top of the notebook.
scaler = StandardScaler()
numerical_features = ['Monthly Charges', 'Total Charges']
charges_scaled = data[numerical_features + ['Churn Label']].copy()
charges_scaled[numerical_features] = scaler.fit_transform(charges_scaled[numerical_features])

# Scatter of the standardised charges, coloured by churn label
plt.figure(figsize=(10, 6))
sns.scatterplot(x='Monthly Charges', y='Total Charges', hue='Churn Label', data=charges_scaled)
plt.title('Monthly Charges vs. Total Charges with Churn Highlighted')
plt.xlabel('Monthly Charges')
plt.ylabel('Total Charges')
plt.legend(title='Churn Status', loc='upper left')
plt.show()

###  Plot to highlight impact of Contract Type on Churn

This plot shows how the type of contract influences the churn rate. 
Customers with month-to-month contracts have a higher churn rate compared to customers with longer contract terms (one-year or two-year contracts).

In [None]:
# Plot 1: Monthly Charges vs. Churn
# No scaling is done in this cell. 'TotalCharges_per_Month' is only created in
# the feature-engineering section further down, so selecting it here raises a
# KeyError on a fresh top-to-bottom run. Scaling for modelling happens in the
# normalisation cell that follows feature engineering.
plt.figure(figsize=(10, 6))
sns.boxplot(x='Churn Label', y='Monthly Charges', data=data, palette="Set2")
plt.title('Monthly Charges Distribution for Churn and Non-Churn Customers')
plt.xlabel('Churn Label')
plt.ylabel('Monthly Charges')
plt.show()

## Plot 1 Conclusion:

***You can see that customers who canceled the service had slightly higher monthly fees on average.***

In [None]:
# Plot 2: Total Charges vs. Churn
# One box per churn label showing the spread of 'Total Charges'.
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=data, x='Churn Label', y='Total Charges', palette="Set1", ax=ax)
ax.set_title('Total Charges Distribution for Churn and Non-Churn Customers')
ax.set_xlabel('Churn Label')
ax.set_ylabel('Total Charges')
plt.show()

## Plot 2 Conclusion:

***Customers who canceled the service paid less overall because they were often customers for a shorter period of time.***

In [None]:
# Plot 3: Tenure vs. Churn
# One box per churn label showing the spread of 'Tenure Months'.
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=data, x='Churn Label', y='Tenure Months', palette="Set3", ax=ax)
ax.set_title('Tenure Distribution for Churn and Non-Churn Customers')
ax.set_xlabel('Churn Label')
ax.set_ylabel('Tenure Months')
plt.show()


## Plot 3 Conclusion:

***Customers who canceled the service often used the service for a shorter period of time than customers who stayed.***

# Data Cleaning and Feature Engineering

In [None]:
# Data Cleaning and Preparation
# Single-space entries become NaN, the column is coerced to numeric, and the
# remaining gaps are filled with the column median.
data['Total Charges'] = pd.to_numeric(
    data['Total Charges'].replace(" ", np.nan),
    errors='coerce',
)
data['Total Charges'] = data['Total Charges'].fillna(value=data['Total Charges'].median())

In [None]:
# Feature: total charges per month of tenure.
# Zero tenure is turned into NaN first so the division yields NaN rather than
# +/-inf. The result is assigned back to the column: calling
# .replace(..., inplace=True) on the `data['Tenure Months']` selection is a
# chained in-place update, which pandas warns about and which no longer
# modifies `data` once Copy-on-Write is active.
data['Tenure Months'] = data['Tenure Months'].replace(0, np.nan)
data['TotalCharges_per_Month'] = data['Total Charges'] / data['Tenure Months']

In [None]:
# Turn +inf / -inf anywhere in the frame into NaN (in place)
data.replace(to_replace=[np.inf, -np.inf], value=np.nan, inplace=True)

In [None]:
# Split the columns by dtype and median-impute the numeric ones.
# `categorical_columns` is only collected here, not imputed; the encoding
# cell iterates over it.
numerical_columns = list(data.select_dtypes(include=[np.number]).columns)
categorical_columns = list(data.select_dtypes(include=[object]).columns)

imputer_num = SimpleImputer(strategy='median')
data[numerical_columns] = imputer_num.fit_transform(data[numerical_columns])

In [None]:
# Label-encode every object column except the customer id and the free-text
# churn reason. The single encoder is re-fitted for each column.
le = LabelEncoder()
for column in categorical_columns:
    if column in ('CustomerID', 'Churn Reason'):
        continue
    data[column] = le.fit_transform(data[column])

In [None]:
# Feature engineering
# Recompute total charges per month of tenure from the current (imputed)
# columns. Zero tenure is turned into NaN first to prevent division by zero.
# The replacement is assigned back to the column instead of using
# .replace(..., inplace=True) on the `data['Tenure Months']` selection: that
# chained in-place form triggers a pandas warning and stops updating `data`
# under Copy-on-Write, which would leave zeros in place and produce inf here.
data['Tenure Months'] = data['Tenure Months'].replace(0, np.nan)
data['TotalCharges_per_Month'] = data['Total Charges'] / data['Tenure Months']

In [None]:
# Standardise the numeric model features in place
numerical_features = [
    'Tenure Months',
    'Monthly Charges',
    'Total Charges',
    'TotalCharges_per_Month',
]
scaler = StandardScaler().fit(data[numerical_features])
data[numerical_features] = scaler.transform(data[numerical_features])

# Per-column NaN counts after scaling
print(data.isna().sum())

# Imbalance

In [None]:
# Class balance of the target before any resampling
ax = sns.countplot(data=data, x='Churn Value', palette="viridis")
ax.set_title('Class Distribution Before Oversampling')
ax.set_xlabel('Churn Value')
ax.set_ylabel('Count')
plt.show()

# Same information as numbers
print(data['Churn Value'].value_counts())



In [None]:
# Define features and target variable.
# Both churn columns are removed from the features. 'CustomerID' and
# 'Churn Reason' are also removed: the encoding cell leaves them as raw
# strings, which the estimators and SMOTE below cannot consume.
# NOTE(review): 'Churn Reason' also looks like target leakage -- check the
# remaining columns for other fields derived from the churn outcome.
X = (
    data
    .drop(['Churn Label', 'Churn Value'], axis=1)
    .drop(columns=['CustomerID', 'Churn Reason'], errors='ignore')
)
y = data['Churn Value']

# Split the data (80/20, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [None]:
from sklearn.utils import resample  # Importing resample for oversampling and undersampling

# Random oversampling of the minority class on the training split only.
# Re-attach the target to the training features so rows can be resampled together.
train = X_train.copy()
train['Churn Value'] = y_train.to_numpy()

# Rows of each class (churn == 1 is treated as the minority)
is_churn = train['Churn Value'] == 1
churn = train[is_churn]
no_churn = train[train['Churn Value'] == 0]

# Draw churn rows with replacement until both classes have the same size
churn_upsampled = resample(
    churn,
    replace=True,
    n_samples=len(no_churn),
    random_state=42,
)
train_upsampled = pd.concat([churn_upsampled, no_churn])

# Class balance after oversampling
ax = sns.countplot(data=train_upsampled, x='Churn Value', palette="cubehelix")
ax.set_title('Class Distribution after Oversampling')
ax.set_xlabel('Churn Value')
ax.set_ylabel('Count')
plt.show()

# Features / target for modelling
y_train_upsampled = train_upsampled['Churn Value']
X_train_upsampled = train_upsampled.drop('Churn Value', axis=1)


In [None]:
pip install imbalanced-learn

In [None]:
from imblearn.over_sampling import SMOTE

# SMOTE: resample the training split with a fixed seed
sm = SMOTE(random_state=42)
X_train_smote, y_train_smote = sm.fit_resample(X_train, y_train)

# Class balance after SMOTE
ax = sns.countplot(x=y_train_smote, palette="magma")
ax.set_title('Class Distribution after SMOTE')
ax.set_xlabel('Churn Value')
ax.set_ylabel('Count')
plt.show()

In [None]:
# Logistic regression fitted on the randomly oversampled training data
log_reg = LogisticRegression(max_iter=1000)
log_reg = log_reg.fit(X_train_upsampled, y_train_upsampled)

# Score on the untouched test split
y_pred = log_reg.predict(X_test)
print("Evaluation with Oversampled Data:", classification_report(y_test, y_pred), sep="\n")



In [None]:
# Repeat for undersampled and SMOTE datasets

# Build the undersampled training set in this cell, since no other cell
# defines it: keep every churn row (class 1, treated as the minority, as in
# the oversampling cell) and draw an equally sized random subset of the
# no-churn rows without replacement.
# `resample` is imported in the oversampling cell above.
train_for_downsampling = X_train.copy()
train_for_downsampling['Churn Value'] = y_train.to_numpy()
churn_rows = train_for_downsampling[train_for_downsampling['Churn Value'] == 1]
no_churn_rows = train_for_downsampling[train_for_downsampling['Churn Value'] == 0]
no_churn_downsampled = resample(no_churn_rows, replace=False,
                                n_samples=len(churn_rows), random_state=42)
train_downsampled = pd.concat([no_churn_downsampled, churn_rows])
X_train_downsampled = train_downsampled.drop(columns='Churn Value')
y_train_downsampled = train_downsampled['Churn Value']

# Re-fit the same estimator on the undersampled data and score it on the test split
log_reg.fit(X_train_downsampled, y_train_downsampled)
y_pred = log_reg.predict(X_test)
print("Evaluation with Undersampled Data:")
print(classification_report(y_test, y_pred))

# Same again for the SMOTE-resampled data
log_reg.fit(X_train_smote, y_train_smote)
y_pred = log_reg.predict(X_test)
print("Evaluation with SMOTE Data:")
print(classification_report(y_test, y_pred))


##  Preparing Data for Modeling

In [None]:
# Define features and target variable.
# Both churn columns are removed from the features. 'CustomerID' and
# 'Churn Reason' are also removed: the encoding cell leaves them as raw
# strings, which the median imputer and the tree models below cannot consume.
# NOTE(review): 'Churn Reason' also looks like target leakage -- check the
# remaining columns for other fields derived from the churn outcome.
X = (
    data
    .drop(['Churn Label', 'Churn Value'], axis=1)
    .drop(columns=['CustomerID', 'Churn Reason'], errors='ignore')
)
y = data['Churn Value']

# Check for any remaining NaN values in X and y
print("NaN values in X: ", X.isnull().sum().sum())
print("NaN values in y: ", y.isnull().sum())

In [None]:
# 80/20 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit the median imputer on the training split only, then apply it to both
# splits (the results are NumPy arrays).
imputer_num.fit(X_train)
X_train = imputer_num.transform(X_train)
X_test = imputer_num.transform(X_test)

# Count NaNs left in each split after imputation
print("NaN values in X_train after imputation: ", np.count_nonzero(np.isnan(X_train)))
print("NaN values in X_test after imputation: ", np.count_nonzero(np.isnan(X_test)))

##  Model Development and Initial Tuning

In [None]:
# Baseline: random forest with default hyperparameters and a fixed seed
rf_model = RandomForestClassifier(random_state=42)
rf_model = rf_model.fit(X_train, y_train)

# Predict on the held-out split
y_pred = rf_model.predict(X_test)

# Accuracy, confusion matrix and per-class report
print("Random Forest Model")
print("Accuracy:", accuracy_score(y_test, y_pred))
print(confusion_matrix(y_test, y_pred), classification_report(y_test, y_pred), sep="\n")

## Advanced Modeling and Hyperparameter Tuning

In [None]:
# Gradient boosting estimator and the hyperparameter grid to search
# (2 x 2 x 3 = 12 combinations)
gb_model = GradientBoostingClassifier(random_state=42)
param_grid = dict(
    n_estimators=[100, 200],
    learning_rate=[0.1, 0.01],
    max_depth=[3, 4, 5],
)

In [None]:
# 3-fold grid search over `param_grid`, using all CPU cores, with per-fit log lines
grid_search = GridSearchCV(
    estimator=gb_model,
    param_grid=param_grid,
    cv=3,
    n_jobs=-1,
    verbose=2,
)

In [None]:
# Run the grid search on the training split.
# The input must not contain NaN values: X_train was median-imputed in the
# train/test split cell above. This cell itself performs no NaN check.
grid_search.fit(X_train, y_train)

## Understanding the GridSearchCV output

The output shown above comes from a GridSearchCV run, which is used to find the best hyperparameters for a model. Let's break down the different parts of this output:

Fitting 3 folds for each of 12 candidates, totalling 36 fits:

***GridSearchCV is performing cross-validation by splitting the data into 3 folds.***
For each set of hyperparameters, the model is trained and evaluated three times (once for each fold).
There are 12 combinations of hyperparameters (n_estimators: 2 values, learning_rate: 2 values, max_depth: 3 values). Therefore, a total of 36 trainings and evaluations are performed (12 combinations * 3 folds).

`[CV] END ...`:

***These lines show*** the results of individual training runs with specific hyperparameter combinations.
For example, `[CV] END ...learning_rate=0.1, max_depth=3, n_estimators=100; total time= 3.4s` means that a model with learning_rate=0.1, max_depth=3, and n_estimators=100 was trained, and the training took 3.4 seconds.

`GridSearchCV(cv=3, estimator=GradientBoostingClassifier(random_state=42), n_jobs=-1, param_grid={'learning_rate': [0.1, 0.01], 'max_depth': [3, 4, 5], 'n_estimators': [100, 200]}, verbose=2)`:

***This is a summary of the GridSearchCV settings.***
cv=3: Cross-validation with 3 folds.
estimator=GradientBoostingClassifier(random_state=42): The model being optimized is a Gradient Boosting Classifier.
n_jobs=-1: Use all available CPUs for training.
param_grid: The hyperparameter combinations being tried.
verbose=2: Detailed output during training.

`best_estimator_: GradientBoostingClassifier`:

GridSearchCV has found the best hyperparameters, and the best model is a GradientBoostingClassifier.

***Here is an example of how you can further use the results from the GridSearchCV:***

In [None]:
# Pull the winning hyperparameters and the refitted estimator out of the search
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_
print("Best Model:", best_model)
print("Best Parameters:", best_params)

# Predict on the held-out split with the best estimator
y_pred = best_model.predict(X_test)

# Accuracy, confusion matrix and per-class report
print("Best Model Performance")
print("Accuracy:", accuracy_score(y_test, y_pred))
print(confusion_matrix(y_test, y_pred), classification_report(y_test, y_pred), sep="\n")
