# Introduction

The dataset used in this notebook, "Telco Customer Churn", was scraped from kaggle.com.

## Import Statements

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import scipy
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

# Data Loading and Initial Exploration

In [None]:
# Read the churn CSV (expected next to the notebook) into the working frame `df`.
df = pd.read_csv('Telco_Customer_Churn.csv')

# Shared horizontal rule printed under every section title below.
divider = "-" * 50

# Schema overview: column names, dtypes and non-null counts.
print("Dataset Info:", divider, sep="\n")
df.info()

# Leading rows, rendered as a rich table.
print("\nFirst few rows of the dataset:", divider, sep="\n")
display(df.head())

# Summary statistics (numeric columns only, by pandas' default).
print("\nBasic statistics:", divider, sep="\n")
display(df.describe())

# Data Preprocessing and Cleaning

In [None]:
# Clean the loaded frame and build `df_encoded`, a fully numeric copy for modelling.
# Every step is written so that re-running this cell on an already-processed `df`
# gives the same result (the original in-place Churn mapping produced all-NaN on a
# second run).

# NOTE(review): in the public Kaggle Telco file, `TotalCharges` is stored as text and
# a few rows contain blank strings -- confirm for this copy. Left as text it would be
# one-hot encoded below into one dummy column per distinct amount, so parse it to a
# number first. Unparseable entries become NaN and therefore appear in the
# missing-value report that follows.
if 'TotalCharges' in df.columns:
    df = df.assign(TotalCharges=pd.to_numeric(df['TotalCharges'], errors='coerce'))

# Check for missing values
print("Missing values in each column:")
print("-" * 50)
display(df.isnull().sum())

# Check for duplicate rows
print("\nNumber of duplicate rows:", df.duplicated().sum())

# Handle missing values (if any) by forward fill - adjust the strategy to your needs.
# `DataFrame.ffill()` replaces `fillna(method='ffill')`, which is deprecated since
# pandas 2.1.
df = df.ffill()

# Inspect the categorical (text) columns before encoding them
categorical_columns = df.select_dtypes(include=['object']).columns
for col in categorical_columns:
    if col != 'customerID':  # Exclude ID column
        print(f"\nUnique values in {col}:")
        print(df[col].value_counts())

# Convert the binary target to 0/1. Skipped when it is already numeric so that a
# re-run does not map 0/1 through the Yes/No table and wipe the column out.
if not pd.api.types.is_numeric_dtype(df['Churn']):
    churn_numeric = df['Churn'].map({'Yes': 1, 'No': 0})
    unexpected_labels = df.loc[churn_numeric.isna(), 'Churn'].unique()
    if len(unexpected_labels):
        # Fail here with a clear message instead of passing NaN targets to the models.
        raise ValueError(f"Unexpected values in 'Churn': {list(unexpected_labels)}")
    df['Churn'] = churn_numeric

# One-hot encode the remaining categorical columns (ID and target are left alone).
# dtype=int yields 0/1 indicators on every pandas version (pandas >= 2 defaults to bool).
columns_to_encode = [col for col in categorical_columns if col not in ('customerID', 'Churn')]
df_encoded = pd.get_dummies(df, columns=columns_to_encode, dtype=int)

# Display the transformed dataset
print("\nTransformed dataset shape:", df_encoded.shape)
display(df_encoded.head())

# Exploratory Data Analysis (EDA)

In [None]:
# Exploratory plots: target balance, numeric distributions, feature correlations and
# categorical breakdowns, all split by the (numeric 0/1) `Churn` column of `df`.

# Set up plotting style. The bare 'seaborn' matplotlib style sheet was removed in
# matplotlib 3.8 (plt.style.use('seaborn') raises OSError there); seaborn's own
# theme API gives the same look and sets the palette in one call.
sns.set_theme(style="darkgrid", palette="husl")


def _panel_grid(n_panels, n_cols=3, row_height=5):
    """Create a figure holding ``n_panels`` visible axes, ``n_cols`` per row.

    Returns ``(fig, axes)`` where ``axes`` is a flat list of exactly ``n_panels``
    axes; unused cells in the last row are hidden so no empty frames are drawn.
    """
    n_rows = max(1, -(-n_panels // n_cols))  # ceiling division, at least one row
    fig, grid = plt.subplots(n_rows, n_cols, figsize=(15, row_height * n_rows), squeeze=False)
    cells = list(grid.ravel())
    for spare in cells[n_panels:]:
        spare.set_visible(False)
    return fig, cells[:n_panels]


# 1. Churn Distribution
churn_dist = df['Churn'].value_counts()
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=churn_dist.index, y=churn_dist.values, ax=ax)
ax.set(title='Distribution of Customer Churn', xlabel='Churn Status', ylabel='Count')
plt.show()

# 2. Numerical Features Distribution
numerical_cols = df.select_dtypes(include=['float64', 'int64']).columns
# Filter BEFORE laying out the grid: indexing subplots by the unfiltered position
# left a hole where 'Churn' sat and overflowed the fixed 3x3 grid past 9 columns.
hist_cols = [col for col in numerical_cols if col != 'Churn']
if hist_cols:
    fig, axes = _panel_grid(len(hist_cols), row_height=4)
    for ax, col in zip(axes, hist_cols):
        sns.histplot(data=df, x=col, hue='Churn', multiple="stack", ax=ax)
        ax.set_title(f'Distribution of {col}')
    fig.tight_layout()
    plt.show()

# 3. Correlation Analysis
# Restrict to numeric/bool columns: `df_encoded` still carries the string
# 'customerID' column, and since pandas 2.0 `corr()` no longer drops non-numeric
# columns silently -- it raises instead.
correlation_matrix = df_encoded.select_dtypes(include=['number', 'bool']).corr()
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(correlation_matrix, annot=False, cmap='coolwarm', center=0, ax=ax)
ax.set_title('Correlation Matrix of Features')
plt.show()

# 4. Categorical Features Analysis
categorical_cols = df.select_dtypes(include=['object']).columns
# Drop the ID column first, then take six columns, so all six panels are used
# (previously the ID consumed the first slot and only five plots were drawn).
count_cols = [col for col in categorical_cols if col != 'customerID'][:6]
if count_cols:
    fig, axes = _panel_grid(len(count_cols), row_height=5)
    for ax, col in zip(axes, count_cols):
        sns.countplot(data=df, x=col, hue='Churn', ax=ax)
        ax.tick_params(axis='x', rotation=45)
        ax.set_title(f'{col} vs Churn')
    fig.tight_layout()
    plt.show()

# Feature Engineering and Model Preparation

In [None]:
# Train three baseline classifiers on the encoded data and report hold-out metrics.
# (The scikit-learn imports live in the notebook's import cell at the top.)

# Prepare features and target
X = df_encoded.drop(['customerID', 'Churn'], axis=1)
y = df_encoded['Churn']

# Split BEFORE scaling so the test rows never influence the scaler's statistics
# (fitting the scaler on the full data leaks test-set information into training).
# `stratify=y` keeps the churn rate equal in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Scale the features: learn mean/std on the training rows only, then apply the
# same transform to the held-out rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Full matrix scaled with the train-fitted scaler; the name is kept in case a later
# cell refers to `X_scaled`.
X_scaled = scaler.transform(X)

# Create the candidate models (fixed seeds for reproducibility)
models = {
    'Logistic Regression': LogisticRegression(random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42),
    'SVM': SVC(random_state=42)
}

# Train and evaluate models
for name, model in models.items():
    print(f"\nTraining {name}...")
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    print(f"\n{name} Results:")
    print("-" * 50)
    print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))

    # Plot confusion matrix (rows = actual class, columns = predicted class)
    fig, ax = plt.subplots(figsize=(8, 6))
    sns.heatmap(confusion_matrix(y_test, y_pred), annot=True, fmt='d', cmap='Blues', ax=ax)
    ax.set(title=f'Confusion Matrix - {name}', xlabel='Predicted', ylabel='Actual')
    plt.show()