# Customer Churn Prediction - Data Loading & Initial Exploration

**Project Overview**: Predicting customer churn for a telecommunications company

**Author**: Muhammad Afnan

**Date**: 11 Jun 2025

---

## Objective
This notebook focuses on:
1. Loading the dataset
2. Initial data exploration
3. Basic data quality assessment
4. Understanding data structure and types

---

## 1. Import Required Libraries

In [None]:
# Data manipulation and analysis
import pandas as pd
import numpy as np

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Silence library warnings so the notebook output stays readable
import warnings
warnings.filterwarnings('ignore')

# Pandas display: no limit on shown columns, total width, or cell width
pd.options.display.max_columns = None
pd.options.display.width = None
pd.options.display.max_colwidth = None

# Plot appearance: base style first, then the colour palette on top of it
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

print("✅ Libraries imported successfully")

## 2. Data Loading

In [None]:
# Load the dataset
# Note: Update the path according to your local setup.
# A raw string (r"...") is used so the Windows backslashes are taken literally;
# in a normal string "\M", "\C", "\d" and "\T" are invalid escape sequences.
DATA_PATH = r"D:\Machine Learning Projects\Customer Churn Prediction\dataset\Telco-Customer-Churn.csv"

try:
    df = pd.read_csv(DATA_PATH)
    print("✅ Dataset loaded successfully")
    print(f"📊 Dataset shape: {df.shape}")
except FileNotFoundError:
    # `df` stays undefined in this case, so the following cells cannot run
    # until DATA_PATH is corrected and this cell is executed again.
    print(f"❌ File not found at {DATA_PATH}")
    print("Please update the DATA_PATH variable with the correct path to your dataset")

## 3. Initial Data Exploration

In [None]:
# Preview the top of the table: banner first, then the first five rows
print("📋 First 5 rows of the dataset:", "=" * 50, sep="\n")
df.head(5)

In [None]:
# Report the table dimensions and list every column with a 1-based position
row_total, column_total = df.shape

print("📈 Dataset Information:")
print("=" * 30)
print(f"Number of rows: {row_total:,}")
print(f"Number of columns: {column_total}")
print("\nColumn names:")
for position, column_name in enumerate(df.columns, start=1):
    print(f"{position:2d}. {column_name}")

In [None]:
# Per-column dtypes, non-null counts and memory footprint (printed by info())
print("🔍 Data Types and Memory Usage:", "=" * 35, sep="\n")
df.info()

## 4. Data Quality Assessment

In [None]:
# Count null cells per column and show only the columns that have any
print("🔍 Missing Values Analysis:")
print("=" * 30)

missing_values = df.isna().sum()
missing_percentage = missing_values / len(df) * 100

missing_df = pd.DataFrame({
    'Missing Count': missing_values,
    'Missing Percentage': missing_percentage,
})

# Keep affected columns only, worst offenders first
has_missing = missing_df['Missing Count'] > 0
missing_df = missing_df.loc[has_missing].sort_values('Missing Count', ascending=False)

if missing_df.empty:
    print("✅ No missing values found in the dataset")
else:
    print(missing_df)

In [None]:
# Count rows that are exact copies of an earlier row
print("🔍 Duplicate Rows Analysis:")
print("=" * 30)

duplicate_count = df.duplicated().sum()
print(f"Number of duplicate rows: {duplicate_count}")

if duplicate_count == 0:
    print("✅ No duplicate rows found")
else:
    print(f"Percentage of duplicates: {(duplicate_count/len(df))*100:.2f}%")

In [None]:
# Tabulate, for each column, its number of distinct values and its dtype
print("🔍 Unique Values Analysis:")
print("=" * 30)

distinct_per_column = df.nunique().tolist()
unique_values_df = pd.DataFrame({
    'Column': df.columns,
    'Unique Values': distinct_per_column,
    'Data Type': df.dtypes,
})

# The frame's index repeats the column names, so leave it out of the printout
print(unique_values_df.to_string(index=False))

## 5. Basic Statistical Summary

In [None]:
# Descriptive statistics, transposed so each row is one column of the data
numerical_cols = list(df.select_dtypes(include='number').columns)
print("📊 Statistical Summary (Numerical Columns):")
print("=" * 80)
df.describe().transpose()


In [None]:
# Frequency table and distinct-value count for every object-dtype column
print("📊 Categorical Columns Analysis:")
print("=" * 35)
categorical_cols = list(df.select_dtypes(include='object').columns)

for cat_col in categorical_cols:
    column_values = df[cat_col]
    print(f"\n🔸 {cat_col}:")
    print(column_values.value_counts())
    print(f"Unique values: {column_values.nunique()}")

## 6. Target Variable Analysis

In [None]:
# Distribution of the target column: summary table, two charts, balance check
print("🎯 Target Variable Analysis (Churn):")
print("=" * 40)

if 'Churn' not in df.columns:
    print("❌ 'Churn' column not found in the dataset")
else:
    # Absolute and relative frequency of each class
    churn_counts = df['Churn'].value_counts()
    churn_percentages = df['Churn'].value_counts(normalize=True) * 100

    target_summary = pd.DataFrame({
        'Count': churn_counts,
        'Percentage': churn_percentages.round(2),
    })
    print(target_summary)

    # Shared look for both charts
    class_colors = ['skyblue', 'salmon']
    title_style = {'fontsize': 14, 'fontweight': 'bold'}
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))

    # Left: bar chart of class counts
    churn_counts.plot(kind='bar', ax=ax1, color=class_colors)
    ax1.set_title('Churn Distribution (Count)', **title_style)
    ax1.set_xlabel('Churn')
    ax1.set_ylabel('Count')
    ax1.tick_params(axis='x', rotation=0)

    # Write each count just above its bar (fixed offset of 50 on the y axis)
    for bar_position, bar_height in enumerate(churn_counts.values):
        ax1.text(bar_position, bar_height + 50, str(bar_height),
                 ha='center', va='bottom', fontweight='bold')

    # Right: pie chart of class shares
    ax2.pie(
        churn_counts.values,
        labels=churn_counts.index,
        autopct='%1.1f%%',
        colors=class_colors,
        startangle=90,
    )
    ax2.set_title('Churn Distribution (Percentage)', **title_style)

    plt.tight_layout()
    plt.show()

    # Flag the dataset as imbalanced when the rarest class is under 30%
    minority_class_pct = min(churn_percentages)
    if minority_class_pct >= 30:
        print(f"\n✅ Balanced dataset. Minority class: {minority_class_pct:.1f}%")
    else:
        print(f"\n⚠️ Class imbalance detected! Minority class: {minority_class_pct:.1f}%")
        print("Consider using sampling techniques during model training.")

## 7. Data Cleaning

In [None]:
# Clean TotalCharges column
print("🧹 Cleaning TotalCharges column:")
print("=" * 35)

raw_total_charges = df['TotalCharges']

# Mark every entry that cannot be read as a number (e.g. blank strings or NaN).
# Working from a mask keeps this cell safe to re-run: once the column is
# already float, nothing is flagged and the values pass through unchanged.
needs_fill = pd.to_numeric(raw_total_charges, errors='coerce').isna()

# Same policy as before: a flagged row takes the value of the row above it
# (forward fill). The extra backward fill covers flagged rows at the very top
# of the table, which have no predecessor to copy from.
# NOTE(review): copying a neighbouring customer's total is an arbitrary
# imputation - confirm it is intended before modelling on this column.
df['TotalCharges'] = (
    raw_total_charges.mask(needs_fill).astype(float).ffill().bfill()
)

print(f"ℹ️ Filled {int(needs_fill.sum())} blank/non-numeric TotalCharges value(s)")
print("✅ TotalCharges cleaning completed")

In [None]:
# Work on an independent (deep) copy so later steps leave `df` untouched
df_clean = df.copy(deep=True)
print("📋 Working with a copy of the data")

In [None]:
# Drop the customer identifier: it is a row label, not a predictive feature
print("🗑️ Removing CustomerID column:")
print("=" * 30)

if 'customerID' not in df_clean.columns:
    print("ℹ️ CustomerID column not found")
else:
    id_preview = df_clean['customerID'].head().tolist()
    print(f"CustomerID samples: {id_preview}")
    df_clean = df_clean.drop('customerID', axis=1)
    print("✅ CustomerID column removed")

print(f"New dataset shape: {df_clean.shape}")

## 8. Label Encoding

In [None]:
from sklearn.preprocessing import LabelEncoder

# Columns that describe() summarises for `df`; everything else in df_clean is
# treated as text and gets label-encoded. Computed once instead of once per
# column, since it does not change inside the loop.
described_columns = set(df.describe().columns)
text_data_features = [
    column for column in df_clean.columns if column not in described_columns
]

# One fitted encoder per column, so any column's codes can be mapped back to
# the original labels later. `le` is kept for compatibility and, as before,
# ends up holding the encoder fitted on the last processed column.
label_encoders = {}
le = LabelEncoder()
print("Label Encoder Transformation")

for feature in text_data_features:
    le = LabelEncoder()
    df_clean[feature] = le.fit_transform(df_clean[feature])
    label_encoders[feature] = le
    # Show the integer codes next to the labels they stand for
    codes = df_clean[feature].unique()
    print(feature, ' : ', codes, ' = ', le.inverse_transform(codes))

In [None]:
# Compare per-feature means of churned vs. retained customers, side by side.
# Relies on df_clean['Churn'] holding numeric codes (1 = churned, 0 = not).
colors = ['Orange', "Black"]
churn = df_clean[df_clean['Churn'] == 1].describe().T
not_churn = df_clean[df_clean['Churn'] == 0].describe().T

fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(5, 5))

# Options common to both panels
heatmap_options = dict(
    annot=True,
    cmap=colors,
    linewidths=0.4,
    linecolor='black',
    cbar=False,
    fmt='.2f',
)

# Draw on the axes created above rather than re-selecting them via plt.subplot
sns.heatmap(churn[['mean']], ax=ax[0], **heatmap_options)
ax[0].set_title('Churned Customers')

sns.heatmap(not_churn[['mean']], ax=ax[1], **heatmap_options)
ax[1].set_title('Not_Churned Customers')

fig.tight_layout(pad=0)
plt.show()

## 9. Summary & Next Steps

In [None]:
# Final recap of the dataset, followed by saving both frames to disk.
# Column-type counts are taken from the current `df` rather than from lists
# built in earlier cells: TotalCharges has changed dtype since those ran, so
# the older lists would misreport it (and may not exist if cells were skipped).
numeric_column_count = df.select_dtypes(include=[np.number]).shape[1]
categorical_column_count = df.select_dtypes(include=['object']).shape[1]

print("📋 DATA EXPLORATION SUMMARY")
print("=" * 50)
print(f"Dataset Shape: {df.shape}")
print(f"Missing Values: {df.isnull().sum().sum()}")
print(f"Duplicate Rows: {df.duplicated().sum()}")
print(f"Numerical Columns: {numeric_column_count}")
print(f"Categorical Columns: {categorical_column_count}")

if 'Churn' in df.columns:
    # `df` still holds the original 'Yes'/'No' labels; only df_clean is encoded
    churn_rate = (df['Churn'] == 'Yes').mean() * 100
    print(f"Churn Rate: {churn_rate:.1f}%")

print("\n📝 NEXT STEPS:")
print("=" * 15)
print("1. ✅ Data Loading & Exploration & Cleaning - COMPLETED")
print("2. 🔄 Exploratory Data Analysis (EDA)")
print("3. 🔄 Feature Engineering")

# Save the data for the next notebook. Both files go to the current working
# directory; `df` here already includes the cleaned TotalCharges column.
print("\n💾 Saving processed data for next notebook...")
df.to_csv('Telco-Customer-Churn.csv', index=False)
print("✅ Data saved to 'Telco-Customer-Churn.csv'")
df_clean.to_csv('Telco-Customer-Churn-Cleaned.csv', index=False)
print("✅ Data saved to 'Telco-Customer-Churn-Cleaned.csv'")