In [2]:
import pandas as pd
df = pd.read_csv('../data/01_raw/telco-customer-churn.csv')
print(df.head())

   customerID  gender  SeniorCitizen Partner Dependents  tenure PhoneService  \
0  7590-VHVEG  Female              0     Yes         No       1           No   
1  5575-GNVDE    Male              0      No         No      34          Yes   
2  3668-QPYBK    Male              0      No         No       2          Yes   
3  7795-CFOCW    Male              0      No         No      45           No   
4  9237-HQITU  Female              0      No         No       2          Yes   

      MultipleLines InternetService OnlineSecurity  ... DeviceProtection  \
0  No phone service             DSL             No  ...               No   
1                No             DSL            Yes  ...              Yes   
2                No             DSL            Yes  ...               No   
3  No phone service             DSL            Yes  ...              Yes   
4                No     Fiber optic             No  ...               No   

  TechSupport StreamingTV StreamingMovies        Contract Pape

In [3]:
# Basic Data Exploration
print("=" * 50)
print("DATASET SHAPE")
print("=" * 50)
print(f"Rows: {df.shape[0]}, Columns: {df.shape[1]}")
print()
print("=" * 50)
print("COLUMN NAMES")
print("=" * 50)
print(df.columns.tolist())

DATASET SHAPE
Rows: 7043, Columns: 21

COLUMN NAMES
['customerID', 'gender', 'SeniorCitizen', 'Partner', 'Dependents', 'tenure', 'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod', 'MonthlyCharges', 'TotalCharges', 'Churn']


In [4]:
# Data Types Overview
print("=" * 50)
print("DATA TYPES INFO")
print("=" * 50)
df.info()

DATA TYPES INFO
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non

In [5]:
# Detailed Data Types
print("=" * 50)
print("DATA TYPES BY COLUMN")
print("=" * 50)
print(df.dtypes)
print()
print("Data type counts:")
print(df.dtypes.value_counts())

DATA TYPES BY COLUMN
customerID           object
gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure                int64
PhoneService         object
MultipleLines        object
InternetService      object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
PaperlessBilling     object
PaymentMethod        object
MonthlyCharges      float64
TotalCharges         object
Churn                object
dtype: object

Data type counts:
object     18
int64       2
float64     1
Name: count, dtype: int64


In [6]:
# ============================================
# MISSING VALUES ANALYSIS
# ============================================
print("=" * 50)
print("MISSING VALUES COUNT")
print("=" * 50)
missing = df.isnull().sum()
missing_pct = (df.isnull().sum() / len(df)) * 100
missing_df = pd.DataFrame({
    'Missing Count': missing,
    'Missing %': missing_pct
})
print(missing_df[missing_df['Missing Count'] > 0])
if missing_df['Missing Count'].sum() == 0:
    print("No explicit NULL values found!")
print()
print(f"Total missing values: {missing.sum()}")

MISSING VALUES COUNT
Empty DataFrame
Columns: [Missing Count, Missing %]
Index: []
No explicit NULL values found!

Total missing values: 0


In [7]:
# Check for hidden missing values (empty strings, whitespace, special values)
print("=" * 50)
print("HIDDEN MISSING VALUES CHECK")
print("=" * 50)
for col in df.columns:
    if df[col].dtype == 'object':
        # Check for empty strings or whitespace
        empty_count = (df[col].str.strip() == '').sum()
        whitespace_count = df[col].str.isspace().sum()
        if empty_count > 0 or whitespace_count > 0:
            print(f"{col}: {empty_count} empty strings, {whitespace_count} whitespace-only")
        
        # Check unique values for potential missing value indicators
        unique_vals = df[col].unique()
        suspicious = [v for v in unique_vals if str(v).lower() in ['na', 'n/a', 'null', 'none', '-', '']]
        if suspicious:
            print(f"{col}: Potential missing indicators found: {suspicious}")

print("\nChecking TotalCharges (often has issues):")
print(f"TotalCharges dtype: {df['TotalCharges'].dtype}")
print(f"Sample values: {df['TotalCharges'].head(10).tolist()}")

HIDDEN MISSING VALUES CHECK
TotalCharges: 11 empty strings, 11 whitespace-only

Checking TotalCharges (often has issues):
TotalCharges dtype: object
Sample values: ['29.85', '1889.5', '108.15', '1840.75', '151.65', '820.5', '1949.4', '301.9', '3046.05', '3487.95']


In [8]:
# ============================================
# DUPLICATE RECORDS ANALYSIS
# ============================================
print("=" * 50)
print("DUPLICATE ANALYSIS")
print("=" * 50)

# Check for duplicate rows
dup_rows = df.duplicated().sum()
print(f"Complete duplicate rows: {dup_rows}")

# Check for duplicate customer IDs (should be unique)
dup_ids = df['customerID'].duplicated().sum()
print(f"Duplicate customerID values: {dup_ids}")

if dup_rows > 0:
    print("\nDuplicate rows preview:")
    print(df[df.duplicated(keep=False)].head(10))

DUPLICATE ANALYSIS
Complete duplicate rows: 0
Duplicate customerID values: 0


In [9]:
# ============================================
# DATA TYPES ANALYSIS & CONVERSION NEEDS
# ============================================
print("=" * 50)
print("DATA TYPE CONVERSION ANALYSIS")
print("=" * 50)

# Identify columns by current dtype
object_cols = df.select_dtypes(include=['object']).columns.tolist()
numeric_cols = df.select_dtypes(include=['int64', 'float64']).columns.tolist()

print(f"Object columns ({len(object_cols)}): {object_cols}")
print(f"\nNumeric columns ({len(numeric_cols)}): {numeric_cols}")

# Check if TotalCharges should be numeric
print("\n--- TotalCharges Analysis ---")
print(f"Current dtype: {df['TotalCharges'].dtype}")
# Try converting to numeric
tc_numeric = pd.to_numeric(df['TotalCharges'], errors='coerce')
conversion_issues = tc_numeric.isnull().sum() - df['TotalCharges'].isnull().sum()
print(f"Values that fail numeric conversion: {conversion_issues}")
if conversion_issues > 0:
    # Find problematic values
    mask = pd.to_numeric(df['TotalCharges'], errors='coerce').isnull() & df['TotalCharges'].notna()
    print(f"Problematic values: {df.loc[mask, 'TotalCharges'].unique()}")

DATA TYPE CONVERSION ANALYSIS
Object columns (18): ['customerID', 'gender', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod', 'TotalCharges', 'Churn']

Numeric columns (3): ['SeniorCitizen', 'tenure', 'MonthlyCharges']

--- TotalCharges Analysis ---
Current dtype: object
Values that fail numeric conversion: 11
Problematic values: [' ']


In [10]:
# Identify binary columns that could be boolean
print("\n--- Binary Columns Analysis ---")
for col in df.columns:
    unique_count = df[col].nunique()
    if unique_count == 2:
        print(f"{col}: {df[col].unique()}")


--- Binary Columns Analysis ---
gender: ['Female' 'Male']
SeniorCitizen: [0 1]
Partner: ['Yes' 'No']
Dependents: ['No' 'Yes']
PhoneService: ['No' 'Yes']
PaperlessBilling: ['Yes' 'No']
Churn: ['No' 'Yes']


In [11]:
# ============================================
# CATEGORICAL VARIABLES ANALYSIS
# ============================================
print("=" * 50)
print("CATEGORICAL VARIABLES ANALYSIS")
print("=" * 50)

# Exclude customerID and numeric columns
cat_cols = [col for col in df.select_dtypes(include=['object']).columns 
            if col != 'customerID']

print(f"Categorical columns to analyze: {cat_cols}\n")

for col in cat_cols:
    print(f"\n--- {col} ---")
    print(f"Unique values: {df[col].nunique()}")
    print("Value counts:")
    print(df[col].value_counts())
    print()

CATEGORICAL VARIABLES ANALYSIS
Categorical columns to analyze: ['gender', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod', 'TotalCharges', 'Churn']


--- gender ---
Unique values: 2
Value counts:
gender
Male      3555
Female    3488
Name: count, dtype: int64


--- Partner ---
Unique values: 2
Value counts:
Partner
No     3641
Yes    3402
Name: count, dtype: int64


--- Dependents ---
Unique values: 2
Value counts:
Dependents
No     4933
Yes    2110
Name: count, dtype: int64


--- PhoneService ---
Unique values: 2
Value counts:
PhoneService
Yes    6361
No      682
Name: count, dtype: int64


--- MultipleLines ---
Unique values: 3
Value counts:
MultipleLines
No                  3390
Yes                 2971
No phone service     682
Name: count, dtype: int64


--- InternetService ---
Unique values: 3
Value count

In [12]:
# Categorize variables by cardinality for encoding strategy
print("=" * 50)
print("ENCODING STRATEGY RECOMMENDATIONS")
print("=" * 50)

binary_cols = []
low_card_cols = []  # 3-5 unique values
high_card_cols = []  # > 5 unique values

for col in cat_cols:
    n_unique = df[col].nunique()
    if n_unique == 2:
        binary_cols.append(col)
    elif n_unique <= 5:
        low_card_cols.append(col)
    else:
        high_card_cols.append(col)

print(f"Binary (Label Encoding suitable): {binary_cols}")
print(f"Low cardinality (One-Hot suitable): {low_card_cols}")
print(f"High cardinality (Consider Target/Frequency encoding): {high_card_cols}")

ENCODING STRATEGY RECOMMENDATIONS
Binary (Label Encoding suitable): ['gender', 'Partner', 'Dependents', 'PhoneService', 'PaperlessBilling', 'Churn']
Low cardinality (One-Hot suitable): ['MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaymentMethod']
High cardinality (Consider Target/Frequency encoding): ['TotalCharges']


In [13]:
# ============================================
# NUMERICAL VARIABLES ANALYSIS
# ============================================
print("=" * 50)
print("NUMERICAL VARIABLES ANALYSIS")
print("=" * 50)

# Get numeric columns (excluding SeniorCitizen which is binary)
num_cols = ['tenure', 'MonthlyCharges']
# TotalCharges needs conversion first, so we'll handle it separately

print(f"Numerical columns: {num_cols}")
print()
print(df[num_cols].describe())

NUMERICAL VARIABLES ANALYSIS
Numerical columns: ['tenure', 'MonthlyCharges']

            tenure  MonthlyCharges
count  7043.000000     7043.000000
mean     32.371149       64.761692
std      24.559481       30.090047
min       0.000000       18.250000
25%       9.000000       35.500000
50%      29.000000       70.350000
75%      55.000000       89.850000
max      72.000000      118.750000


In [14]:
# Distribution Analysis - Skewness and Outliers
import numpy as np

print("=" * 50)
print("DISTRIBUTION & OUTLIER ANALYSIS")
print("=" * 50)

for col in num_cols:
    print(f"\n--- {col} ---")
    skewness = df[col].skew()
    print(f"Skewness: {skewness:.3f}")
    if abs(skewness) < 0.5:
        print("  -> Approximately symmetric")
    elif abs(skewness) < 1:
        print("  -> Moderately skewed")
    else:
        print("  -> Highly skewed (consider transformation)")
    
    # Outlier detection using IQR
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    outliers = df[(df[col] < lower_bound) | (df[col] > upper_bound)][col]
    print(f"IQR bounds: [{lower_bound:.2f}, {upper_bound:.2f}]")
    print(f"Outliers count: {len(outliers)} ({len(outliers)/len(df)*100:.2f}%)")

DISTRIBUTION & OUTLIER ANALYSIS

--- tenure ---
Skewness: 0.240
  -> Approximately symmetric
IQR bounds: [-60.00, 124.00]
Outliers count: 0 (0.00%)

--- MonthlyCharges ---
Skewness: -0.221
  -> Approximately symmetric
IQR bounds: [-46.02, 171.38]
Outliers count: 0 (0.00%)


In [15]:
# Scaling Analysis - Check if features are on different scales
print("=" * 50)
print("SCALING ANALYSIS")
print("=" * 50)
print("\nFeature ranges:")
for col in num_cols:
    print(f"{col}: min={df[col].min()}, max={df[col].max()}, range={df[col].max() - df[col].min()}")
print("\n-> Features are on different scales. StandardScaler or MinMaxScaler recommended.")

SCALING ANALYSIS

Feature ranges:
tenure: min=0, max=72, range=72
MonthlyCharges: min=18.25, max=118.75, range=100.5

-> Features are on different scales. StandardScaler or MinMaxScaler recommended.


In [16]:
# ============================================
# TARGET VARIABLE ANALYSIS
# ============================================
print("=" * 50)
print("TARGET VARIABLE (Churn) ANALYSIS")
print("=" * 50)

print("\nValue counts:")
print(df['Churn'].value_counts())

print("\nPercentage distribution:")
churn_pct = df['Churn'].value_counts(normalize=True) * 100
print(churn_pct)

# Class imbalance check
majority_class = churn_pct.max()
minority_class = churn_pct.min()
imbalance_ratio = majority_class / minority_class

print(f"\nClass imbalance ratio: {imbalance_ratio:.2f}:1")
if imbalance_ratio > 3:
    print("-> Significant class imbalance detected!")
    print("-> Consider: SMOTE, class weights, or undersampling")
elif imbalance_ratio > 1.5:
    print("-> Moderate class imbalance")
    print("-> Consider: class weights in model training")
else:
    print("-> Classes are relatively balanced")

TARGET VARIABLE (Churn) ANALYSIS

Value counts:
Churn
No     5174
Yes    1869
Name: count, dtype: int64

Percentage distribution:
Churn
No     73.463013
Yes    26.536987
Name: proportion, dtype: float64

Class imbalance ratio: 2.77:1
-> Moderate class imbalance
-> Consider: class weights in model training


In [17]:
# ============================================
# PREPROCESSING RECOMMENDATIONS SUMMARY
# ============================================
print("=" * 60)
print("PREPROCESSING RECOMMENDATIONS SUMMARY")
print("=" * 60)

print("""
1. DATA TYPE CONVERSIONS:
   - TotalCharges: Convert from object to float64
   - Handle empty strings in TotalCharges (likely new customers with tenure=0)

2. MISSING VALUES HANDLING:
   - TotalCharges: Fill missing/empty values (median or 0 for new customers)

3. FEATURE REMOVAL:
   - customerID: Remove (not predictive, just identifier)

4. CATEGORICAL ENCODING:
   - Binary columns (Yes/No): Label encoding (0/1)
     * gender, Partner, Dependents, PhoneService, PaperlessBilling, Churn
   - Multi-class columns: One-Hot encoding
     * MultipleLines, InternetService, OnlineSecurity, OnlineBackup
     * DeviceProtection, TechSupport, StreamingTV, StreamingMovies
     * Contract, PaymentMethod

5. NUMERICAL SCALING:
   - Apply StandardScaler or MinMaxScaler to: tenure, MonthlyCharges, TotalCharges
   - Features are on different scales and need normalization

6. CLASS IMBALANCE:
   - Consider class weights, SMOTE, or stratified sampling during train/test split

7. FEATURE ENGINEERING (OPTIONAL):
   - Create tenure groups (e.g., 0-12, 12-24, 24-48, 48-72 months)
   - Create total services count feature
   - Create charge per month ratio
""")

print("=" * 60)
print("PREPROCESSING PIPELINE ORDER:")
print("=" * 60)
print("""
1. Drop customerID column
2. Convert TotalCharges to numeric (handle empty strings)
3. Handle missing values in TotalCharges
4. Encode binary categorical columns
5. One-hot encode multi-class categorical columns
6. Scale numerical features
7. Handle class imbalance (if needed)
""")

PREPROCESSING RECOMMENDATIONS SUMMARY

1. DATA TYPE CONVERSIONS:
   - TotalCharges: Convert from object to float64
   - Handle empty strings in TotalCharges (likely new customers with tenure=0)

2. MISSING VALUES HANDLING:
   - TotalCharges: Fill missing/empty values (median or 0 for new customers)

3. FEATURE REMOVAL:
   - customerID: Remove (not predictive, just identifier)

4. CATEGORICAL ENCODING:
   - Binary columns (Yes/No): Label encoding (0/1)
     * gender, Partner, Dependents, PhoneService, PaperlessBilling, Churn
   - Multi-class columns: One-Hot encoding
     * MultipleLines, InternetService, OnlineSecurity, OnlineBackup
     * DeviceProtection, TechSupport, StreamingTV, StreamingMovies
     * Contract, PaymentMethod

5. NUMERICAL SCALING:
   - Apply StandardScaler or MinMaxScaler to: tenure, MonthlyCharges, TotalCharges
   - Features are on different scales and need normalization

6. CLASS IMBALANCE:
   - Consider class weights, SMOTE, or stratified sampling during train/t