In [None]:
"""
Task 1: Data Preprocessing for Machine Learning
Codveda Technologies - Machine Learning Internship
Dataset: Churn Prediction Data (churn-bigml-80.csv)

Author: [Your Name]
Date: September 29, 2025
"""

In [2]:
# Import required libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
import warnings
warnings.filterwarnings('ignore')

In [1]:
# Notebook banner: the task title framed by two 70-character rules.
banner_rule = "=" * 70
for banner_line in (
    banner_rule,
    "TASK 1: DATA PREPROCESSING FOR MACHINE LEARNING",
    banner_rule,
):
    print(banner_line)

TASK 1: DATA PREPROCESSING FOR MACHINE LEARNING


In [3]:
# ==============================================================================
# STEP 1: LOAD THE DATASET
# ==============================================================================
# Reads 'churn-bigml-80.csv' from the current working directory into `df`.
# If the file is missing, a hint is printed and the FileNotFoundError is
# re-raised so the cell fails visibly and later cells do not run against an
# undefined `df`.
print("\n[STEP 1] Loading Dataset...")
try:
    df = pd.read_csv('churn-bigml-80.csv')
except FileNotFoundError:
    print("✗ Error: File 'churn-bigml-80.csv' not found.")
    print("  Please ensure the file is in the same directory as this script.")
    # Re-raise rather than calling exit(): inside a notebook exit() ends the
    # kernel session (discarding all state) instead of just stopping this cell.
    raise
else:
    # Only reached when the read succeeded, so `df` is guaranteed to exist here.
    print("✓ Dataset loaded successfully!")
    print(f"  Shape: {df.shape[0]} rows × {df.shape[1]} columns")


[STEP 1] Loading Dataset...
✓ Dataset loaded successfully!
  Shape: 2666 rows × 20 columns


In [9]:
# ==============================================================================
# STEP 2: EXPLORATORY DATA ANALYSIS
# ==============================================================================
# Section header followed by a preview of the first five records of `df`.
print("\n[STEP 2] Exploratory Data Analysis...", "-" * 70, sep="\n")

# Peek at the top of the frame
preview_rows = df.head()
print("\nFirst 5 rows of the dataset:", preview_rows, sep="\n")




[STEP 2] Exploratory Data Analysis...
----------------------------------------------------------------------

First 5 rows of the dataset:
  State  Account length  Area code International plan Voice mail plan  \
0    KS             128        415                 No             Yes   
1    OH             107        415                 No             Yes   
2    NJ             137        415                 No              No   
3    OH              84        408                Yes              No   
4    OK              75        415                Yes              No   

   Number vmail messages  Total day minutes  Total day calls  \
0                     25              265.1              110   
1                     26              161.6              123   
2                      0              243.4              114   
3                      0              299.4               71   
4                      0              166.7              113   

   Total day charge  Total eve minut

In [10]:
# Descriptive statistics (count, mean, std, min, quartiles, max) from describe()
summary_stats = df.describe()
print("\nStatistical Summary:", summary_stats, sep="\n")


Statistical Summary:
       Account length    Area code  Number vmail messages  Total day minutes  \
count     2666.000000  2666.000000            2666.000000         2666.00000   
mean       100.620405   437.438860               8.021755          179.48162   
std         39.563974    42.521018              13.612277           54.21035   
min          1.000000   408.000000               0.000000            0.00000   
25%         73.000000   408.000000               0.000000          143.40000   
50%        100.000000   415.000000               0.000000          179.95000   
75%        127.000000   510.000000              19.000000          215.90000   
max        243.000000   510.000000              50.000000          350.80000   

       Total day calls  Total day charge  Total eve minutes  Total eve calls  \
count      2666.000000       2666.000000        2666.000000      2666.000000   
mean        100.310203         30.512404         200.386159       100.023631   
std          19.9

In [5]:
# Display basic information (index range, per-column non-null counts, dtypes).
# DataFrame.info() writes its report straight to stdout and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line.
print("\nDataset Info:")
df.info()



Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2666 entries, 0 to 2665
Data columns (total 20 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   State                   2666 non-null   object 
 1   Account length          2666 non-null   int64  
 2   Area code               2666 non-null   int64  
 3   International plan      2666 non-null   object 
 4   Voice mail plan         2666 non-null   object 
 5   Number vmail messages   2666 non-null   int64  
 6   Total day minutes       2666 non-null   float64
 7   Total day calls         2666 non-null   int64  
 8   Total day charge        2666 non-null   float64
 9   Total eve minutes       2666 non-null   float64
 10  Total eve calls         2666 non-null   int64  
 11  Total eve charge        2666 non-null   float64
 12  Total night minutes     2666 non-null   float64
 13  Total night calls       2666 non-null   int64  
 14  Total night charge      2

In [7]:
# Count nulls per column and list only the columns that actually contain some
print("\nMissing Values Count:")
missing_values = df.isnull().sum()
if missing_values.sum() > 0:
    print(missing_values[missing_values > 0])
else:
    print("No missing values found!")


Missing Values Count:
No missing values found!


In [8]:
# Partition column names by dtype: int64/float64 columns versus object/bool ones
print("\nColumn Data Types:")
numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns.to_list()
categorical_cols = df.select_dtypes(include=['object', 'bool']).columns.to_list()
for group_label, group_cols in (
    ("Numerical", numerical_cols),
    ("Categorical", categorical_cols),
):
    print(f"  {group_label} columns ({len(group_cols)}): {group_cols}")


Column Data Types:
  Numerical columns (16): ['Account length', 'Area code', 'Number vmail messages', 'Total day minutes', 'Total day calls', 'Total day charge', 'Total eve minutes', 'Total eve calls', 'Total eve charge', 'Total night minutes', 'Total night calls', 'Total night charge', 'Total intl minutes', 'Total intl calls', 'Total intl charge', 'Customer service calls']
  Categorical columns (4): ['State', 'International plan', 'Voice mail plan', 'Churn']


In [11]:
# ==============================================================================
# STEP 3: HANDLE MISSING DATA
# ==============================================================================
# Works on a copy of `df` (named `df_processed`). When any nulls are present,
# numeric columns are imputed with the median and categorical columns with the
# most frequent value; otherwise the copy is left untouched.
print("\n[STEP 3] Handling Missing Data...")
print("-"*70)

# All later steps operate on this copy, leaving the raw frame intact
df_processed = df.copy()

# Grand total of null cells across every column
total_missing = df_processed.isnull().sum().sum()

if not total_missing > 0:
    print("✓ No missing values found in the dataset!")
else:
    print(f"Found {total_missing} missing values. Applying imputation strategies...")

    # Numeric columns -> median
    if numerical_cols:
        num_imputer = SimpleImputer(strategy='median')
        imputed_numeric = num_imputer.fit_transform(df_processed[numerical_cols])
        df_processed[numerical_cols] = imputed_numeric
        print("✓ Numerical columns: Filled with median")

    # Categorical columns -> mode
    if categorical_cols:
        cat_imputer = SimpleImputer(strategy='most_frequent')
        imputed_categorical = cat_imputer.fit_transform(df_processed[categorical_cols])
        df_processed[categorical_cols] = imputed_categorical
        print("✓ Categorical columns: Filled with most frequent value")

    remaining_missing = df_processed.isnull().sum().sum()
    print(f"✓ Missing values after imputation: {remaining_missing}")



[STEP 3] Handling Missing Data...
----------------------------------------------------------------------
✓ No missing values found in the dataset!


In [12]:
# ==============================================================================
# STEP 4: ENCODE CATEGORICAL VARIABLES
# ==============================================================================
# Encodes every column listed in `categorical_cols` on `df_processed`:
#   * exactly 2 unique values -> integer label encoding (mapping recorded)
#   * the target column       -> always label encoded, so it stays ONE column
#   * at most 10 unique values -> one-hot encoding with the first level dropped
#   * more than 10            -> label encoding (avoids a column explosion)
# `encoding_mappings` records what was done to each column, and `target_col`
# is left set for the scaling and splitting steps.
print("\n[STEP 4] Encoding Categorical Variables...")
print("-"*70)

# Separate target variable if it exists (common names: 'Churn', 'Target', etc.)
# The first matching column wins; str() guards against non-string labels.
target_col = next(
    (
        col for col in df_processed.columns
        if 'churn' in str(col).lower() or 'target' in str(col).lower()
    ),
    None,
)

# Create encoding mappings dictionary for reference
encoding_mappings = {}

if len(categorical_cols) > 0:
    for col in categorical_cols:
        # nunique() ignores NaN, so missing values do not count as a category
        unique_values = df_processed[col].nunique()

        # Binary columns: Use Label Encoding
        if unique_values == 2:
            le = LabelEncoder()
            df_processed[col] = le.fit_transform(df_processed[col])
            # classes_ is sorted and class i is encoded as i; .tolist() yields
            # plain Python keys so the printed mapping does not depend on how
            # the installed numpy renders its scalar types.
            encoding_mappings[col] = dict(zip(le.classes_.tolist(), range(len(le.classes_))))
            print(f"✓ {col}: Label Encoded (Binary) - {encoding_mappings[col]}")

        # Target column: never one-hot encode it. One-hot encoding would remove
        # `target_col` from the frame and the later split on that name would fail.
        elif col == target_col:
            le = LabelEncoder()
            df_processed[col] = le.fit_transform(df_processed[col])
            encoding_mappings[col] = f"Label Encoded ({unique_values} categories)"
            print(f"✓ {col}: Label Encoded (Target - {unique_values} classes)")

        # Low cardinality (up to 10 unique values): Use One-Hot Encoding
        # Note: with drop_first=True a constant column yields no dummy columns,
        # so it is effectively dropped.
        elif unique_values <= 10:
            # Get one-hot encoded columns
            one_hot = pd.get_dummies(df_processed[col], prefix=col, drop_first=True)
            # Drop original column and add one-hot encoded columns
            df_processed = pd.concat([df_processed.drop(col, axis=1), one_hot], axis=1)
            encoding_mappings[col] = f"One-Hot Encoded ({unique_values} categories)"
            print(f"✓ {col}: One-Hot Encoded ({unique_values} unique values)")

        # High cardinality: Use Label Encoding (to avoid dimension explosion)
        # NOTE(review): integer codes impose an arbitrary order on nominal
        # categories; acceptable for tree models, questionable for linear ones.
        else:
            le = LabelEncoder()
            df_processed[col] = le.fit_transform(df_processed[col])
            encoding_mappings[col] = f"Label Encoded ({unique_values} categories)"
            print(f"✓ {col}: Label Encoded (High cardinality - {unique_values} values)")

    print("\n✓ All categorical variables encoded successfully!")
    print(f"  Dataset shape after encoding: {df_processed.shape}")
else:
    print("✓ No categorical variables found!")


[STEP 4] Encoding Categorical Variables...
----------------------------------------------------------------------
✓ State: Label Encoded (High cardinality - 51 values)
✓ International plan: Label Encoded (Binary) - {'No': 0, 'Yes': 1}
✓ Voice mail plan: Label Encoded (Binary) - {'No': 0, 'Yes': 1}
✓ Churn: Label Encoded (Binary) - {False: 0, True: 1}

✓ All categorical variables encoded successfully!
  Dataset shape after encoding: (2666, 20)


In [13]:
# ==============================================================================
# STEP 5: NORMALIZE/STANDARDIZE NUMERICAL FEATURES
# ==============================================================================
# Standardizes (zero mean, unit variance) the feature columns that were numeric
# in the raw data, i.e. the `numerical_cols` list built during the EDA step.
#
# The columns are deliberately NOT re-selected by dtype after encoding: whether
# a label-encoded column comes back as int64 is not guaranteed, so filtering on
# ['int64', 'float64'] here can pick up a different set of columns from one
# environment to the next (the recorded run scaled 16 columns although four
# columns had been label encoded). Encoded category codes and the target are
# therefore never scaled.
print("\n[STEP 5] Normalizing/Standardizing Numerical Features...")
print("-"*70)

# Originally numeric columns that still exist, minus the target
numerical_cols_final = [
    col for col in numerical_cols
    if col in df_processed.columns and col != target_col
]

# Remove target column from scaling if it exists
if target_col is not None and target_col in df_processed.columns:
    print(f"Note: Target column '{target_col}' excluded from scaling")

if len(numerical_cols_final) > 0:
    # Initialize StandardScaler (Z-score normalization: mean=0, std=1)
    scaler = StandardScaler()

    # Fit and transform numerical features
    # NOTE(review): the scaler is fitted on ALL rows, including those the later
    # train/test split assigns to the test set. For a leakage-free evaluation,
    # split first and fit the scaler on the training rows only.
    df_processed[numerical_cols_final] = scaler.fit_transform(df_processed[numerical_cols_final])

    print("✓ Standardization applied using StandardScaler (Z-score normalization)")
    print(f"  Features scaled: {len(numerical_cols_final)} columns")
    print("  Method: (X - mean) / standard_deviation")
    print("\nScaled features statistics:")
    print(df_processed[numerical_cols_final].describe().round(3))
else:
    print("✓ No numerical features to scale!")


[STEP 5] Normalizing/Standardizing Numerical Features...
----------------------------------------------------------------------
Note: Target column 'Churn' excluded from scaling
✓ Standardization applied using StandardScaler (Z-score normalization)
  Features scaled: 16 columns
  Method: (X - mean) / standard_deviation

Scaled features statistics:
       Account length  Area code  Number vmail messages  Total day minutes  \
count        2666.000   2666.000               2666.000           2666.000   
mean           -0.000     -0.000                 -0.000              0.000   
std             1.000      1.000                  1.000              1.000   
min            -2.518     -0.692                 -0.589             -3.311   
25%            -0.698     -0.692                 -0.589             -0.666   
50%            -0.016     -0.528                 -0.589              0.009   
75%             0.667      1.707                  0.807              0.672   
max             3.599    

In [14]:
# ==============================================================================
# STEP 6: SPLIT DATASET INTO TRAINING AND TESTING SETS
# ==============================================================================
# Separates features from the target, then makes a seeded 80/20 split that is
# stratified on the target whenever it has fewer than 50 distinct values.
print("\n[STEP 6] Splitting Dataset into Training and Testing Sets...")
print("-"*70)

# Features (X) and target (y); without a detected target the last column is used
if not target_col:
    X = df_processed.iloc[:, :-1]
    y = df_processed.iloc[:, -1]
    print("✓ Using last column as target variable")
else:
    X = df_processed.drop(columns=[target_col])
    y = df_processed[target_col]
    print(f"✓ Target variable identified: '{target_col}'")

# Stratify only for classification-like targets (fewer than 50 distinct values)
stratify_labels = y if y.nunique() < 50 else None

# 80-20 split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=stratify_labels,
)

train_rows = X_train.shape[0]
test_rows = X_test.shape[0]
print("\n✓ Dataset split completed!")
print(f"  Training set: {train_rows} samples ({train_rows/len(df)*100:.1f}%)")
print(f"  Testing set:  {test_rows} samples ({test_rows/len(df)*100:.1f}%)")
print(f"  Features: {X_train.shape[1]} columns")

# Class proportions in each partition
print("\nTarget distribution:")
for split_name, split_labels in (("Training", y_train), ("Testing", y_test)):
    print(f"  {split_name} set:\n{split_labels.value_counts(normalize=True).round(3)}")


[STEP 6] Splitting Dataset into Training and Testing Sets...
----------------------------------------------------------------------
✓ Target variable identified: 'Churn'

✓ Dataset split completed!
  Training set: 2132 samples (80.0%)
  Testing set:  534 samples (20.0%)
  Features: 19 columns

Target distribution:
  Training set:
0    0.855
1    0.145
Name: Churn, dtype: float64
  Testing set:
0    0.854
1    0.146
Name: Churn, dtype: float64


In [15]:
# ==============================================================================
# STEP 7: SUMMARY AND EXPORT
# ==============================================================================
# Prints a recap of the pipeline, then attempts (best effort) to write the
# processed frame and the four split parts to CSV files in the working directory.
print("\n[STEP 7] Summary & Final Output...")
print("-"*70)

summary_lines = [
    "\n📊 PREPROCESSING SUMMARY:",
    f"  ✓ Original dataset shape: {df.shape}",
    f"  ✓ Processed dataset shape: {df_processed.shape}",
    "  ✓ Missing values handled: Yes",
    f"  ✓ Categorical encoding: {len(encoding_mappings)} variables encoded",
    f"  ✓ Numerical scaling: {len(numerical_cols_final)} features standardized",
    "  ✓ Train-test split: 80-20 split completed",
]
print("\n".join(summary_lines))

export_lines = [
    "\n💾 EXPORTABLE DATASETS:",
    "  - X_train: Training features",
    "  - X_test: Testing features",
    "  - y_train: Training labels",
    "  - y_test: Testing labels",
]
print("\n".join(export_lines))

# Optional: persist the results; any failure is reported, not raised
try:
    # Full preprocessed dataset first
    df_processed.to_csv('preprocessed_churn_data.csv', index=False)
    print("\n✓ Preprocessed data saved as 'preprocessed_churn_data.csv'")

    # Then the individual split parts, in a fixed order
    split_outputs = {
        'X_train.csv': X_train,
        'X_test.csv': X_test,
        'y_train.csv': y_train,
        'y_test.csv': y_test,
    }
    for file_name, split_part in split_outputs.items():
        split_part.to_csv(file_name, index=False)
    print("✓ Train-test splits saved as separate CSV files")
except Exception as save_error:
    print(f"\nNote: Could not save files - {str(save_error)}")

closing_rule = "=" * 70
closing_lines = (
    "\n" + closing_rule,
    "✅ DATA PREPROCESSING COMPLETED SUCCESSFULLY!",
    closing_rule,
    "\n🎯 Next Steps:",
    "  1. Use X_train and y_train to train your ML models",
    "  2. Evaluate model performance using X_test and y_test",
    "  3. Consider feature selection/engineering for optimization",
    closing_rule,
)
for closing_line in closing_lines:
    print(closing_line)


[STEP 7] Summary & Final Output...
----------------------------------------------------------------------

📊 PREPROCESSING SUMMARY:
  ✓ Original dataset shape: (2666, 20)
  ✓ Processed dataset shape: (2666, 20)
  ✓ Missing values handled: Yes
  ✓ Categorical encoding: 4 variables encoded
  ✓ Numerical scaling: 16 features standardized
  ✓ Train-test split: 80-20 split completed

💾 EXPORTABLE DATASETS:
  - X_train: Training features
  - X_test: Testing features
  - y_train: Training labels
  - y_test: Testing labels

✓ Preprocessed data saved as 'preprocessed_churn_data.csv'
✓ Train-test splits saved as separate CSV files

✅ DATA PREPROCESSING COMPLETED SUCCESSFULLY!

🎯 Next Steps:
  1. Use X_train and y_train to train your ML models
  2. Evaluate model performance using X_test and y_test
  3. Consider feature selection/engineering for optimization
