In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import os

# Make sure the output folder for processed artifacts exists.
# exist_ok=True lets this cell be re-run without raising if it is already there.
ARTIFACT_DIR = 'processed'
os.makedirs(ARTIFACT_DIR, exist_ok=True)

# Read the raw churn dataset into the working DataFrame.
RAW_DATA_CSV = 'telecom_churn.csv'
df = pd.read_csv(RAW_DATA_CSV)

In [3]:
# Average daytime minutes per daytime call, to see whether long conversations
# correlate with churn. Zero call counts are swapped for 1 before dividing so
# the ratio never divides by zero.
safe_day_calls = df['DayCalls'].replace(0, 1)
df['MinsPerDayCall'] = df['DayMins'].div(safe_day_calls)

# Flag (as 0/1) customers who have a data plan but show zero data usage.
has_data_plan = df['DataPlan'].eq(1)
no_data_usage = df['DataUsage'].eq(0)
df['UnderusingData'] = (has_data_plan & no_data_usage).astype(int)

In [4]:
# Separate the churn label from the predictor columns.
y = df['Churn']
X = df.drop(columns=['Churn'])

# Hold out 20% of the rows for testing (80% train). The split is stratified
# on y and uses a fixed random_state so it is reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Report the resulting partition sizes as a quick sanity check.
print(
    f"Training shapes: {X_train.shape}, {y_train.shape}",
    f"Testing shapes: {X_test.shape}, {y_test.shape}",
    sep="\n",
)

Training shapes: (267, 12), (267,)
Testing shapes: (67, 12), (67,)


In [5]:
# Standardize the feature columns.
scaler = StandardScaler()

# Fit on the training split only, so no statistics from the test rows leak
# into the transformation.
X_train_scaled = scaler.fit_transform(X_train)

# Transform the test split with the SAME mean and std learned from training.
X_test_scaled = scaler.transform(X_test)

# The scaler returns plain arrays; wrap them back into DataFrames.
# Column names AND the original row index are carried over: y_train / y_test
# still hold the shuffled index produced by the split, so rebuilding the
# frames with a fresh 0..n-1 index would silently misalign any index-based
# operation between features and labels (concat, boolean masks, joins).
X_train_final = pd.DataFrame(
    X_train_scaled, columns=X_train.columns, index=X_train.index
)
X_test_final = pd.DataFrame(
    X_test_scaled, columns=X_test.columns, index=X_test.index
)

In [6]:
# Write the scaled feature matrices and their labels to disk.
# index=False keeps the row index out of the CSVs, so rows in X_*.csv and
# y_*.csv correspond to each other by position.
output_dir = 'processed'
X_train_final.to_csv(f'{output_dir}/X_train.csv', index=False)
X_test_final.to_csv(f'{output_dir}/X_test.csv', index=False)
y_train.to_csv(f'{output_dir}/y_train.csv', index=False)
y_test.to_csv(f'{output_dir}/y_test.csv', index=False)

# Build the message from output_dir so it always names the directory the
# files were actually written to.
print(f"Data processing complete. Files saved to {output_dir}/")

Data processing complete. Files saved to data/processed/
