In [1]:
import pandas as pd

# Locations of the three raw inputs, named once so they are easy to spot and change.
bss_csv_path = 'C:/My_Projects/churn-prediction-mlops-pipeline/data/raw-data/bss_data.csv'
complaints_csv_path = 'C:/My_Projects/churn-prediction-mlops-pipeline/data/raw-data/complaints_data.csv'
network_csv_path = 'C:/My_Projects/churn-prediction-mlops-pipeline/data/raw-data/network_data.csv'

# Read each raw CSV into its own DataFrame with pandas' default parsing options.
df_bss = pd.read_csv(bss_csv_path)
df_complaints = pd.read_csv(complaints_csv_path)
df_network = pd.read_csv(network_csv_path)

In [2]:
# === 1. Preprocess BSS data ===
# Mapping used to turn the textual Churn flag into a numeric label.
churn_encoding = {'Yes': 1, 'No': 0}

df_bss = df_bss.assign(
    # Parse TotalCharges as numbers; entries that cannot be parsed become NaN.
    TotalCharges=pd.to_numeric(df_bss['TotalCharges'], errors='coerce'),
    # Any Churn value other than 'Yes' / 'No' has no mapping and becomes NaN.
    Churn_Label=df_bss['Churn'].map(churn_encoding),
)

In [3]:
# === 2. Preprocess complaints data ===
# Give the generic 'date' / 'issue' columns complaint-specific names.
df_complaints = df_complaints.rename(
    columns={'date': 'complaint_date', 'issue': 'complaint_type'}
)


def _count_resolved(status):
    """Return how many entries of `status` are exactly 'Resolved'."""
    return (status == 'Resolved').sum()


def _count_unresolved(status):
    """Return how many entries of `status` are not 'Resolved' (missing values included)."""
    return (status != 'Resolved').sum()


# Aggregate complaints per customer: one output row per customerID with the
# number of non-null complaint ids and the resolved / unresolved split.
complaints_by_customer = df_complaints.groupby('customerID')
agg_complaints = complaints_by_customer.agg(
    total_complaints=pd.NamedAgg(column='complaint_id', aggfunc='count'),
    resolved_complaints=pd.NamedAgg(column='status', aggfunc=_count_resolved),
    unresolved_complaints=pd.NamedAgg(column='status', aggfunc=_count_unresolved),
).reset_index()

In [4]:
# === 3. Preprocess network data ===
# Per-record derived metrics:
#   call_drop_rate - dropped_calls as a fraction of total_calls
#   data_usage_gb  - data_volume_MB divided by 1024
df_network = df_network.assign(
    call_drop_rate=df_network['dropped_calls'].div(df_network['total_calls']),
    data_usage_gb=df_network['data_volume_MB'].div(1024),
)

# Average each metric over all records of a customer (one row per customerID).
network_metric_cols = ['call_drop_rate', 'data_usage_gb', 'throughput_Mbps']
agg_network = (
    df_network
    .groupby('customerID')[network_metric_cols]
    .mean()
    .reset_index()
)

In [5]:
# === 4. Merge all data ===
# Left joins keep every BSS customer, whether or not they appear in the
# complaints or network aggregates.
df_merged = df_bss.merge(agg_complaints, on='customerID', how='left')
df_merged = df_merged.merge(agg_network, on='customerID', how='left')

# A customer with no row in agg_complaints has filed no complaints, so the NaN
# that the left join leaves in the count columns really means zero. Fill it and
# restore the integer dtype (the NaNs forced the columns to float).
# The network metrics are left as NaN on purpose: a missing measurement is
# unknown, not zero.
complaint_count_cols = ['total_complaints', 'resolved_complaints', 'unresolved_complaints']
df_merged[complaint_count_cols] = (
    df_merged[complaint_count_cols].fillna(0).astype('int64')
)

In [7]:
# Save preprocessed data
# The project folder is spelled "My_Projects", exactly as in the raw-data paths
# the inputs are read from. The previous "My_projects" spelling only resolved to
# the same folder on case-insensitive filesystems.
# index=False keeps the DataFrame's positional index out of the CSV.
df_merged.to_csv("C:/My_Projects/churn-prediction-mlops-pipeline/data/processed-data/preprocessed_data.csv", index=False)