In [1]:
import pandas as pd
import numpy as np
import yaml
import os
import json
import sys

In [2]:
# Make the project module on Google Drive importable.
# The membership check keeps this cell idempotent: re-running it does not
# keep appending duplicate entries to sys.path.
module_dir = '/content/drive/MyDrive'
if module_dir not in sys.path:
    sys.path.append(module_dir)

# Import the DataPreprocessor class
# NOTE(review): presumably preprocess.py lives in module_dir — confirm.
from preprocess import DataPreprocessor


In [3]:
# Read the preprocessing configuration (YAML) from Google Drive.
config_file_path = "/content/drive/MyDrive/config_process.yaml"
with open(config_file_path, "r") as config_handle:
    config_process = yaml.safe_load(config_handle)

# Show the names of the top-level configuration sections.
section_names = [*config_process.keys()]
print("Configuration sections:", section_names)

Configuration sections: ['logging', 'output_path', 'target_column', 'drop_columns', 'numerical_features', 'categorical_features', 'combined_features', 'schema']


In [4]:
# Read the raw dataset from Google Drive into a DataFrame.
raw_data_path = "/content/drive/MyDrive/cell2cell-duke univeristy.csv"
data_raw = pd.read_csv(filepath_or_buffer=raw_data_path)

# Show the first five rows as a quick sanity check of the load.
print("Raw data preview:")
display(data_raw.head(n=5))

Raw data preview:


Unnamed: 0.1,Unnamed: 0,X,customer,traintest,churn,churndep,revenue,mou,recchrge,directas,...,retaccpt,newcelly,newcelln,refer,incmiss,income,mcycle,setprcm,setprc,retcall
0,1,1,1000002,0,0,,57.4925,482.75,37.424999,0.2475,...,0,0,1,0,0,5,0,0,149.98999,0
1,2,2,1000006,0,0,,82.275002,1312.25,75.0,1.2375,...,0,1,0,0,0,6,0,0,9.989998,0
2,3,3,1000010,0,0,,31.6625,25.5,29.99,0.2475,...,0,0,1,0,0,9,0,0,29.98999,0
3,4,4,1000011,0,0,,62.127499,97.5,65.985001,2.475,...,0,1,0,0,0,6,0,0,29.98999,0
4,5,5,1000014,0,0,,25.225,2.5,25.0,0.0,...,0,1,0,0,0,7,0,0,29.98999,0


In [5]:
# Initial data inspection: per-column dtypes and non-null counts, followed by
# the ten columns with the most missing values.

print("Data Info:")
data_raw.info()

print("\nMissing Values (Top 10):")
missing_per_column = data_raw.isnull().sum()
top_missing = missing_per_column.sort_values(ascending=False).head(10)
print(top_missing)

Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 71047 entries, 0 to 71046
Data columns (total 72 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Unnamed: 0  71047 non-null  int64  
 1   X           71047 non-null  int64  
 2   customer    71047 non-null  int64  
 3   traintest   71047 non-null  int64  
 4   churn       71047 non-null  int64  
 5   churndep    40000 non-null  float64
 6   revenue     70831 non-null  float64
 7   mou         70831 non-null  float64
 8   recchrge    70831 non-null  float64
 9   directas    70831 non-null  float64
 10  overage     70831 non-null  float64
 11  roam        70831 non-null  float64
 12  changem     70545 non-null  float64
 13  changer     70545 non-null  float64
 14  dropvce     71047 non-null  float64
 15  blckvce     71047 non-null  float64
 16  unansvce    71047 non-null  float64
 17  custcare    71047 non-null  float64
 18  threeway    71047 non-null  float64
 19  mourec      71

In [6]:
# Build the preprocessor from the YAML config path and the raw DataFrame.
preprocessor_kwargs = {
    "config_path": "/content/drive/MyDrive/config_process.yaml",
    "data_raw": data_raw,
}
preprocessor = DataPreprocessor(**preprocessor_kwargs)

# Report the shape of the frame held by the preprocessor and its target column.
print(f"Data shape: {preprocessor.df.shape}, Target column: {preprocessor.target_col}")

Data shape: (71047, 72), Target column: churn


Preprocessing Pipeline


In [7]:
# Ensure the output folder exists (created if missing; no error if present).
# NOTE(review): the config exposes an 'output_path' key — confirm it points
# at this same folder.
output_dir = 'data/processed'
os.makedirs(output_dir, exist_ok=True)

# Execute the preprocessing pipeline and keep what it returns.
print("\nRunning preprocessing pipeline...")
processed_df = preprocessor.run_preprocessing_pipeline()

# Preview the first rows of the result.
print("\nProcessed data preview:")
processed_preview = processed_df.head()
display(processed_preview)



Running preprocessing pipeline...
Starting full preprocessing pipeline...
Preprocessing pipeline completed successfully.

Processed data preview:


Unnamed: 0,churn,revenue,mou,recchrge,directas,roam,changem,changer,dropvce,blckvce,...,refer,incmiss,income,mcycle,setprcm,setprc,retcall,engagement_index,model_change_rate,overage_ratio
0,0,-0.03008,-0.080266,-0.395564,-0.293698,-0.134256,2.134572,1.350836,0.257978,-0.2875,...,-0.175091,0,0.212229,0,0,2.001934,0,-0.346504,0.290128,-0.063587
1,0,0.530869,1.48661,1.17801,0.157401,-0.134256,0.658604,0.241872,5.106564,0.337263,...,-0.175091,0,0.531001,0,0,-0.45246,0,1.762407,-0.458137,-0.6835
2,0,-0.614739,-0.943985,-0.706928,-0.293698,-0.134256,0.276347,0.135291,-0.667325,-0.2875,...,-0.175091,0,1.487316,0,0,-0.101833,0,-0.599625,-1.082082,-0.6835
3,0,0.074833,-0.807981,0.800478,0.721275,-0.134256,0.134842,0.207639,-0.667325,-0.349977,...,-0.175091,0,0.531001,0,0,-0.101833,0,-0.582883,-0.781251,-0.6835
4,0,-0.760451,-0.98743,-0.9159,-0.406473,-0.134256,0.032645,0.025217,-0.667325,-0.381215,...,-0.175091,0,0.849772,0,0,-0.101833,0,-0.597934,-1.032563,-0.6835


In [8]:
# Post-processing validation: total missing values, summary statistics for
# the first ten described columns, and the class balance of the target.
print("\nChecking for missing values:")
total_missing = processed_df.isnull().sum().sum()
print(total_missing)

print("\nSummary Statistics:")
summary_head = processed_df.describe().T.head(10)
display(summary_head)

print("\nTarget variable distribution:")
target_counts = processed_df[preprocessor.target_col].value_counts()
print(target_counts)


Checking for missing values:
0

Summary Statistics:


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
churn,71047.0,0.2900756,0.4538,0.0,0.0,0.0,1.0,1.0
revenue,71047.0,-6.160631e-17,1.000007,-1.471016,-0.568705,-0.232945,0.273567,26.359633
mou,71047.0,-2.2402290000000002e-17,1.000007,-0.992153,-0.692283,-0.3008,0.367413,13.491777
recchrge,71047.0,6.960713000000001e-17,1.000007,-2.43566,-0.706509,-0.078755,0.549418,14.788007
directas,71047.0,-3.9204010000000006e-17,1.000007,-0.406473,-0.406473,-0.293698,0.044626,72.220516
roam,71047.0,8.000819e-18,1.000007,-0.134256,-0.134256,-0.134256,-0.105858,122.55144
changem,71047.0,-7.200737e-18,1.000007,-15.188884,-0.280826,0.022818,0.297474,20.451506
changer,71047.0,-8.60088e-18,1.000007,-28.642462,-0.149634,0.023405,0.070644,64.315205
dropvce,71047.0,-9.600983e-18,1.000007,-0.667325,-0.593301,-0.334216,0.183954,23.945729
blckvce,71047.0,-3.680377e-17,1.000007,-0.381215,-0.381215,-0.2875,-0.037595,35.636382



Target variable distribution:
churn
0    50438
1    20609
Name: count, dtype: int64


In [10]:
# Inspect what is currently on disk in the output folder.

processed_dir_listing = os.listdir("data/processed/")
print("\nFiles in data/processed/:", processed_dir_listing)

# Load the JSON artifacts file from the output folder.
with open("data/processed/preprocessing_artifacts.json", "r") as artifacts_file:
    artifacts = json.load(artifacts_file)

# Show which top-level entries the artifacts file contains.
print("Artifacts keys:", artifacts.keys())


Files in data/processed/: ['processed_data.csv', 'preprocessing_artifacts.json']
Artifacts keys: dict_keys(['label_encoders', 'scaler_params', 'feature_names', 'preprocessing_timestamp'])


In [11]:
#  Verify Saved Outputs
# The previous cell already lists the folder and prints the artifact keys.
# This cell turns that inspection into an explicit check, so a missing output
# fails loudly instead of depending on someone reading the printout.
expected_files = {"processed_data.csv", "preprocessing_artifacts.json"}
saved_files = os.listdir("data/processed/")
print("\nFiles in data/processed/:", saved_files)

# Set difference leaves only the expected files that are absent on disk.
missing_files = expected_files - set(saved_files)
assert not missing_files, f"Missing expected outputs: {sorted(missing_files)}"

with open("data/processed/preprocessing_artifacts.json", "r") as f:
    artifacts = json.load(f)

print("Artifacts keys:", artifacts.keys())



Files in data/processed/: ['processed_data.csv', 'preprocessing_artifacts.json']
Artifacts keys: dict_keys(['label_encoders', 'scaler_params', 'feature_names', 'preprocessing_timestamp'])
