In [1]:
# Cell 1: Import all libraries
#
import pandas as pd
import numpy as np
import joblib  # Using joblib instead of pickle, it's more efficient for scikit-learn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, f1_score

# Import the imblearn pipeline and SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE

print("Libraries imported successfully.")

Libraries imported successfully.


In [2]:
#Cell 2: Load the "v1" Data
#
# --- IMPORTANT ---
# Change this path to point to where you saved your 5,000-row file
DATA_V1_PATH = r"C:\Users\fandi\Downloads\churn_data_v1.csv"
try:
    df = pd.read_csv(DATA_V1_PATH)
    print(f"Data loaded successfully. Shape: {df.shape}")
    print(df.head())
except FileNotFoundError:
    print(f"Error: Could not find file at {DATA_V1_PATH}")
    print("Please update the DATA_V1_PATH variable.")

Data loaded successfully. Shape: (4999, 21)
   customerID  gender  SeniorCitizen Partner Dependents  tenure PhoneService  \
0  7590-VHVEG  Female              0     Yes         No       1           No   
1  5575-GNVDE    Male              0      No         No      34          Yes   
2  3668-QPYBK    Male              0      No         No       2          Yes   
3  7795-CFOCW    Male              0      No         No      45           No   
4  9237-HQITU  Female              0      No         No       2          Yes   

      MultipleLines InternetService OnlineSecurity  ... DeviceProtection  \
0  No phone service             DSL             No  ...               No   
1                No             DSL            Yes  ...              Yes   
2                No             DSL            Yes  ...               No   
3  No phone service             DSL            Yes  ...              Yes   
4                No     Fiber optic             No  ...               No   

  TechSupport Stre

In [3]:
# Cell 3: Basic Preprocessing & Cleaning (NEW ROBUST VERSION)
#
print("\nStarting preprocessing...")

# 1. Fix TotalCharges: convert to numeric and fill missing
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')
median_charges = df['TotalCharges'].median()
df['TotalCharges'] = df['TotalCharges'].fillna(median_charges) # Safer syntax
print(f"Filled 'TotalCharges' NaNs with median: {median_charges}")

# 2. Drop the customerID column (it's not a feature)
if 'customerID' in df.columns:
    df.drop('customerID', axis=1, inplace=True)
    print("Dropped 'customerID' column.")

# 3. *** NEW ROBUST MAPPING ***
#    This checks if the column is *already* numbers. If it is, it skips the map.
#    This prevents the "re-run" error!
if pd.api.types.is_string_dtype(df['Churn']):
    df['Churn'] = df['Churn'].map({'Yes': 1, 'No': 0})
    print("Mapped 'Churn' from string ('Yes'/'No') to 1/0.")
else:
    print("'Churn' column is already numeric, mapping skipped.")

# 4. Handle potential missing target values
#    This was our original 'dirty data' fix. It's still good to keep.
initial_rows = len(df)
df.dropna(subset=['Churn'], inplace=True)
final_rows = len(df)
if initial_rows > final_rows:
    print(f"Dropped {initial_rows - final_rows} rows with missing 'Churn' (target) values.")

# 5. Convert Churn to integer (it's now 0.0 or 1.0)
df['Churn'] = df['Churn'].astype(int)
print("Preprocessing complete.")


Starting preprocessing...
Filled 'TotalCharges' NaNs with median: 1397.65
Dropped 'customerID' column.
Mapped 'Churn' from string ('Yes'/'No') to 1/0.
Preprocessing complete.


In [4]:
# Cell 4: Define Features (X) and Target (y)
#
X = df.drop('Churn', axis=1)
y = df['Churn']

print(f"\nFeatures (X) shape: {X.shape}")
print(f"Target (y) shape: {y.shape}")
print(f"Class distribution:\n{y.value_counts(normalize=True)}")


Features (X) shape: (4999, 19)
Target (y) shape: (4999,)
Class distribution:
Churn
0    0.737347
1    0.262653
Name: proportion, dtype: float64


In [5]:
# Cell 5: Build the Preprocessing Pipeline
#
print("\nBuilding preprocessing pipeline...")

# Define which columns are which type
# Note: SeniorCitizen is 0/1 but we'll treat it as categorical for one-hot encoding
categorical_features = [
    'gender', 'SeniorCitizen', 'Partner', 'Dependents', 'PhoneService', 
    'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 
    'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 
    'Contract', 'PaperlessBilling', 'PaymentMethod'
]
numerical_features = ['tenure', 'MonthlyCharges', 'TotalCharges']

# Create the preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_features)
    ],
    remainder='passthrough' # Pass through any columns not listed
)

print("Preprocessor created.")


Building preprocessing pipeline...
Preprocessor created.


In [6]:
# Cell 6: Split Data into Train and Test Sets
#
# We use stratify=y to ensure the test set has the same class imbalance as the full dataset
X_train, X_test, y_train, y_test = train_test_split(
    X, y, 
    test_size=0.2, 
    random_state=42, 
    stratify=y
)

print(f"\nTraining set shape: {X_train.shape}")
print(f"Test set shape: {X_test.shape}")
print(f"Test set class distribution:\n{y_test.value_counts(normalize=True)}")


Training set shape: (3999, 19)
Test set shape: (1000, 19)
Test set class distribution:
Churn
0    0.737
1    0.263
Name: proportion, dtype: float64


In [7]:
# Cell 7: Create the Full Model Pipeline (Handling Imbalance)
#
print("\nCreating full model pipeline with SMOTE...")

# We use the special Pipeline from imbalanced-learn
# This pipeline correctly applies SMOTE only to the training data during .fit()
model_pipeline = ImbPipeline(steps=[
    ('preprocessor', preprocessor),
    ('smote', SMOTE(random_state=42)),
    ('classifier', RandomForestClassifier(random_state=42, n_estimators=100))
])

print("Pipeline created successfully.")
print(model_pipeline)


Creating full model pipeline with SMOTE...
Pipeline created successfully.
Pipeline(steps=[('preprocessor',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('num', StandardScaler(),
                                                  ['tenure', 'MonthlyCharges',
                                                   'TotalCharges']),
                                                 ('cat',
                                                  OneHotEncoder(handle_unknown='ignore',
                                                                sparse_output=False),
                                                  ['gender', 'SeniorCitizen',
                                                   'Partner', 'Dependents',
                                                   'PhoneService',
                                                   'MultipleLines',
                                                   'InternetService',
                  

In [8]:
# Cell 8: Train the Model
#
print("\nTraining the baseline model (v1)...")
# This one command runs all steps: preprocess, smote, train
model_pipeline.fit(X_train, y_train)
print("Model training complete.")


Training the baseline model (v1)...



Training the baseline model (v1)...





Training the baseline model (v1)...




Model training complete.


In [9]:
# Cell 9: Evaluate the "v1" Model (Handling Imbalance)
#
print("\nEvaluating baseline model on the test set...")
y_pred = model_pipeline.predict(X_test)

# --- KEY METRIC ---
# We focus on F1-score for the positive (Churn) class, not overall accuracy
f1 = f1_score(y_test, y_pred, pos_label=1)
print(f"*********************************************")
print(f"** Baseline Model F1-Score (Class 1 'Churn'): {f1:.4f} **")
print(f"*********************************************")

print("\nFull Classification Report:")
print(classification_report(y_test, y_pred, target_names=['No Churn (0)', 'Churn (1)']))

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))


Evaluating baseline model on the test set...
*********************************************
** Baseline Model F1-Score (Class 1 'Churn'): 0.6000 **
*********************************************

Full Classification Report:
              precision    recall  f1-score   support

No Churn (0)       0.86      0.84      0.85       737
   Churn (1)       0.58      0.62      0.60       263

    accuracy                           0.78      1000
   macro avg       0.72      0.73      0.73      1000
weighted avg       0.79      0.78      0.79      1000


Confusion Matrix:
[[622 115]
 [101 162]]


In [10]:
# Cell 10: Analyze Bias (Handling Bias)
#
print("\n--- Manual Bias Check (SeniorCitizen) ---")
# We test our pipeline's performance on a specific slice of the test data
# The model_pipeline.predict() command will correctly preprocess this slice
try:
    # Slice 1: Senior Citizens
    X_test_seniors = X_test[X_test['SeniorCitizen'] == 1]
    y_test_seniors = y_test.loc[X_test_seniors.index]
    y_pred_seniors = model_pipeline.predict(X_test_seniors)
    f1_seniors = f1_score(y_test_seniors, y_pred_seniors, pos_label=1)
    print(f"F1-Score for Seniors (1): {f1_seniors:.4f} (on {len(y_test_seniors)} samples)")

    # Slice 2: Non-Senior Citizens
    X_test_non_seniors = X_test[X_test['SeniorCitizen'] == 0]
    y_test_non_seniors = y_test.loc[X_test_non_seniors.index]
    y_pred_non_seniors = model_pipeline.predict(X_test_non_seniors)
    f1_non_seniors = f1_score(y_test_non_seniors, y_pred_non_seniors, pos_label=1)
    print(f"F1-Score for Non-Seniors (0): {f1_non_seniors:.4f} (on {len(y_test_non_seniors)} samples)")
    
    print("\nNote: A large difference in F1-scores indicates potential bias.")
    print("We will automate this check in our MLOps pipeline.")

except Exception as e:
    print(f"Could not perform bias check: {e}")


--- Manual Bias Check (SeniorCitizen) ---
F1-Score for Seniors (1): 0.6232 (on 154 samples)
F1-Score for Non-Seniors (0): 0.5920 (on 846 samples)

Note: A large difference in F1-scores indicates potential bias.
We will automate this check in our MLOps pipeline.


In [11]:
# Cell 11: Save Your Pipeline Object
#
MODEL_SAVE_PATH = "model_v1.pkl"
print(f"\nSaving model pipeline to {MODEL_SAVE_PATH}...")

# Use joblib.dump to save the entire pipeline (preprocessor + smote + model)
joblib.dump(model_pipeline, MODEL_SAVE_PATH)

print(f"Model saved successfully to {MODEL_SAVE_PATH}.")
print("\n--- STEP 1 COMPLETE ---")


Saving model pipeline to model_v1.pkl...
Model saved successfully to model_v1.pkl.

--- STEP 1 COMPLETE ---


In [3]:
import sys
# FIX: Add the project root to sys.path so it can find scripts/utils.py
if '.' not in sys.path:
    sys.path.append('.')

import pandas as pd
from sklearn.model_selection import train_test_split
import scripts.utils as utils  # To get the TARGET_COLUMN name
import pandas.api.types # Required for the is_string_dtype check

# --- RE-LOAD AND RE-SPLIT DATA ---
# NOTE: This MUST use the same random_state=42 and stratify=y as before 
# to generate the exact same test set.

# Assuming your original v1 data file is still accessible.
df = pd.read_csv("churn_data_v1.csv")

# Perform the same preprocessing to clean TotalCharges before splitting
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')
median_charges = df['TotalCharges'].median()
df['TotalCharges'] = df['TotalCharges'].fillna(median_charges)
if 'customerID' in df.columns:
    df.drop('customerID', axis=1, inplace=True)

# Map the target variable 'Churn' to 0 and 1 (robust fix)
if pd.api.types.is_string_dtype(df[utils.TARGET_COLUMN]):
    df[utils.TARGET_COLUMN] = df[utils.TARGET_COLUMN].map({'Yes': 1, 'No': 0})
    
df.dropna(subset=[utils.TARGET_COLUMN], inplace=True)
df[utils.TARGET_COLUMN] = df[utils.TARGET_COLUMN].astype(int)

# --- Define X and y (Crucial for stratification) ---
X = df.drop(utils.TARGET_COLUMN, axis=1)
y = df[utils.TARGET_COLUMN]


# --- Re-split to grab the X_test and y_test sets ---
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# --- Create the Golden Test Set File ---
# Combine X_test and y_test back into one DataFrame
test_set_df = pd.concat([X_test, y_test], axis=1)

# Save the combined DataFrame to a CSV file in your project root
test_set_df.to_csv("test_set.csv", index=False)

print(f"Golden Test Set saved successfully! Shape: {test_set_df.shape}")

ModuleNotFoundError: No module named 'scripts.utils'