In [3]:
# %% [markdown]
# # Model Training for Customer Churn Prediction

# %%
import os
import sys
import pandas as pd
import joblib
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# Add utils directory to path
sys.path.append(os.path.abspath('../utils'))
from data_preprocessing import preprocess_data

# %%
# 1. Read the raw Telco churn export and hand it to the shared preprocessing step
raw_csv_path = '../data/raw/WA_Fn-UseC_-Telco-Customer-Churn.csv'
df = pd.read_csv(raw_csv_path)

# preprocess_data is imported from the ../utils directory added to sys.path above.
# It returns a pair that is unpacked as (X, y); presumably X holds the features
# and y the churn labels -- confirm against data_preprocessing.py.
X, y = preprocess_data(df)

# %%
# 2. Partition the data into training and evaluation sets
split_options = dict(
    test_size=0.2,     # 20% of the rows are held out for testing
    random_state=42,   # fixed seed so the partition is reproducible
    stratify=y,        # keep the label distribution the same in both partitions
)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# %%
# 3. Fit a random forest classifier on the training partition
forest_params = {
    'n_estimators': 100,
    'random_state': 42,          # fixed seed so repeated runs build the same forest
    'class_weight': 'balanced',  # reweight classes to compensate for class imbalance
}
model = RandomForestClassifier(**forest_params)
model.fit(X_train, y_train)

# %%
# 4. Persist the fitted model to disk
model_path = '../models/random_forest.pkl'

# Make sure the folder that will hold the pickle exists; exist_ok avoids an
# error when it is already there from a previous run.
os.makedirs(os.path.dirname(model_path), exist_ok=True)

joblib.dump(model, model_path)
print(f"Model successfully saved to {model_path}")

# %%
# 5. Sanity check: confirm the file is on disk, reload it, and score the
#    reloaded model on the held-out test partition
if not os.path.exists(model_path):
    print("Error: Model file was not created")
else:
    print("Model verification: File exists")
    # Score the reloaded copy rather than the in-memory model so the check
    # exercises the saved artifact itself.
    loaded_model = joblib.load(model_path)
    test_accuracy = loaded_model.score(X_test, y_test)
    print("Model test accuracy:", test_accuracy)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['TotalCharges'].fillna(0, inplace=True)


Model successfully saved to ../models/random_forest.pkl
Model verification: File exists
Model test accuracy: 0.7920511000709723
