# Course Name: **AI Mastery Bootcamp: AI Algorithms, DeepSeek AI, AI Agents**

# Section 8: Week 8: **Model Tuning and Optimization**

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder, StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split, cross_val_predict, cross_val_score
from sklearn.metrics import classification_report, roc_auc_score, accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV

In [2]:
url= "https://raw.githubusercontent.com/nikhilsthorat03/Telco-Customer-Churn/refs/heads/main/telco.csv"
df= pd.read_csv(url)
df.drop("Unnamed: 0", axis=1, inplace=True)
print(df.shape)
df.head(2)

(7043, 21)


Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,No,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,Stayed
1,5575-GNVDE,Male,No,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,Stayed


In [3]:
# Print dataset info and preview
print("Dataset Info: \n")
print(df.info())

print("\n Class Distribution \n")
print(df['Churn'].value_counts(dropna=False))

Dataset Info: 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   object 
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non

In [4]:
# Handle Missing Values
df['TotalCharges']= pd.to_numeric(df['TotalCharges'], errors= 'coerce')
df['TotalCharges'].fillna(df['TotalCharges'].mean(), inplace=True)

# Encode Categorical Variables
label_encoder= LabelEncoder()
for column in df.select_dtypes(include= ['object']).columns:
  if column != 'Churn':
    df[column]= label_encoder.fit_transform(df[column])

# Encode target variable
df['Churn']= label_encoder.fit_transform(df['Churn'])

# Scale Numerical Features
scaler= StandardScaler()
numerical_features= ['tenure', 'MonthlyCharges', 'TotalCharges']
df[numerical_features]= scaler.fit_transform(df[numerical_features])

# Feature and Target
X= df.drop(columns= ['Churn'])
y= df['Churn']

# Split dataset
X_train, X_test, y_train, y_test= train_test_split(X, y, test_size=0.2, random_state=42)

# Apply SMOTE
# smote= SMOTE(random_state=42)
# X_train_resampled, y_train_resampled= smote.fit_resample(X_train, y_train)

# Print class distribution after SMOTE
# print("\n Class Distribution After SMOTE \n")
# print(pd.Series(y_train_resampled).value_counts(dropna=False))

# Train Random Forest
rf_model= RandomForestClassifier(random_state= 42)
rf_model.fit(X_train, y_train)
y_pred_rf= rf_model.predict(X_test)
rf_initial_accuracy= accuracy_score(y_test, y_pred_rf)
print(f"Initial Random Forest Accuracy: {rf_initial_accuracy:.4f}")
print("Initial Random Forest Classification Report: \n", classification_report(y_test, y_pred_rf))

# Define parameter grid
param_dist = {
    'n_estimators': np.arange(50, 200, 10),
    'max_depth': [None, 5, 10, 15],
    'min_samples_split': [2, 5, 10, 20],
    'min_samples_leaf': [1, 2, 4]
}

# Initialize RandomizedSearchCV

random_search= RandomizedSearchCV(
    estimator= RandomForestClassifier(random_state= 42),
    param_distributions= param_dist,
    n_iter= 20,
    cv= 5,
    scoring= "accuracy",
    n_jobs= -1,
    random_state= 42
)

# Perform Randomized Search
random_search.fit(X_train, y_train)

# Get Best parameters
best_params= random_search.best_params_
print(f"Best Parameters RandomizedSearchCV: {best_params}")

# Train best model
best_model= random_search.best_estimator_

# Predict and Evaluate
y_pred_tuned= best_model.predict(X_test)
rf_tuned_accuracy= accuracy_score(y_test, y_pred_tuned)
print(f"Tuned Random Forest Model Accuracy: {rf_tuned_accuracy:.4f}")
print("Tuned Random Forest Model Classification Report: \n", classification_report(y_test, y_pred_tuned))

# Evaluate uyour Using cross-validation
cv_scores= cross_val_score(best_model, X, y, cv= 5)
print(f"Cross-Validation Accuracy Scores: {cv_scores}")
print(f"Mean Cross-Validation Accuracy Scores: {cv_scores.mean():.4f}")

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['TotalCharges'].fillna(df['TotalCharges'].mean(), inplace=True)


Initial Random Forest Accuracy: 0.8048
Initial Random Forest Classification Report: 
               precision    recall  f1-score   support

           0       0.68      0.50      0.58       373
           1       0.84      0.91      0.87      1036

    accuracy                           0.80      1409
   macro avg       0.76      0.71      0.73      1409
weighted avg       0.79      0.80      0.79      1409

Best Parameters RandomizedSearchCV: {'n_estimators': 180, 'min_samples_split': 5, 'min_samples_leaf': 2, 'max_depth': 10}
Tuned Random Forest Model Accuracy: 0.8091
Tuned Random Forest Model Classification Report: 
               precision    recall  f1-score   support

           0       0.68      0.53      0.60       373
           1       0.84      0.91      0.88      1036

    accuracy                           0.81      1409
   macro avg       0.76      0.72      0.74      1409
weighted avg       0.80      0.81      0.80      1409

Cross-Validation Accuracy Scores: [0.8097941