# Course Name: **AI Mastery Bootcamp: AI Algorithms, DeepSeek AI, AI Agents**

# Section 7: Week 7: **Advanced Machine Learning Algorithms**

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder, StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.ensemble import RandomForestClassifier

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



In [6]:
url= "https://raw.githubusercontent.com/nikhilsthorat03/Telco-Customer-Churn/refs/heads/main/telco.csv"
df= pd.read_csv(url)
df.drop("Unnamed: 0", axis=1, inplace=True)
print(df.shape)
df.head(2)

(7043, 21)


Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,No,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,Stayed
1,5575-GNVDE,Male,No,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,Stayed


In [7]:
# Print dataset info and preview
print("Dataset Info: \n")
print(df.info())

print("\n Class Distribution \n")
print(df['Churn'].value_counts(dropna=False))

Dataset Info: 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   object 
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non

In [8]:
df['TotalCharges']= pd.to_numeric(df['TotalCharges'], errors= 'coerce')
df['TotalCharges'].fillna(df['TotalCharges'].mean(), inplace=True)

# Encode Categorical Variables
label_encoder= LabelEncoder()
for column in df.select_dtypes(include= ['object']).columns:
  if column != 'Churn':
    df[column]= label_encoder.fit_transform(df[column])

# Encode target variable
df['Churn']= label_encoder.fit_transform(df['Churn'])

# Scale Numerical Features
scaler= StandardScaler()
numerical_features= ['tenure', 'MonthlyCharges', 'TotalCharges']
df[numerical_features]= scaler.fit_transform(df[numerical_features])

# Feature and Target
X= df.drop(columns= ['Churn'])
y= df['Churn']

# Split dataset
X_train, X_test, y_train, y_test= train_test_split(X, y, test_size=0.2, random_state=42)

# Apply SMOTE
smote= SMOTE(random_state=42)
X_train_resampled, y_train_resampled= smote.fit_resample(X_train, y_train)

# Print class distribution after SMOTE
print("\n Class Distribution After SMOTE \n")
print(pd.Series(y_train_resampled).value_counts(dropna=False))

# Train Random Forest
rf_model= RandomForestClassifier(random_state= 42)
rf_model.fit(X_train_resampled, y_train_resampled)
y_pred_rf= rf_model.predict(X_test)
roc_auc_rf= roc_auc_score(y_test, rf_model.predict_proba(X_test)[:, 1])

# Train XGBoost
xgb_model= XGBClassifier(eval_metric= 'logloss', random_state= 42)
xgb_model.fit(X_train_resampled, y_train_resampled)
y_pred_xgb= xgb_model.predict(X_test)
roc_auc_xgb= roc_auc_score(y_test, xgb_model.predict_proba(X_test)[:, 1])

# Train LightGBM
lbg_model= LGBMClassifier(random_state= 42)
lbg_model.fit(X_train_resampled, y_train_resampled)
y_pred_lgb= lbg_model.predict(X_test)
roc_auc_lgb= roc_auc_score(y_test, lbg_model.predict_proba(X_test)[:, 1])

# Classification Report
print("Random Forest Report: \n", classification_report(y_test, y_pred_rf))
print("\n XGBoost Report: \n", classification_report(y_test, y_pred_xgb))
print("\n LightGBM Report: \n", classification_report(y_test, y_pred_lgb))

# ROC-AUC Comparison
print("ROC-AUC Scores: \n")
print(f"Random Forest: {roc_auc_rf:.2f}")
print(f"XGBoost: {roc_auc_xgb:.2f}")
print(f"LightGBM: {roc_auc_lgb:.2f}")

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['TotalCharges'].fillna(df['TotalCharges'].mean(), inplace=True)



 Class Distribution After SMOTE 

Churn
1    4138
0    4138
Name: count, dtype: int64
[LightGBM] [Info] Number of positive: 4138, number of negative: 4138
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002350 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1063
[LightGBM] [Info] Number of data points in the train set: 8276, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
Random Forest Report: 
               precision    recall  f1-score   support

           0       0.58      0.61      0.60       373
           1       0.86      0.84      0.85      1036

    accuracy                           0.78      1409
   macro avg       0.72      0.73      0.72      1409
weighted avg       0.78      0.78      0.78      1409


 XGBoost Report: 
               precision    recall  f1-scor