In [1]:
!pip install xgboost

Collecting xgboost
  Downloading xgboost-2.1.3-py3-none-win_amd64.whl.metadata (2.1 kB)
Downloading xgboost-2.1.3-py3-none-win_amd64.whl (124.9 MB)
   ---------------------------------------- 0.0/124.9 MB ? eta -:--:--
   ---------------------------------------- 0.3/124.9 MB ? eta -:--:--
   ---------------------------------------- 1.3/124.9 MB 4.8 MB/s eta 0:00:26
    --------------------------------------- 2.4/124.9 MB 4.8 MB/s eta 0:00:26
   - -------------------------------------- 3.4/124.9 MB 4.7 MB/s eta 0:00:26
   - -------------------------------------- 4.5/124.9 MB 4.7 MB/s eta 0:00:26
   - -------------------------------------- 5.5/124.9 MB 4.7 MB/s eta 0:00:26
   -- ------------------------------------- 6.3/124.9 MB 4.7 MB/s eta 0:00:26
   -- ------------------------------------- 7.3/124.9 MB 4.7 MB/s eta 0:00:26
   -- ------------------------------------- 8.4/124.9 MB 4.7 MB/s eta 0:00:25
   -- ------------------------------------- 9.2/124.9 MB 4.7 MB/s eta 0:00:25
   --- -

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
import xgboost as xgb
import matplotlib.pyplot as plt

In [3]:
# Load the pre-generated customer data CSV
df = pd.read_csv('customer_data_large.csv')
print(df.head)

<bound method NDFrame.head of       Customer_ID   Age  Tenure  Monthly_Usage  Complaints  Returns  \
0               1  56.0      29      69.302250           3        0   
1               2  69.0      44     215.284417           4        2   
2               3  46.0      53     378.077243           1        2   
3               4  32.0      24      44.793756           0        0   
4               5  60.0      58      49.288207           3        1   
...           ...   ...     ...            ...         ...      ...   
1995         1996  63.0      30     325.538724           2        1   
1996         1997  67.0      33     196.666404           1        2   
1997         1998  69.0      36      36.737050           0        2   
1998         1999  24.0      18     303.735958           3        2   
1999         2000  20.0      37     452.177605           3        2   

      Emails_Opened  Daily_Logins  Sensor_Triggers Event_Timestamp  Churn  
0               9.0             2        

In [7]:
# Preprocessing: Handling missing values with mean imputation
imputer = SimpleImputer(strategy='mean')
df[['Age', 'Monthly_Usage', 'Emails_Opened']] = imputer.fit_transform(df[['Age', 'Monthly_Usage', 'Emails_Opened']])

# Feature Selection: Drop non-numeric or irrelevant features (like Customer_ID and Event_Timestamp)
df = df.drop(columns=['Customer_ID', 'Event_Timestamp'])

# Feature Scaling: Standardize numeric features
scaler = StandardScaler()
df[['Age', 'Tenure', 'Monthly_Usage', 'Complaints', 'Returns', 'Emails_Opened', 'Daily_Logins', 'Sensor_Triggers']] = scaler.fit_transform(
    df[['Age', 'Tenure', 'Monthly_Usage', 'Complaints', 'Returns', 'Emails_Opened', 'Daily_Logins', 'Sensor_Triggers']]
)

In [9]:
# Split the data into train and test sets (80% training, 20% testing)
X = df.drop(columns='Churn')  # Features
y = df['Churn']  # Target variable

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [11]:
# Convert to DMatrix (specific to XGBoost)
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

In [17]:
# Handle class imbalance: Calculate scale_pos_weight
scale_pos_weight = (len(y_train) - sum(y_train)) / sum(y_train)

In [23]:
# Parameters for XGBoost
params = {
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
    'max_depth': 6,
    'learning_rate': 0.1,
    'scale_pos_weight': scale_pos_weight  # Handle class imbalance
}

In [25]:
# Train the model and track training history
evals = [(dtrain, 'train'), (dtest, 'eval')]
xgb_model = xgb.train(
    params, 
    dtrain, 
    num_boost_round=100, 
    evals=evals, 
    early_stopping_rounds=10, 
    verbose_eval=True
)


[0]	train-logloss:0.60503	eval-logloss:0.60415
[1]	train-logloss:0.53317	eval-logloss:0.53099
[2]	train-logloss:0.47257	eval-logloss:0.46978
[3]	train-logloss:0.42190	eval-logloss:0.41820
[4]	train-logloss:0.37822	eval-logloss:0.37414
[5]	train-logloss:0.34045	eval-logloss:0.33548
[6]	train-logloss:0.30774	eval-logloss:0.30325
[7]	train-logloss:0.27904	eval-logloss:0.27456
[8]	train-logloss:0.25398	eval-logloss:0.24967
[9]	train-logloss:0.23140	eval-logloss:0.22754
[10]	train-logloss:0.21142	eval-logloss:0.20863
[11]	train-logloss:0.19366	eval-logloss:0.19188
[12]	train-logloss:0.17799	eval-logloss:0.17729
[13]	train-logloss:0.16343	eval-logloss:0.16386
[14]	train-logloss:0.15036	eval-logloss:0.15118
[15]	train-logloss:0.13881	eval-logloss:0.14061
[16]	train-logloss:0.12867	eval-logloss:0.13108
[17]	train-logloss:0.11975	eval-logloss:0.12246
[18]	train-logloss:0.11142	eval-logloss:0.11594
[19]	train-logloss:0.10378	eval-logloss:0.10939
[20]	train-logloss:0.09667	eval-logloss:0.10322
[2

In [27]:
# Make predictions
xgb_pred = xgb_model.predict(dtest, iteration_range=(0, xgb_model.best_iteration))
xgb_pred = (xgb_pred > 0.5).astype(int)  # Convert probabilities to binary predictions

In [31]:
from sklearn.metrics import classification_report, confusion_matrix
# Evaluate the model
print("XGBoost Classification Report:\n", classification_report(y_test, xgb_pred))
print("XGBoost Confusion Matrix:\n", confusion_matrix(y_test, xgb_pred))

XGBoost Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.99      0.99       305
           1       0.96      0.95      0.95        95

    accuracy                           0.98       400
   macro avg       0.97      0.97      0.97       400
weighted avg       0.98      0.98      0.98       400

XGBoost Confusion Matrix:
 [[301   4]
 [  5  90]]


In [None]:
import joblib
# Save the model
joblib.dump(rf_model, 'models/Xgboost_model_v1.pkl')