In [2]:
import os
import pandas as pd
# Load the preprocessed dataset into `df`.
# The path is relative, so it is resolved against the kernel's current working
# directory; the error message reports both the relative and the absolute
# location so a wrong working directory is easy to diagnose.
DATA_PATH = 'data/preprocessed_df.csv'
if os.path.exists(DATA_PATH):
    df = pd.read_csv(DATA_PATH)
    print('Preprocessed_df.csv successfully opened')
else:
    raise FileNotFoundError(
        f'File was not found: {DATA_PATH!r} '
        f'(looked in {os.path.abspath(DATA_PATH)!r})'
    )

Preprocessed_df.csv successfully opened


In [3]:
# Target vector: the 'Churn' column of the loaded frame.
y = df.loc[:, 'Churn']

In [4]:
# Feature matrix: every column except the target, as an independent copy of `df`.
x = df.drop('Churn', axis=1).copy()


In [None]:
from sklearn.model_selection import train_test_split,GridSearchCV,KFold,cross_val_score,StratifiedKFold
from sklearn.preprocessing import RobustScaler,PolynomialFeatures,OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score,precision_recall_curve,precision_score,recall_score,f1_score,confusion_matrix,ConfusionMatrixDisplay,classification_report


In [None]:
# Hold out 30% of the rows for testing; stratifying on y keeps the class
# proportions the same in both partitions, and the fixed seed makes the split
# repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

# Route columns to the numeric or categorical preprocessing branch by dtype.
# NOTE(review): a column whose dtype is neither numeric nor object (e.g. bool)
# would appear in neither list — confirm the preprocessed CSV has none.
numeric_cols = x_train.select_dtypes(include=['number']).columns
categorical_cols = x_train.select_dtypes(include=['object']).columns

# 5-fold stratified, shuffled splitter with a fixed seed.
cv = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)

# Numeric branch: mean-impute missing values, scale with RobustScaler, then
# expand with polynomial features (no bias column). The step names are part of
# the interface: the parameter grid addresses the 'poly' step by name.
num_pipe = Pipeline([
    ('impute', SimpleImputer(strategy='mean')),
    ('scaler', RobustScaler()),
    ('poly', PolynomialFeatures(include_bias=False)),
])

# Categorical branch: fill missing values with the most frequent category,
# then one-hot encode; handle_unknown='ignore' is set so categories not seen
# during fit do not raise at transform time.
cat_pipe = Pipeline([
    ('impute', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Apply each branch to its own column subset.
preprocessor = ColumnTransformer([
    ('num', num_pipe, numeric_cols),
    ('cat', cat_pipe, categorical_cols),
])

# End-to-end estimator: column preprocessing followed by an XGBoost classifier.
# Step names ('preprocessor', 'model') are referenced by the parameter grid.
pipe_xgb = Pipeline(steps=[
    ('preprocessor',preprocessor),
    ('model',XGBClassifier(
        verbosity=0,
        # The grid search that wraps this pipeline already runs its fits in
        # parallel (n_jobs=-1). Keep each individual booster single-threaded
        # so the two levels of parallelism do not oversubscribe the CPU.
        n_jobs=1,
        # Pin the model seed explicitly, matching the seed used for the
        # train/test split and the CV splitter.
        random_state=42,
    ))
])

# Hyper-parameter grid for the search. Keys follow scikit-learn's
# '<step>__<param>' convention: the first addresses the polynomial degree
# inside the numeric preprocessing branch, the rest address the XGBoost model.
grid_params = dict(
    preprocessor__num__poly__degree=[1, 2],
    model__n_estimators=[80, 100],
    model__learning_rate=[0.01, 0.05, 0.1, 0.5, 1.0],
    model__max_depth=[3, 6, 8],
    model__subsample=[0.6, 0.8, 1.0],
    model__colsample_bytree=[0.3, 0.5, 0.7, 1.0],
)

# Exhaustive search over `grid_params`, scored by accuracy on the stratified
# folds defined by `cv`.
model = GridSearchCV(
    pipe_xgb,
    grid_params,
    scoring='accuracy',
    cv=cv,
    refit=True,                # refit the best configuration so `model` can predict directly
    return_train_score=True,   # also record training-fold scores in cv_results_
    error_score='raise',       # surface a failing fit instead of scoring it as NaN
    n_jobs=-1,
    verbose=2,
)

print('Training model using XGB Classifier (This may take a while)...')
model.fit(x_train, y_train)
print('Best parameters : ', model.best_params_)