# Training XGBoost classifier

In [1]:
import pandas as pd

from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import f1_score, classification_report
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import f1_score, classification_report
from sklearn.metrics import precision_recall_curve

import plotly.express as px
import plotly.figure_factory as ff

from sklearn import metrics
from xgboost import XGBClassifier

import warnings
warnings.filterwarnings("ignore")

  from pandas import MultiIndex, Int64Index


In [2]:
# Load the cleaned dataset. Missing values (which occurred because of a few
# empty strings in the CSV) are replaced with '' in the same step.
# NOTE(review): fillna('') is applied to every column, not only the text
# ones - confirm the numeric columns contain no missing values.
df = pd.read_csv('cleaned_data.csv', encoding='utf-8').fillna('')

In [3]:
# TF-IDF vectorizer configuration for the text columns:
# word-level tokens, unigrams only, L2-normalised rows.
vec_tdidf = TfidfVectorizer(
  analyzer='word',
  ngram_range=(1, 1),
  norm='l2',
)

In [4]:
# XGB Classifier used as the final estimator of the pipeline.
# Only `random_state` is passed for seeding: `seed` configures the same
# random number seed, and supplying both with different values (42 and 2)
# left the seed that is actually used ambiguous.
clf = XGBClassifier(random_state=42, colsample_bytree=0.6, subsample=0.7)

In [22]:
# Transformer classes for fetching columns from the input data
class TextSelector(BaseEstimator, TransformerMixin):
  """Transformer that picks one column out of the input by key.

  `transform` returns `X[key]`, i.e. the column itself (for a DataFrame
  indexed with a single label this is a one-dimensional Series), which is
  the form a text vectorizer expects.

  Parameters
  ----------
  key : label of the column to select.
  """

  def __init__(self, key):
    self.key = key

  def fit(self, X, y=None, *parg, **kwarg):
    """Nothing is learned; returns `self` so the step can sit in a Pipeline."""
    return self

  def transform(self, X):
    """Return the column `X[self.key]`."""
    # A leftover debugging `print(type(X))` used to live here; it wrote one
    # line to the cell output on every transform call and has been removed.
    return X[self.key]
  
class NumberSelector(BaseEstimator, TransformerMixin):
  """Transformer that picks one column by key and keeps it two-dimensional.

  `transform` indexes the input with a one-element list, `X[[key]]`, so the
  result keeps a column axis (a single-column frame for DataFrame input).

  Parameters
  ----------
  key : label of the column to select.
  """

  def __init__(self, key):
    self.key = key

  def fit(self, X, y=None):
    """Nothing is learned; returns `self`."""
    return self

  def transform(self, X):
    """Return `X` restricted to the single column `self.key`."""
    selected_columns = [self.key]
    return X[selected_columns]

In [23]:
# Text data as feature
# Each text branch gets its OWN TfidfVectorizer, built from the
# hyper-parameters of `vec_tdidf`. Previously both pipelines held the same
# vectorizer object, so fitting `subject` overwrote the vocabulary learned
# for `body` whenever the pipeline was fitted directly (it only worked
# because GridSearchCV clones the estimator before fitting).
body = Pipeline([
  ('selector', TextSelector(key='body')),
  ('vectorizer', TfidfVectorizer(**vec_tdidf.get_params()))
])

subject = Pipeline([
  ('selector', TextSelector(key='subject')),
  ('vectorizer', TfidfVectorizer(**vec_tdidf.get_params()))
])

# Numeric data as feature: select the column, then standardise it
no_of_urls = Pipeline([
  ('selector', NumberSelector(key='no_of_urls')),
  ('sc', StandardScaler())
])

image_url = Pipeline([
  ('selector', NumberSelector(key='image_url')),
  ('sc', StandardScaler())
])

# Merging numeric and text features side by side into one feature matrix
feats = FeatureUnion([
  ('body', body),
  ('subject', subject),
  ('no_of_urls', no_of_urls),
  ('image_url', image_url)
])

In [24]:
# Final pipeline: feature extraction ('feats') followed by the classifier ('clf').
# The step names are what the `clf__...` grid-search keys refer to.
pipeline_steps = [('feats', feats), ('clf', clf)]
pipe = Pipeline(pipeline_steps)

In [25]:
# Model inputs and target column
features = df[['body', 'subject', 'no_of_urls', 'image_url']]
labels = df['label']

# Hold out 30% of the rows for testing; the split is stratified on the
# label and seeded so it is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
  features,
  labels,
  test_size=0.3,
  stratify=labels,
  random_state=42,
)

In [14]:
# Grid Search candidates. The `clf__` prefix routes each value to the 'clf'
# step of the pipeline. Every list holds a single value, so exactly one
# parameter combination is evaluated.
param_grid = dict(
  clf__n_estimators=[50],
  clf__colsample_bytree=[0.6],
  clf__subsample=[0.9],
)

In [15]:
# 3-fold cross-validated search over `param_grid`, run in a single process,
# silent, and also recording the training scores of each fold.
grid_search = GridSearchCV(
  estimator=pipe,
  param_grid=param_grid,
  cv=3,
  n_jobs=1,
  verbose=0,
  return_train_score=True,
)

In [None]:
# Run the search on the training split only; the test split stays untouched
grid_search.fit(X=X_train, y=y_train)

In [17]:
# Display the parameter combination selected by the grid search
grid_search.best_params_

{'clf__colsample_bytree': 0.6, 'clf__n_estimators': 50, 'clf__subsample': 0.9}

In [18]:
# Pipeline selected by the grid search
clf_test = grid_search.best_estimator_

# Predicted labels for both splits, so train and test scores can be compared
train_preds = clf_test.predict(X_train)
preds = clf_test.predict(X_test)

## Model Evaluation

In [19]:
def plot_graphs(model, preds, X=None, y=None):
  """Show the evaluation plots for a fitted binary classifier.

  Four figures are displayed in order: ROC curve, confusion-matrix heatmap,
  TPR/FPR against the decision threshold, and the precision-recall curve.

  Parameters
  ----------
  model : fitted estimator exposing `predict_proba`; column 1 of its output
    is used as the positive-class score.
  preds : predicted class labels for the same rows as `y`.
  X : features to score. Defaults to the notebook-level `X_test`.
  y : true labels. Defaults to the notebook-level `y_test`.
  """
  # Fall back to the notebook's held-out split when no data is passed in
  if X is None:
    X = X_test
  if y is None:
    y = y_test

  # Scores and ROC points
  test_prob = model.predict_proba(X)[:, 1]
  fpr, tpr, roc_thresholds = roc_curve(y, test_prob)

  # Confusion matrix. labels=[1, 0] puts class 1 in the first row/column so
  # the cells line up with the YES-then-NO tick labels used below (the
  # default ordering is [0, 1], which mislabelled the heatmap).
  # Assumes class 1 is the "YES" class, matching its use as the positive
  # class for the ROC / precision-recall scores above.
  confusion_matrix_test = metrics.confusion_matrix(y, preds, labels=[1, 0])

  # ROC curve
  fig = px.area(
    x=fpr, y=tpr,
    title=f'ROC Curve (AUC={auc(fpr, tpr):.4f})',
    labels=dict(x='False Positive Rate', y='True Positive Rate')
  )

  fig.update_yaxes(scaleanchor="x", scaleratio=1)
  fig.update_xaxes(constrain='domain')
  fig.show()

  # Confusion matrix as Heatmap
  hm = ff.create_annotated_heatmap(
    confusion_matrix_test,
    annotation_text=confusion_matrix_test,
    x=['Predicted YES', 'Predicted NO'],
    y=['Actual YES', 'Actual NO']
  )
  hm.show()

  # TPR and FPR as a function of the decision threshold
  thresh_df = pd.DataFrame({
    'False Positive Rate': fpr,
    'True Positive Rate': tpr
  }, index=roc_thresholds)
  thresh_df.index.name = 'Thresholds'
  thresh_df.columns.name = 'Rate'

  fig_thresh = px.line(
    thresh_df, title='TPR & FPR at every threshold'
  )
  fig_thresh.update_yaxes(scaleanchor="x", scaleratio=1)
  fig_thresh.update_xaxes(range=[0, 1], constrain='domain')
  fig_thresh.show()

  # Precision-recall curve; its thresholds are not needed for the plot
  precision, recall, _ = precision_recall_curve(y, test_prob)

  # Area under the PR curve itself (the title previously repeated the ROC AUC)
  pr_auc = auc(recall, precision)

  pr_curve = px.area(
    x=recall, y=precision,
    title=f'Precision-Recall Curve (AUC={pr_auc:.4f})',
    labels=dict(x='Recall', y='Precision')
  )

  # Dashed reference diagonal from (0, 1) to (1, 0)
  pr_curve.add_shape(
    type='line', line=dict(dash='dash'),
    x0=0, x1=1, y0=1, y1=0
  )

  pr_curve.update_yaxes(scaleanchor="x", scaleratio=1)
  pr_curve.update_xaxes(constrain='domain')
  pr_curve.show()

In [20]:
def evaluate():
  """Print train/test accuracy and F1, the test classification report, then draw the plots.

  Reads the notebook-level `y_train`, `y_test`, `train_preds`, `preds`
  and `clf_test`; takes no arguments and returns nothing.
  """
  # Accuracy on both splits
  train_accuracy = metrics.accuracy_score(y_train, train_preds)
  print('Train Accuracy >> %.3f' % train_accuracy)
  test_accuracy = metrics.accuracy_score(y_test, preds)
  print('Test Accuracy >> %.3f' % test_accuracy)

  # F1 on both splits
  train_f1 = f1_score(y_train, train_preds)
  print('Train f1_score >> %.3f' % train_f1)
  test_f1 = f1_score(y_test, preds)
  print('Test f1_score >> %.3f' % test_f1)

  # Per-class precision / recall / F1 on the test split
  test_report = classification_report(y_test, preds)
  print(test_report)

  plot_graphs(clf_test, preds)

In [21]:
# Print the metrics and render the evaluation plots for the selected model
evaluate()

Train Accuracy >> 0.998
Test Accuracy >> 0.994
Train f1_score >> 0.998
Test f1_score >> 0.994
              precision    recall  f1-score   support

           0       0.99      1.00      0.99       765
           1       1.00      0.99      0.99       675

    accuracy                           0.99      1440
   macro avg       0.99      0.99      0.99      1440
weighted avg       0.99      0.99      0.99      1440

