# XGBoost
This notebook aims to perform XGBoost

In [66]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [67]:
import pandas as pd
import numpy as np
import os

import xgboost as xgb
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, recall_score, accuracy_score, f1_score, precision_score,roc_auc_score
from collections import Counter
from prettytable import PrettyTable
from textblob import TextBlob

import tensorflow_hub as hub

import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('vader_lexicon')

from nltk.sentiment.vader import SentimentIntensityAnalyzer

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [68]:
# Set constant

SEED = 4222
EPOCHS = 5

In [69]:
# Change to own directory
try:
    os.chdir("/content/drive/MyDrive/MyProject_SIDetection")
    print("Directory changed")
except OSError:
    print("Error: Can't change the Current Working Directory")

Directory changed


## Load dataset

In [70]:
# Load dataset
suicide_detection_df = pd.read_csv('data_heavyclean.csv')
suicide_detection_df

Unnamed: 0,text,label
0,ex wife threatening suiciderecently left wife ...,1
1,weird get affected compliment coming someone k...,0
2,finally 2020 almost never hear 2020 bad year e...,0
3,need helpjust help im cry hard,1
4,end tonight anymore quit,1
...,...,...
174290,something today went sledding friend may seem ...,0
174291,like rock going get anything go,0
174292,tell many friend lonely everything deprived pr...,0
174293,pee probably taste like salty tea someone dran...,0


In [71]:
# Split dataset into train, validation and test sets

train_text, test_text, train_labels, test_labels = train_test_split(suicide_detection_df['text'], suicide_detection_df['label'],
                                                                    random_state=SEED,
                                                                    test_size=0.3,
                                                                    stratify=suicide_detection_df['label'])

### Import vocab

In [72]:
# load doc into memory
def load_doc(filename):
	# open the file as read only
	file = open(filename, 'r')
	# read all text
	text = file.read()
	# close the file
	file.close()
	return text

# load the vocabulary
vocab_filename = 'vocab.txt'
vocab = load_doc(vocab_filename)
vocab = vocab.split()
vocab = set(vocab)

### Import embeddings

In [73]:
# load embedding as a dict
def load_embedding(filename):
	# load embedding into memory, skip first line
	file = open(filename,'r')
	lines = file.readlines()[1:]
	file.close()
	# create a map of words to vectors
	embedding = dict()
	for line in lines:
		parts = line.split()
		# key is string word, value is numpy array for vector
		embedding[parts[0]] = np.asarray(parts[1:], dtype='float32')
	return embedding

### Removing out-of-vocab words

In [74]:
# clean each line
def clean_line(line, vocab):
  tokens = line.split()
  # filter out tokens not in vocab
  tokens_clean = [w for w in tokens if w in vocab]
  return [tokens_clean]

# clean entire dataset
def process_lines(data, vocab):
  lines = list()
  for i in data:
    line = clean_line(i, vocab)
    # add lines to list
    lines += line
  return lines

### Document Vector function

In [75]:
def document_vector(doc, embeddings):
    sentence = list()
    """Create document vectors by averaging word vectors. Remove out-of-vocabulary words."""
    doc = [word for word in doc if word in embeddings.keys()]
    for i in doc:
      word = embeddings[i]
      sentence.append(word)
    return np.mean(sentence, axis=0)

In [76]:
# function for all the data
def all_documents(df, labels_ori, embeddings):
  vec = list()
  labels = list()
  for i in range(len(df)):
    if len(df[i]) == 0:
      continue
    else:
      vec.append(document_vector(df[i], embeddings))
      labels.append(labels_ori.values[i])
  return vec, labels

### Word2Vec

In [77]:
word2vec = load_embedding('embedding_word2vec.txt')

In [78]:
train_clean = process_lines(train_text, vocab)
test_clean = process_lines(test_text, vocab)
train_vec, train_labels_new = all_documents(train_clean, train_labels,word2vec)
test_vec, test_labels_new = all_documents(test_clean, test_labels, word2vec)

In [79]:
xgb = xgb.XGBClassifier()
xgb.fit(train_vec, train_labels_new)

In [80]:
y_train_pred = xgb.predict(train_vec)
print('Training set accuracy %s' % accuracy_score(train_labels_new, y_train_pred))
print(classification_report(train_labels_new, y_train_pred))

auc_score = roc_auc_score(train_labels_new, y_train_pred)
print('Training set AUC: %s' % auc_score)

Training set accuracy 0.9686644481238013
              precision    recall  f1-score   support

           0       0.98      0.97      0.97     74915
           1       0.96      0.96      0.96     47087

    accuracy                           0.97    122002
   macro avg       0.97      0.97      0.97    122002
weighted avg       0.97      0.97      0.97    122002

Training set AUC: 0.9671359774319574


In [81]:
y_test_pred = xgb.predict(test_vec)
print('Test set accuracy %s' % accuracy_score(test_labels_new, y_test_pred))
print(classification_report(test_labels_new, y_test_pred))

auc_score = roc_auc_score(test_labels_new, y_test_pred)
print('Testing set AUC: %s' % auc_score)

Test set accuracy 0.920195159284416
              precision    recall  f1-score   support

           0       0.93      0.94      0.94     32093
           1       0.90      0.89      0.90     20172

    accuracy                           0.92     52265
   macro avg       0.92      0.92      0.92     52265
weighted avg       0.92      0.92      0.92     52265

Testing set AUC: 0.9151480346328156


In [82]:
word2vec_test_accuracy_score = accuracy_score(test_labels_new, y_test_pred)
word2vec_test_precision_score = precision_score(test_labels_new, y_test_pred)
word2vec_test_recall_score = recall_score(test_labels_new, y_test_pred)
word2vec_test_f1_score = f1_score(test_labels_new, y_test_pred)
word2vec_test_auc_score = roc_auc_score(test_labels_new, y_test_pred)

### GloVe

In [83]:
#import urllib.request

# Download the file
#urllib.request.urlretrieve('http://nlp.stanford.edu/data/glove.6B.zip', 'glove.6B.zip')

# Unzip the file
#import zipfile

#with zipfile.ZipFile('glove.6B.zip', 'r') as z:
#  z.extractall()

In [84]:
# load glove embedding from file
raw_embedding_glove = load_embedding('glove.6B.300d.txt')

In [85]:
train_clean_glove = process_lines(train_text, raw_embedding_glove.keys())
test_clean_glove = process_lines(test_text, raw_embedding_glove.keys())
train_vec_glove, train_labels_glove_new = all_documents(train_clean_glove, train_labels, raw_embedding_glove)
test_vec_glove, test_labels_glove_new = all_documents(test_clean_glove, test_labels, raw_embedding_glove)

In [86]:
xgb.fit(train_vec_glove, train_labels_glove_new)

In [87]:
y_train_pred = xgb.predict(train_vec_glove)
print('Training set accuracy %s' % accuracy_score(train_labels_glove_new, y_train_pred))
print(classification_report(train_labels_glove_new, y_train_pred))

auc_score = roc_auc_score(train_labels_glove_new, y_train_pred)
print('Training set AUC: %s' % auc_score)

Training set accuracy 0.9546133149588214
              precision    recall  f1-score   support

           0       0.96      0.96      0.96     74867
           1       0.94      0.94      0.94     47041

    accuracy                           0.95    121908
   macro avg       0.95      0.95      0.95    121908
weighted avg       0.95      0.95      0.95    121908

Training set AUC: 0.9521838861790739


In [88]:
y_test_pred = xgb.predict(test_vec_glove)
print('Test set accuracy %s' % accuracy_score(test_labels_glove_new, y_test_pred))
print(classification_report(test_labels_glove_new, y_test_pred))

auc_score = roc_auc_score(test_labels_glove_new, y_test_pred)
print('Testing set AUC: %s' % auc_score)

Test set accuracy 0.8982791294194216
              precision    recall  f1-score   support

           0       0.91      0.93      0.92     32076
           1       0.88      0.85      0.87     20165

    accuracy                           0.90     52241
   macro avg       0.89      0.89      0.89     52241
weighted avg       0.90      0.90      0.90     52241

Testing set AUC: 0.890123174637276


In [89]:
glove_test_accuracy_score = accuracy_score(test_labels_glove_new, y_test_pred)
glove_test_precision_score = precision_score(test_labels_glove_new, y_test_pred)
glove_test_recall_score = recall_score(test_labels_glove_new, y_test_pred)
glove_test_f1_score = f1_score(test_labels_glove_new, y_test_pred)
glove_test_auc_score = roc_auc_score(test_labels_glove_new, y_test_pred)

### TF-IDF

In [90]:
# Initialize TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=20000, stop_words='english')

In [91]:
train_text_tfidf = tfidf_vectorizer.fit_transform(train_text)
test_text_tfidf = tfidf_vectorizer.transform(test_text)

In [92]:
xgb.fit(train_text_tfidf, train_labels)

In [93]:
y_train_pred = xgb.predict(train_text_tfidf)
print('Training set accuracy %s' % accuracy_score(train_labels, y_train_pred))
print(classification_report(train_labels, y_train_pred))

auc_score = roc_auc_score(train_labels, y_train_pred)
print('Training set AUC: %s' % auc_score)

Training set accuracy 0.9155697260790453
              precision    recall  f1-score   support

           0       0.91      0.96      0.93     74919
           1       0.93      0.84      0.89     47087

    accuracy                           0.92    122006
   macro avg       0.92      0.90      0.91    122006
weighted avg       0.92      0.92      0.91    122006

Training set AUC: 0.9021558131891539


In [94]:
y_test_pred = xgb.predict(test_text_tfidf)
print('Test set accuracy %s' % accuracy_score(test_labels, y_test_pred))
print(classification_report(test_labels, y_test_pred))

auc_score = roc_auc_score(test_labels, y_test_pred)
print('Testing set AUC: %s' % auc_score)

Test set accuracy 0.9015280460517509
              precision    recall  f1-score   support

           0       0.90      0.95      0.92     32108
           1       0.91      0.82      0.87     20181

    accuracy                           0.90     52289
   macro avg       0.90      0.89      0.89     52289
weighted avg       0.90      0.90      0.90     52289

Testing set AUC: 0.88719166416239


In [95]:
tfidf_test_accuracy_score = accuracy_score(test_labels, y_test_pred)
tfidf_test_precision_score = precision_score(test_labels, y_test_pred)
tfidf_test_recall_score = recall_score(test_labels, y_test_pred)
tfidf_test_f1_score = f1_score(test_labels, y_test_pred)
tfidf_test_auc_score = roc_auc_score(test_labels, y_test_pred)

### CountVectorizer

In [96]:
# Initialize Count Vectorizer
vectorizer = CountVectorizer(stop_words='english')

In [97]:
train_text_cv = vectorizer.fit_transform(train_text)
test_text_cv = vectorizer.transform(test_text)

In [98]:
xgb.fit(train_text_cv, train_labels)

In [99]:
y_train_pred = xgb.predict(train_text_cv)
print('Training set accuracy %s' % accuracy_score(train_labels, y_train_pred))
print(classification_report(train_labels, y_train_pred))

auc_score = roc_auc_score(train_labels, y_train_pred)
print('Training set AUC: %s' % auc_score)

Training set accuracy 0.9097175548743505
              precision    recall  f1-score   support

           0       0.90      0.96      0.93     74919
           1       0.93      0.83      0.88     47087

    accuracy                           0.91    122006
   macro avg       0.91      0.90      0.90    122006
weighted avg       0.91      0.91      0.91    122006

Training set AUC: 0.895114535904552


In [100]:
y_test_pred = xgb.predict(test_text_cv)
print('Test set accuracy %s' % accuracy_score(test_labels, y_test_pred))
print(classification_report(test_labels, y_test_pred))

auc_score = roc_auc_score(test_labels, y_test_pred)
print('Testing set AUC: %s' % auc_score)

Test set accuracy 0.8993096062269311
              precision    recall  f1-score   support

           0       0.89      0.95      0.92     32108
           1       0.91      0.82      0.86     20181

    accuracy                           0.90     52289
   macro avg       0.90      0.88      0.89     52289
weighted avg       0.90      0.90      0.90     52289

Testing set AUC: 0.8841888270686848


In [101]:
cv_test_accuracy_score = accuracy_score(test_labels, y_test_pred)
cv_test_precision_score = precision_score(test_labels, y_test_pred)
cv_test_recall_score = recall_score(test_labels, y_test_pred)
cv_test_f1_score = f1_score(test_labels, y_test_pred)
cv_test_auc_score = roc_auc_score(test_labels, y_test_pred)

### Universal Sentence Encoder (USE)

In [102]:
# Load USE
use_model = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")

In [103]:
def embed(texts):
    return use_model(texts).numpy()

train_text_use = embed(train_text)
test_text_use = embed(test_text)

In [104]:
xgb.fit(train_text_use, train_labels)

In [105]:
y_train_pred = xgb.predict(train_text_use)
print('Training set accuracy %s' % accuracy_score(train_labels, y_train_pred))
print(classification_report(train_labels, y_train_pred))

auc_score = roc_auc_score(train_labels, y_train_pred)
print('Training set AUC: %s' % auc_score)

Training set accuracy 0.9710833893415078
              precision    recall  f1-score   support

           0       0.98      0.98      0.98     74919
           1       0.96      0.96      0.96     47087

    accuracy                           0.97    122006
   macro avg       0.97      0.97      0.97    122006
weighted avg       0.97      0.97      0.97    122006

Training set AUC: 0.9698589194475877


In [106]:
y_test_pred = xgb.predict(test_text_use)
print('Test set accuracy %s' % accuracy_score(test_labels, y_test_pred))
print(classification_report(test_labels, y_test_pred))

auc_score = roc_auc_score(test_labels, y_test_pred)
print('Testing set AUC: %s' % auc_score)

Test set accuracy 0.914781311556924
              precision    recall  f1-score   support

           0       0.93      0.94      0.93     32108
           1       0.90      0.88      0.89     20181

    accuracy                           0.91     52289
   macro avg       0.91      0.91      0.91     52289
weighted avg       0.91      0.91      0.91     52289

Testing set AUC: 0.9085395937785836


In [107]:
use_test_accuracy_score = accuracy_score(test_labels, y_test_pred)
use_test_precision_score = precision_score(test_labels, y_test_pred)
use_test_recall_score = recall_score(test_labels, y_test_pred)
use_test_f1_score = f1_score(test_labels, y_test_pred)
use_test_auc_score = roc_auc_score(test_labels, y_test_pred)

## Summary

In [108]:
table = PrettyTable()
table.field_names = ['Model - XGBoost', 'Accuracy', 'Precision', 'Recall', 'F1 Score','ROC Score']

table.add_row(['Word2Vec',
               format(word2vec_test_accuracy_score, '.4f'),
               format(word2vec_test_precision_score, '.4f'),
               format(word2vec_test_recall_score, '.4f'),
               format(word2vec_test_f1_score, '.4f'),
               format(word2vec_test_auc_score, '.4f')])


table.add_row(['GloVe',
               format(glove_test_accuracy_score, '.4f'),
               format(glove_test_precision_score, '.4f'),
               format(glove_test_recall_score, '.4f'),
               format(glove_test_f1_score, '.4f'),
               format(glove_test_auc_score, '.4f')])

table.add_row(['TF-IDF',
               format(tfidf_test_accuracy_score, '.4f'),
               format(tfidf_test_precision_score, '.4f'),
               format(tfidf_test_recall_score, '.4f'),
               format(tfidf_test_f1_score, '.4f'),
               format(tfidf_test_auc_score, '.4f')])

table.add_row(['CountVectorizer',
               format(cv_test_accuracy_score, '.4f'),
               format(cv_test_precision_score, '.4f'),
               format(cv_test_recall_score, '.4f'),
               format(cv_test_f1_score, '.4f'),
               format(cv_test_auc_score, '.4f')])

table.add_row(['Universal sentence encoder',
               format(use_test_accuracy_score, '.4f'),
               format(use_test_precision_score, '.4f'),
               format(use_test_recall_score, '.4f'),
               format(use_test_f1_score, '.4f'),
               format(use_test_auc_score, '.4f')])
print(table)

+----------------------------+----------+-----------+--------+----------+-----------+
|      Model - XGBoost       | Accuracy | Precision | Recall | F1 Score | ROC Score |
+----------------------------+----------+-----------+--------+----------+-----------+
|          Word2Vec          |  0.9202  |   0.8995  | 0.8930 |  0.8962  |   0.9151  |
|           GloVe            |  0.8983  |   0.8788  | 0.8544 |  0.8664  |   0.8901  |
|           TF-IDF           |  0.9015  |   0.9121  | 0.8243 |  0.8660  |   0.8872  |
|      CountVectorizer       |  0.8993  |   0.9121  | 0.8179 |  0.8625  |   0.8842  |
| Universal sentence encoder |  0.9148  |   0.8963  | 0.8812 |  0.8887  |   0.9085  |
+----------------------------+----------+-----------+--------+----------+-----------+
