# Logistic Regression
This notebook aims to perform Logistic Regression.

In [None]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np
import os

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, recall_score, accuracy_score, f1_score, precision_score,roc_auc_score
from collections import Counter
from prettytable import PrettyTable
from textblob import TextBlob

import tensorflow_hub as hub

import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('vader_lexicon')
#pip install tensorflow tensorflow-hub scikit-learn

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [None]:
# Set constant

SEED = 4222
EPOCHS = 5

In [None]:
# Change to own directory
try:
    os.chdir("/content/drive/MyDrive/MyProject_SIDetection")
    print("Directory changed")
except OSError:
    print("Error: Can't change the Current Working Directory")

Directory changed


## Load dataset

In [None]:
# Load dataset
suicide_detection_df = pd.read_csv('data_heavyclean.csv')
suicide_detection_df

Unnamed: 0,text,label
0,ex wife threatening suiciderecently left wife ...,1
1,weird get affected compliment coming someone k...,0
2,finally 2020 almost never hear 2020 bad year e...,0
3,need helpjust help im cry hard,1
4,end tonight anymore quit,1
...,...,...
174290,something today went sledding friend may seem ...,0
174291,like rock going get anything go,0
174292,tell many friend lonely everything deprived pr...,0
174293,pee probably taste like salty tea someone dran...,0


In [None]:
# Split dataset into train, validation and test sets
train_text, test_text, train_labels, test_labels = train_test_split(suicide_detection_df['text'], suicide_detection_df['label'],
                                                                    random_state=SEED,
                                                                    test_size=0.2,
                                                                    stratify=suicide_detection_df['label'])

### Import vocab

In [None]:
# load doc into memory
def load_doc(filename):
	# open the file as read only
	file = open(filename, 'r')
	# read all text
	text = file.read()
	# close the file
	file.close()
	return text

# load the vocabulary
vocab_filename = 'vocab.txt'
vocab = load_doc(vocab_filename)
vocab = vocab.split()
vocab = set(vocab)

### Import embeddings

In [None]:
# load embedding as a dict
def load_embedding(filename):
	# load embedding into memory, skip first line
	file = open(filename,'r')
	lines = file.readlines()[1:]
	file.close()
	# create a map of words to vectors
	embedding = dict()
	for line in lines:
		parts = line.split()
		# key is string word, value is numpy array for vector
		embedding[parts[0]] = np.asarray(parts[1:], dtype='float32')
	return embedding

### Removing out-of-vocab words

In [None]:
# clean each line
def clean_line(line, vocab):
  tokens = line.split()
  # filter out tokens not in vocab
  tokens_clean = [w for w in tokens if w in vocab]
  return [tokens_clean]

# clean entire dataset
def process_lines(data, vocab):
  lines = list()
  for i in data:
    line = clean_line(i, vocab)
    # add lines to list
    lines += line
  return lines

### Document Vector function

In [None]:
def document_vector(doc, embeddings):
    sentence = list()
    """Create document vectors by averaging word vectors. Remove out-of-vocabulary words."""
    doc = [word for word in doc if word in embeddings.keys()]
    for i in doc:
      word = embeddings[i]
      sentence.append(word)
    return np.mean(sentence, axis=0)

In [None]:
# function for all the data
def all_documents(df, labels_ori, embeddings):
  vec = list()
  labels = list()
  for i in range(len(df)):
    if len(df[i]) == 0:
      continue
    else:
      vec.append(document_vector(df[i], embeddings))
      labels.append(labels_ori.values[i])
  return vec, labels

### Word2Vec

In [None]:
word2vec = load_embedding('embedding_word2vec.txt')

In [None]:
train_clean = process_lines(train_text, vocab)
test_clean = process_lines(test_text, vocab)
train_vec, train_labels_new = all_documents(train_clean, train_labels,word2vec)
test_vec, test_labels_new = all_documents(test_clean, test_labels, word2vec)

In [None]:
lr = LogisticRegression()
lr.fit(train_vec, train_labels_new)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
y_train_pred = lr.predict(train_vec)
print('Training set accuracy %s' % accuracy_score(train_labels_new, y_train_pred))
print(classification_report(train_labels_new, y_train_pred))

auc_score = roc_auc_score(train_labels_new, y_train_pred)
print('Training set AUC: %s' % auc_score)

Training set accuracy 0.911863215497271
              precision    recall  f1-score   support

           0       0.93      0.93      0.93     85617
           1       0.89      0.89      0.89     53814

    accuracy                           0.91    139431
   macro avg       0.91      0.91      0.91    139431
weighted avg       0.91      0.91      0.91    139431

Training set AUC: 0.9069899543269679


In [None]:
y_test_pred = lr.predict(test_vec)
print('Test set accuracy %s' % accuracy_score(test_labels_new, y_test_pred))
print(classification_report(test_labels_new, y_test_pred))

auc_score = roc_auc_score(test_labels_new, y_test_pred)
print('Testing set AUC: %s' % auc_score)

Test set accuracy 0.9106958318980365
              precision    recall  f1-score   support

           0       0.93      0.93      0.93     21391
           1       0.89      0.88      0.88     13445

    accuracy                           0.91     34836
   macro avg       0.91      0.91      0.91     34836
weighted avg       0.91      0.91      0.91     34836

Testing set AUC: 0.9054560191767793


In [None]:
word2vec_test_accuracy_score = accuracy_score(test_labels_new, y_test_pred)
word2vec_test_precision_score = precision_score(test_labels_new, y_test_pred)
word2vec_test_recall_score = recall_score(test_labels_new, y_test_pred)
word2vec_test_f1_score = f1_score(test_labels_new, y_test_pred)
word2vec_test_auc_score = roc_auc_score(test_labels_new, y_test_pred)

### GloVe

In [None]:
#import urllib.request

# Download the file
#urllib.request.urlretrieve('http://nlp.stanford.edu/data/glove.6B.zip', 'glove.6B.zip')

# Unzip the file
#import zipfile

#with zipfile.ZipFile('glove.6B.zip', 'r') as z:
#  z.extractall()

In [None]:
# load glove embedding from file
raw_embedding_glove = load_embedding('glove.6B.300d.txt')

In [None]:
train_clean_glove = process_lines(train_text, raw_embedding_glove.keys())
test_clean_glove = process_lines(test_text, raw_embedding_glove.keys())
train_vec_glove, train_labels_glove_new = all_documents(train_clean_glove, train_labels, raw_embedding_glove)
test_vec_glove, test_labels_glove_new = all_documents(test_clean_glove, test_labels, raw_embedding_glove)

In [None]:
lr = LogisticRegression()
lr.fit(train_vec_glove, train_labels_glove_new)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
y_train_pred = lr.predict(train_vec_glove)
print('Training set accuracy %s' % accuracy_score(train_labels_glove_new, y_train_pred))
print(classification_report(train_labels_glove_new, y_train_pred))

auc_score = roc_auc_score(train_labels_glove_new, y_train_pred)
print('Training set AUC: %s' % auc_score)

Training set accuracy 0.8824627133485494
              precision    recall  f1-score   support

           0       0.90      0.90      0.90     85564
           1       0.85      0.85      0.85     53762

    accuracy                           0.88    139326
   macro avg       0.88      0.88      0.88    139326
weighted avg       0.88      0.88      0.88    139326

Training set AUC: 0.8759815936785024


In [None]:
y_test_pred = lr.predict(test_vec_glove)
print('Test set accuracy %s' % accuracy_score(test_labels_glove_new, y_test_pred))
print(classification_report(test_labels_glove_new, y_test_pred))

auc_score = roc_auc_score(test_labels_glove_new, y_test_pred)
print('Testing set AUC: %s' % auc_score)

Test set accuracy 0.8831232231571088
              precision    recall  f1-score   support

           0       0.90      0.91      0.91     21379
           1       0.85      0.85      0.85     13444

    accuracy                           0.88     34823
   macro avg       0.88      0.88      0.88     34823
weighted avg       0.88      0.88      0.88     34823

Testing set AUC: 0.8760872687606379


In [None]:
glove_test_accuracy_score = accuracy_score(test_labels_glove_new, y_test_pred)
glove_test_precision_score = precision_score(test_labels_glove_new, y_test_pred)
glove_test_recall_score = recall_score(test_labels_glove_new, y_test_pred)
glove_test_f1_score = f1_score(test_labels_glove_new, y_test_pred)
glove_test_auc_score = roc_auc_score(test_labels_glove_new, y_test_pred)

### TF-IDF

In [None]:
# Initialize TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=20000, stop_words='english')

In [None]:
train_text_tfidf = tfidf_vectorizer.fit_transform(train_text)
test_text_tfidf = tfidf_vectorizer.transform(test_text)

In [None]:
lr = LogisticRegression()
lr.fit(train_text_tfidf, train_labels)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
y_train_pred = lr.predict(train_text_tfidf)
print('Training set accuracy %s' % accuracy_score(train_labels, y_train_pred))
print(classification_report(train_labels, y_train_pred))

auc_score = roc_auc_score(train_labels, y_train_pred)
print('Training set AUC: %s' % auc_score)

Training set accuracy 0.9363148684701225
              precision    recall  f1-score   support

           0       0.94      0.96      0.95     85622
           1       0.93      0.90      0.92     53814

    accuracy                           0.94    139436
   macro avg       0.94      0.93      0.93    139436
weighted avg       0.94      0.94      0.94    139436

Training set AUC: 0.9292119130706051


In [None]:
y_test_pred = lr.predict(test_text_tfidf)
print('Test set accuracy %s' % accuracy_score(test_labels, y_test_pred))
print(classification_report(test_labels, y_test_pred))

auc_score = roc_auc_score(test_labels, y_test_pred)
print('Testing set AUC: %s' % auc_score)

Test set accuracy 0.9277087696147336
              precision    recall  f1-score   support

           0       0.93      0.96      0.94     21405
           1       0.93      0.88      0.90     13454

    accuracy                           0.93     34859
   macro avg       0.93      0.92      0.92     34859
weighted avg       0.93      0.93      0.93     34859

Testing set AUC: 0.9192272842478444


In [None]:
tfidf_test_accuracy_score = accuracy_score(test_labels, y_test_pred)
tfidf_test_precision_score = precision_score(test_labels, y_test_pred)
tfidf_test_recall_score = recall_score(test_labels, y_test_pred)
tfidf_test_f1_score = f1_score(test_labels, y_test_pred)
tfidf_test_auc_score = roc_auc_score(test_labels, y_test_pred)

### CountVectorizer

In [None]:
# Initialize Count Vectorizer
count_vectorizer = CountVectorizer(stop_words='english')

In [None]:
train_text_cv = count_vectorizer.fit_transform(train_text)
test_text_cv = count_vectorizer.transform(test_text)

In [None]:
lr = LogisticRegression()
lr.fit(train_text_cv, train_labels)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
y_train_pred = lr.predict(train_text_cv)
print('Training set accuracy %s' % accuracy_score(train_labels, y_train_pred))
print(classification_report(train_labels, y_train_pred))

auc_score = roc_auc_score(train_labels, y_train_pred)
print('Testing set AUC: %s' % auc_score)

Training set accuracy 0.9547247482716085
              precision    recall  f1-score   support

           0       0.95      0.98      0.96     85622
           1       0.96      0.92      0.94     53814

    accuracy                           0.95    139436
   macro avg       0.96      0.95      0.95    139436
weighted avg       0.95      0.95      0.95    139436

Testing set AUC: 0.9486030626444077


In [None]:
y_test_pred = lr.predict(test_text_cv)
print('Test set accuracy %s' % accuracy_score(test_labels, y_test_pred))
print(classification_report(test_labels, y_test_pred))

auc_score = roc_auc_score(test_labels, y_test_pred)
print('Testing set AUC: %s' % auc_score)

Test set accuracy 0.9282825095384263
              precision    recall  f1-score   support

           0       0.92      0.96      0.94     21405
           1       0.93      0.88      0.90     13454

    accuracy                           0.93     34859
   macro avg       0.93      0.92      0.92     34859
weighted avg       0.93      0.93      0.93     34859

Testing set AUC: 0.9183968286030346


In [None]:
cv_test_accuracy_score = accuracy_score(test_labels, y_test_pred)
cv_test_precision_score = precision_score(test_labels, y_test_pred)
cv_test_recall_score = recall_score(test_labels, y_test_pred)
cv_test_f1_score = f1_score(test_labels, y_test_pred)
cv_test_auc_score = roc_auc_score(test_labels, y_test_pred)

### Universal Sentence Encoder (USE)

In [None]:
# Load USE
use_model = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")

In [None]:
def embed(texts):
    return use_model(texts).numpy()

train_text_use = embed(train_text)
test_text_use = embed(test_text)

In [None]:
lr = LogisticRegression()
lr.fit(train_text_use, train_labels)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
y_train_pred = lr.predict(train_text_use)
print('Training set accuracy %s' % accuracy_score(train_labels, y_train_pred))
print(classification_report(train_labels, y_train_pred))

auc_score = roc_auc_score(train_labels, y_train_pred)
print('Training set AUC: %s' % auc_score)

Training set accuracy 0.920113887374853
              precision    recall  f1-score   support

           0       0.93      0.94      0.94     85622
           1       0.90      0.89      0.90     53814

    accuracy                           0.92    139436
   macro avg       0.92      0.91      0.92    139436
weighted avg       0.92      0.92      0.92    139436

Training set AUC: 0.9144945843110167


In [None]:
y_test_pred = lr.predict(test_text_use)
print('Test set accuracy %s' % accuracy_score(test_labels, y_test_pred))
print(classification_report(test_labels, y_test_pred))

auc_score = roc_auc_score(test_labels, y_test_pred)
print('Testing set AUC: %s' % auc_score)

Test set accuracy 0.9201927766143607
              precision    recall  f1-score   support

           0       0.93      0.94      0.94     21405
           1       0.90      0.89      0.90     13454

    accuracy                           0.92     34859
   macro avg       0.92      0.91      0.92     34859
weighted avg       0.92      0.92      0.92     34859

Testing set AUC: 0.914183980804136


In [None]:
use_test_accuracy_score = accuracy_score(test_labels, y_test_pred)
use_test_precision_score = precision_score(test_labels, y_test_pred)
use_test_recall_score = recall_score(test_labels, y_test_pred)
use_test_f1_score = f1_score(test_labels, y_test_pred)
use_test_auc_score = roc_auc_score(test_labels, y_test_pred)

## Summary

In [None]:
table = PrettyTable()
table.field_names = ['Model - Logistic Regression', 'Accuracy', 'Precision', 'Recall', 'F1 Score','AUC Score']

table.add_row(['Word2Vec',
               format(word2vec_test_accuracy_score, '.4f'),
               format(word2vec_test_precision_score, '.4f'),
               format(word2vec_test_recall_score, '.4f'),
               format(word2vec_test_f1_score, '.4f'),
               format(word2vec_test_auc_score, '.4f')])


table.add_row(['GloVe',
               format(glove_test_accuracy_score, '.4f'),
               format(glove_test_precision_score, '.4f'),
               format(glove_test_recall_score, '.4f'),
               format(glove_test_f1_score, '.4f'),
               format(glove_test_auc_score, '.4f')])

table.add_row(['TF-IDF',
               format(tfidf_test_accuracy_score, '.4f'),
               format(tfidf_test_precision_score, '.4f'),
               format(tfidf_test_recall_score, '.4f'),
               format(tfidf_test_f1_score, '.4f'),
               format(tfidf_test_auc_score, '.4f')])

table.add_row(['CountVectorizer',
               format(cv_test_accuracy_score, '.4f'),
               format(cv_test_precision_score, '.4f'),
               format(cv_test_recall_score, '.4f'),
               format(cv_test_f1_score, '.4f'),
               format(cv_test_auc_score, '.4f')])

table.add_row(['Universal sentence encoder',
               format(use_test_accuracy_score, '.4f'),
               format(use_test_precision_score, '.4f'),
               format(use_test_recall_score, '.4f'),
               format(use_test_f1_score, '.4f'),
               format(use_test_auc_score, '.4f')])
print(table)

+-----------------------------+----------+-----------+--------+----------+-----------+
| Model - Logistic Regression | Accuracy | Precision | Recall | F1 Score | AUC Score |
+-----------------------------+----------+-----------+--------+----------+-----------+
|           Word2Vec          |  0.9107  |   0.8857  | 0.8825 |  0.8841  |   0.9055  |
|            GloVe            |  0.8831  |   0.8510  | 0.8452 |  0.8481  |   0.8761  |
|            TF-IDF           |  0.9277  |   0.9271  | 0.8820 |  0.9040  |   0.9192  |
|       CountVectorizer       |  0.9283  |   0.9350  | 0.8751 |  0.9040  |   0.9184  |
|  Universal sentence encoder |  0.9202  |   0.9037  | 0.8878 |  0.8957  |   0.9142  |
+-----------------------------+----------+-----------+--------+----------+-----------+
