This is a second attempt at the comment toxicity classification project, utilising a different methodology in the hope of achieving better metrics on the models.

## Setup & Functions

In [1]:
# Imports and Setup
import os
import re
import email
import random
import collections
import itertools
import numpy as np
import pandas as pd
import requests
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
import nltk
from string import punctuation
from nltk.corpus import stopwords, words
from nltk.stem import WordNetLemmatizer
from nltk import classify
from sklearn import metrics, preprocessing, feature_extraction, linear_model, naive_bayes, ensemble, pipeline, svm, model_selection, decomposition
from sklearn.metrics import accuracy_score, confusion_matrix, roc_curve, roc_auc_score, classification_report
import warnings
from sklearn.exceptions import UndefinedMetricWarning
warnings.filterwarnings("ignore", category=UndefinedMetricWarning)
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.multioutput import MultiOutputRegressor, MultiOutputClassifier
from sklearn.datasets import make_classification
from sklearn import tree, datasets
from scipy import stats
from scipy.sparse import csr_matrix
from tensorflow.keras import models, layers
from tensorflow.keras.layers import Dense, LSTM, Dropout, Embedding, Bidirectional
from sklearn.decomposition import TruncatedSVD
# Ensure the NLTK resources used in this notebook are present and up to date.
# 'punkt' backs nltk.word_tokenize, which the vectorizer below uses as its tokenizer.
# NOTE(review): recent NLTK releases may ask for 'punkt_tab' instead - confirm for your version.
for nltk_resource in ('wordnet', 'words', 'stopwords', 'punkt'):
    nltk.download(nltk_resource)

# Soft device placement lets TensorFlow fall back to another available device when an
# op cannot run on the requested one. The function has to be *called* to take effect;
# merely referencing it (as before) is a no-op.
tf.config.set_soft_device_placement(True)

# Report the GPU situation instead of silently discarding the CUDA check:
print("Built with CUDA: ", tf.test.is_built_with_cuda())
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

Num GPUs Available:  1


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\olive\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\olive\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\olive\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\olive\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\olive\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\olive\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Define functions for later use:

# Feature extractor and vectorizer, to produce TF-IDF document matrix:
def feature_vectorizer(corpus, transformers=None, return_transformers=False):
    '''Turn a body of raw text into a TF-IDF weighted document-term matrix.

    With the default arguments this behaves exactly as before: a CountVectorizer
    (lowercased, NLTK word tokens, unigrams + bigrams, min_df=2, filtered stop
    words) and a TfidfTransformer are fitted on ``corpus`` and the transformed
    matrix is returned.

    Parameters:
        corpus: iterable of text documents.
        transformers: optional ``(count_vectorizer, tfidf_transformer)`` pair that
            was fitted earlier. When given, nothing is re-fitted; ``corpus`` is only
            transformed, so its columns line up with the vocabulary the pair was
            fitted on (e.g. transform test text with the training vocabulary).
        return_transformers: when True, return ``(matrix, (count_vectorizer,
            tfidf_transformer))`` instead of just the matrix, so the fitted pair
            can be reused through ``transformers``.

    Returns:
        The sparse TF-IDF matrix, or a ``(matrix, transformers)`` tuple when
        ``return_transformers`` is True.
    '''
    if transformers is not None:
        # Reuse the already-fitted steps: transform only, never re-fit.
        count_vectorizer, tfidf_transformer = transformers
        term_counts = count_vectorizer.transform(corpus)
        processed_corpus = tfidf_transformer.transform(term_counts)
    else:
        sa_stop_words = nltk.corpus.stopwords.words('english')
        # Exceptions: these stopwords may change a sentence's sentiment if removed.
        sa_white_list = {
            'what', 'but', 'if', 'because', 'as', 'until', 'against', 'up', 'down', 'in', 'out',
            'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'why',
            'how', 'all', 'any', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own',
            'same', 'so', 'than', 'too', 'can', 'will', 'just', 'don', 'should',
        }
        # Remove stop words except for those specified in the white list.
        sa_stop_words = [sw for sw in sa_stop_words if sw not in sa_white_list]
        count_vectorizer = feature_extraction.text.CountVectorizer(
            lowercase=True,
            tokenizer=nltk.word_tokenize,
            # token_pattern is ignored whenever a tokenizer is supplied; passing None
            # says so explicitly and avoids scikit-learn's "will not be used" warning.
            token_pattern=None,
            min_df=2,
            ngram_range=(1, 2),
            stop_words=sa_stop_words
        )
        tfidf_transformer = feature_extraction.text.TfidfTransformer()
        # Fit on the body of text ('corpus'): raw counts first, then TF-IDF weights.
        term_counts = count_vectorizer.fit_transform(corpus)
        processed_corpus = tfidf_transformer.fit_transform(term_counts)

    if return_transformers:
        return processed_corpus, (count_vectorizer, tfidf_transformer)
    return processed_corpus


# Data checker to perform basic EDA and check for nulls, duplicates, etc.
def data_integrity_check(df, title='', include_non_numeric=True):
    '''Build a one-row-per-column summary of ``df`` for basic EDA.

    Every column gets its null count, duplicate count and dtype. int64/float64
    columns additionally get mean, median, mode, range, skew and kurtosis.
    Object columns additionally get min/max/mean/median text length.

    Parameters:
        df: the DataFrame to inspect.
        title: label written to the 'Source' column of every summary row.
        include_non_numeric: when False, 'Unique Values' is only reported for
            int64/float64 columns.

    Returns:
        A DataFrame with one row per column of ``df``; statistics that do not
        apply to a column are left as NaN.
    '''
    results = []
    for col in df:
        column = df[col]
        is_numeric = column.dtype in ['int64', 'float64']
        result = {
            'Column': col,
            'Null Values': column.isnull().sum(),
            'Duplicate Values': column.duplicated().sum(),
            'Data Type': column.dtype
        }
        if include_non_numeric or is_numeric:
            result['Unique Values'] = column.nunique()
            if is_numeric:
                result['Mean'] = column.mean()
                result['Median'] = column.median()
                result['Mode'] = stats.mode(column)
                result['Range'] = column.max() - column.min()
                result['Skew'] = column.skew()
                result['Kurtosis'] = column.kurtosis()
        if column.dtype == 'object':
            # Compute the lengths once; missing entries come back as NaN.
            text_lengths = column.str.len()
            result['Min Text Length'] = text_lengths.min()
            result['Max Text Length'] = text_lengths.max()
            # Use the Series reductions throughout: they skip NaN, whereas
            # np.median would turn the whole median into NaN if any value is missing.
            result['Mean Text Length'] = text_lengths.mean()
            result['Median Text Length'] = text_lengths.median()
        results.append(result)
    result_df = pd.DataFrame(results)
    result_df['Source'] = title
    return result_df

## Preprocessing

In [3]:
# Read the three CSV files that make up the dataset.
df_train = pd.read_csv('sa_train.csv')              # training data: text and classifications
df_test_labels = pd.read_csv('sa_test_labels.csv')  # classification labels for the test data
df_test_data = pd.read_csv('sa_test_data.csv')      # test data: text only

# Report the shape and the column names of each frame, in loading order.
for frame_label, frame in (
    ('Training', df_train),
    ('Test Label', df_test_labels),
    ('Test', df_test_data),
):
    print(f'{frame_label} Data Shape: ', frame.shape)
    print(f'{frame_label} Data Columns: ', frame.columns)

Training Data Shape:  (159571, 8)
Training Data Columns:  Index(['id', 'comment_text', 'toxic', 'severe_toxic', 'obscene', 'threat',
       'insult', 'identity_hate'],
      dtype='object')
Test Label Data Shape:  (153164, 7)
Test Label Data Columns:  Index(['id', 'toxic', 'severe_toxic', 'obscene', 'threat', 'insult',
       'identity_hate'],
      dtype='object')
Test Data Shape:  (153164, 2)
Test Data Columns:  Index(['id', 'comment_text'], dtype='object')


In [4]:
# Profile each dataset with the integrity helper, tagging rows with their source:
df_train_result, df_test_data_result, df_test_labels_result = [
    data_integrity_check(frame, title=source_name)
    for source_name, frame in (
        ('df_train', df_train),
        ('df_test_data', df_test_data),
        ('df_test_labels', df_test_labels),
    )
]

# Stack the per-dataset summaries into a single table for ease of reading:
concat_checks = pd.concat(
    [df_train_result, df_test_data_result, df_test_labels_result],
    ignore_index=True,
)
concat_checks

Unnamed: 0,Column,Null Values,Duplicate Values,Data Type,Unique Values,Min Text Length,Max Text Length,Mean Text Length,Median Text Length,Mean,Median,Mode,Range,Skew,Kurtosis,Source
0,id,0,0,object,159571,15.0,21.0,16.00042,16.0,,,,,,,df_train
1,comment_text,0,0,object,159571,6.0,5000.0,396.593961,207.0,,,,,,,df_train
2,toxic,0,159569,int64,2,,,,,0.095844,0.0,"(0, 144277)",1.0,2.745854,5.539784,df_train
3,severe_toxic,0,159569,int64,2,,,,,0.009996,0.0,"(0, 157976)",1.0,9.851722,95.057627,df_train
4,obscene,0,159569,int64,2,,,,,0.052948,0.0,"(0, 151122)",1.0,3.992817,13.94276,df_train
5,threat,0,159569,int64,2,,,,,0.002996,0.0,"(0, 159093)",1.0,18.189001,328.84389,df_train
6,insult,0,159569,int64,2,,,,,0.049364,0.0,"(0, 151694)",1.0,4.16054,15.310284,df_train
7,identity_hate,0,159569,int64,2,,,,,0.008805,0.0,"(0, 158166)",1.0,10.515923,108.585989,df_train
8,id,0,0,object,153164,1.0,21.0,16.000346,16.0,,,,,,,df_test_data
9,comment_text,0,0,object,153164,1.0,5000.0,367.484899,182.0,,,,,,,df_test_data


### Column Dictionary:

1. id: a unique identifier for that comment.
2. comment_text: a string containing an example comment to train the model on.


3. toxic: a binary numerical identifier to state if the comment contains malicious content.

4. severe_toxic: a binary numerical identifier to state if the comment contains highly offensive malicious content.

5. obscene: a binary numerical identifier to state if the comment contains curse words or not.

6. threat: a binary numerical identifier to state if the comment contains a threat or not.

7. insult: a binary numerical identifier to state if the comment contains a personal insult or not.

8. identity_hate: a binary numerical identifier to state if the comment contains offensive material based on the recipient's characteristics, e.g. racism, sexism, etc.

In [5]:
# Training and test data look fine, but why do test_labels have 3 unique values instead of 2?
# Look at the label columns only (the first column is the string 'id') and collect the
# distinct values across ALL of them, rather than just the last column visited.
label_columns = df_test_labels.columns[1:]
print('First check: ', np.unique(df_test_labels[label_columns].to_numpy()))

# Aha, it follows a different convention. We need to clean this up to ensure it matches the others.
# After a quick visual inspection, it seems the -1s are for non-toxic classifications,
# so they are changed to 0 to match the conventions of the training data.
# (The previous code wrote 1 here, which would have flagged every such row as toxic
# in all six categories.)
# NOTE(review): in the public Jigsaw/Kaggle release of this data, -1 appears to mark rows
# that were not used for scoring rather than non-toxic comments - please confirm. If so,
# those rows (and the matching rows of the test text) should be dropped instead of recoded.
df_test_labels[label_columns] = df_test_labels[label_columns].replace(-1, 0)
print('Second check: ', np.unique(df_test_labels[label_columns].to_numpy()))

First check:  [-1  0  1]
Second check:  [0 1]


In [6]:
# Profile the test labels again now that they have been recoded:
df_test_labels_result = data_integrity_check(df=df_test_labels, title='df_test_labels')
df_test_labels_result

Unnamed: 0,Column,Null Values,Duplicate Values,Data Type,Unique Values,Min Text Length,Max Text Length,Mean Text Length,Median Text Length,Mean,Median,Mode,Range,Skew,Kurtosis,Source
0,id,0,0,object,153164,1.0,21.0,16.000346,16.0,,,,,,,df_test_labels
1,toxic,0,153162,int64,2,,,,,0.039761,0.0,"(0, 147074)",1.0,4.710831,20.192188,df_test_labels
2,severe_toxic,0,153162,int64,2,,,,,0.002396,0.0,"(0, 152797)",1.0,20.355616,412.356502,df_test_labels
3,obscene,0,153162,int64,2,,,,,0.024098,0.0,"(0, 149473)",1.0,6.206614,36.522538,df_test_labels
4,threat,0,153162,int64,2,,,,,0.001378,0.0,"(0, 152953)",1.0,26.887009,720.920687,df_test_labels
5,insult,0,153162,int64,2,,,,,0.022375,0.0,"(0, 149737)",1.0,6.458872,39.71754,df_test_labels
6,identity_hate,0,153162,int64,2,,,,,0.004649,0.0,"(0, 152452)",1.0,14.564574,210.129547,df_test_labels


In [7]:
# Count how often each combination of the label columns (third column onwards) occurs:
df_train.iloc[:, 2:].value_counts()

toxic  severe_toxic  obscene  threat  insult  identity_hate
0      0             0        0       0       0                143346
1      0             0        0       0       0                  5666
                     1        0       1       0                  3800
                                      0       0                  1758
                     0        0       1       0                  1215
       1             1        0       1       0                   989
       0             1        0       1       1                   618
0      0             1        0       0       0                   317
                     0        0       1       0                   301
1      1             1        0       1       1                   265
0      0             1        0       1       0                   181
1      1             1        0       0       0                   158
       0             0        0       0       1                   136
                              

Okay, that seems to have done the job. It is worth noting a major class imbalance: 16,225 comments carry at least one toxic label versus 143,346 that carry none, which might bias the training of models. Now that we think about it, how do the other two datasets compare?

In [8]:
# Same label-combination count for the test labels (every column after 'id'):
df_test_labels.iloc[:, 1:].value_counts()

toxic  severe_toxic  obscene  threat  insult  identity_hate
0      0             0        0       0       0                146921
1      0             1        0       1       0                  1932
                     0        0       0       0                  1710
                     1        0       0       0                   854
                     0        0       1       0                   539
                     1        0       1       1                   361
       1             1        0       1       0                   176
                                              1                   116
       0             0        0       1       1                    81
                                      0       1                    67
                     1        1       1       0                    65
0      0             0        0       1       0                    64
1      0             0        1       0       0                    50
0      0             1        

The test data labels seem to have the same issue, far more non-toxic than toxic comments. This is something to keep in mind when training or doing classification metrics.

In [9]:
# Preview the raw comment text before any preprocessing:
df_train.loc[:, 'comment_text']

0         Explanation\r\nWhy the edits made under my use...
1         D'aww! He matches this background colour I'm s...
2         Hey man, I'm really not trying to edit war. It...
3         "\r\nMore\r\nI can't make any real suggestions...
4         You, sir, are my hero. Any chance you remember...
                                ...                        
159566    ":::::And for the second time of asking, when ...
159567    You should be ashamed of yourself \r\n\r\nThat...
159568    Spitzer \r\n\r\nUmm, theres no actual article ...
159569    And it looks like it was actually you who put ...
159570    "\r\nAnd ... I really don't think you understa...
Name: comment_text, Length: 159571, dtype: object

In [10]:
# Separate the raw comment text (X) from the label columns to predict (y):
X = df_train.loc[:, 'comment_text']
y = df_train.iloc[:, 2:].to_numpy()

# Tokenize with NLTK and weight with TF-IDF via feature_vectorizer:
X_processed = feature_vectorizer(corpus=X)



That feature_vectorizer function is designed to preprocess a corpus of text data by tokenizing it, removing stopwords, and generating a document-term matrix using the CountVectorizer from scikit-learn. It first defines a list of stopwords and a whitelist of exceptions, then removes stopwords from the corpus while preserving those in the whitelist. 
The function utilizes the CountVectorizer with specific configurations such as lowercase conversion, tokenization using NLTK's word_tokenize function, minimum document frequency of 2, and unigram to bigram n-gram range. Additionally, it applies the TF-IDF transformation to the resulting document-term matrix to weigh the importance of terms in the corpus. Finally, it returns the processed corpus in its vectorized form suitable for further analysis or modeling tasks like we want to do here.

In [11]:
# Check vectorizer worked as intended:
# (rows are documents, columns are the terms kept by the vectorizer)
X_processed.shape

(159571, 678726)

This should now represent a document-term matrix with 159,571 rows and 678,726 features produced from TF-IDF. 

TF-IDF (term frequency-inverse document frequency) is a statistical measure that evaluates how relevant a word is to a document in a collection of documents. This is done by multiplying two metrics: how many times a word appears in a document, and the inverse document frequency of the word across a set of documents. It has many uses, most importantly in automated text analysis, and is very useful for scoring words in machine learning algorithms for Natural Language Processing (NLP).

However, it is probably worth doing some dimensionality reduction, because X_processed has an enormous dimensionality: 159,571 rows by 678,726 columns!
Dimensionality reduction is crucial in NLP for coping with high-dimensional data, typical of large vocabularies or intricate word embeddings. This helps us to handle the "curse of dimensionality" which refers to challenges with high-dimensional data, including data sparsity which affects our ability to make statistical inferences, increased computational complexity which bloats execution times, and heightened risk of overfitting which leads to poor model performance on new data. By compressing the feature space while preserving essential information, these techniques improve computational efficiency, enhance model generalization, and facilitate insightful data exploration and interpretation.

For this use case, I suggest Latent Semantic Analysis (LSA) which is an unsupervised learning technique to extract and represent the underlying semantic structure of textual data by constructing a mathematical model of the relationships between terms and documents in a corpus. 

LSA uses a matrix factorization method, typically Singular Value Decomposition (SVD), to decompose a term-document matrix into lower-dimensional representations. In the context of LSA and natural language processing, SVD is commonly applied to decompose the term-document matrix into three matrices: the left singular vectors matrix, the diagonal singular values matrix, and the right singular vectors matrix. These matrices represent the relationships between terms and documents in a lower-dimensional space, where the singular values capture the importance of each dimension. 

SVD enables dimensionality reduction by retaining only the most significant singular values and their corresponding singular vectors, thus revealing the underlying latent semantic structure of the data. These representations, known as latent semantic dimensions, capture the latent relationships between terms and documents.

In [12]:
# Configure Latent Semantic Analysis (truncated SVD).
# This reduces X_processed from 678,726 features down to 400;
# the component count can be revisited later if needed or wanted.
lsa_tool = TruncatedSVD(
    n_components=400,
    n_iter=100,
    random_state=123,
)

# Learn the projection from the vectorized training data and apply it in one step:
X_lsa = lsa_tool.fit_transform(X_processed)

In [13]:
# Inspect output:
# (expect one row per training comment and n_components = 400 columns)
X_lsa.shape

(159571, 400)

In [14]:
# Looks good - wrap the reduced training matrix in a DataFrame and write it to disk for convenience.
df_lsa = pd.DataFrame(data=X_lsa)
df_lsa.to_csv(path_or_buf='lsa_train_output.csv', index=False)
print('LSA Output on training data X_processed converted to Pandas dataframe and saved as CSV.')

LSA Output on training data X_processed converted to Pandas dataframe and saved as CSV.


In [15]:
# Okay, let's do the test data too.
# NOTE: X and y are re-bound to the *test* split from here on (as before); the training
# text is read straight from df_train where it is needed below.
X = df_test_data['comment_text']
y = df_test_labels[df_test_labels.columns[1:]]

# The test comments must be mapped into the SAME feature space as the training comments.
# Fitting a fresh vectorizer (and a fresh LSA) on the test text would give a different
# vocabulary and different latent axes, so the 400 test columns would not mean the same
# thing as the 400 training columns and any model trained on X_lsa could not be evaluated.
# feature_vectorizer() does not hand back its fitted steps, so an identically configured
# vectorizer is fitted on the TRAINING text here and then only *applied* to the test text.
sa_white_list = {
    'what', 'but', 'if', 'because', 'as', 'until', 'against', 'up', 'down', 'in', 'out',
    'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'why',
    'how', 'all', 'any', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own',
    'same', 'so', 'than', 'too', 'can', 'will', 'just', 'don', 'should',
}
sa_stop_words = [sw for sw in stopwords.words('english') if sw not in sa_white_list]
train_count_vectorizer = CountVectorizer(
    lowercase=True,
    tokenizer=nltk.word_tokenize,
    min_df=2,
    ngram_range=(1, 2),
    stop_words=sa_stop_words
)
train_counts = train_count_vectorizer.fit_transform(df_train['comment_text'])
train_tfidf = feature_extraction.text.TfidfTransformer().fit(train_counts)

# Transform only - the vocabulary and IDF weights come from the training data:
X_test_processed = train_tfidf.transform(train_count_vectorizer.transform(X))

# Guard against the configuration above drifting away from the one lsa_tool was fitted on:
assert X_test_processed.shape[1] == lsa_tool.components_.shape[1], (
    'Test TF-IDF matrix does not have the feature count lsa_tool was fitted on.'
)

# Project with the LSA that was fitted on the training data (transform, not fit_transform):
X_test_lsa = lsa_tool.transform(X_test_processed)

# Inspect output:
X_test_lsa.shape

(153164, 400)

In [16]:
# Looks good - wrap the reduced test matrix in a DataFrame and write it to disk for convenience.
df_test_lsa = pd.DataFrame(data=X_test_lsa)
df_test_lsa.to_csv(path_or_buf='lsa_test_output.csv', index=False)
print('LSA Output on test data X_processed converted to Pandas dataframe and saved as CSV.')

LSA Output on test data X_processed converted to Pandas dataframe and saved as CSV.
