**Toxic Comment Classification - Processing Test Dataset**

- Understanding the test dataset
- Processing the test dataset so that it is ready for model validation

<div style="border-top: 7px solid #800080; animation: sparkling 2s linear infinite;"></div>

<style>
@keyframes sparkling {
  0% { background-position: 0 0; }
  100% { background-position: 100% 0; }
}
</style>

In [13]:
import pandas as pd
import spacy
import re

In [14]:
# Load the spaCy "en_core_web_lg" language model once; the resulting `nlp`
# object is reused further down to process every comment.
nlp = spacy.load("en_core_web_lg")

In [15]:
# Read the test comments and their label file (both paths are relative to the
# notebook's working directory). The two frames are joined on "id" later on.
test_df = pd.read_csv('test.csv')
test_label = pd.read_csv('test_labels.csv')

In [16]:
# Summary statistics of the label columns; note in the output that the minimum
# is -1, i.e. the labels take a third value besides 0 and 1.
test_label.describe()

Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate
count,153164.0,153164.0,153164.0,153164.0,153164.0,153164.0
mean,-0.54253,-0.579895,-0.558193,-0.580913,-0.559916,-0.577642
std,0.572465,0.498408,0.542966,0.496195,0.539594,0.50326
min,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
25%,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
50%,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
75%,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0


In [17]:
# Preview the raw test comments (columns: id, comment_text)
test_df

Unnamed: 0,id,comment_text
0,00001cee341fdb12,Yo bitch Ja Rule is more succesful then you'll...
1,0000247867823ef7,== From RfC == \n\n The title is fine as it is...
2,00013b17ad220c46,""" \n\n == Sources == \n\n * Zawe Ashton on Lap..."
3,00017563c3f7919a,":If you have a look back at the source, the in..."
4,00017695ad8997eb,I don't anonymously edit articles at all.
...,...,...
153159,fffcd0960ee309b5,". \n i totally agree, this stuff is nothing bu..."
153160,fffd7a9a6eb32c16,== Throw from out field to home plate. == \n\n...
153161,fffda9e8d6fafa9e,""" \n\n == Okinotorishima categories == \n\n I ..."
153162,fffe8f1340a79fc2,""" \n\n == """"One of the founding nations of the..."


In [18]:
# Attach the label columns to each comment by joining the two frames on "id"
test_df = test_df.merge(test_label, on="id")

In [19]:
# Preview the merged frame: each comment now carries its label columns
test_df

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,Yo bitch Ja Rule is more succesful then you'll...,-1,-1,-1,-1,-1,-1
1,0000247867823ef7,== From RfC == \n\n The title is fine as it is...,-1,-1,-1,-1,-1,-1
2,00013b17ad220c46,""" \n\n == Sources == \n\n * Zawe Ashton on Lap...",-1,-1,-1,-1,-1,-1
3,00017563c3f7919a,":If you have a look back at the source, the in...",-1,-1,-1,-1,-1,-1
4,00017695ad8997eb,I don't anonymously edit articles at all.,-1,-1,-1,-1,-1,-1
...,...,...,...,...,...,...,...,...
153159,fffcd0960ee309b5,". \n i totally agree, this stuff is nothing bu...",-1,-1,-1,-1,-1,-1
153160,fffd7a9a6eb32c16,== Throw from out field to home plate. == \n\n...,-1,-1,-1,-1,-1,-1
153161,fffda9e8d6fafa9e,""" \n\n == Okinotorishima categories == \n\n I ...",-1,-1,-1,-1,-1,-1
153162,fffe8f1340a79fc2,""" \n\n == """"One of the founding nations of the...",-1,-1,-1,-1,-1,-1


In [20]:
# Names of the label columns: every column of the label file except "id"
columns = test_label.columns.drop("id").tolist()

In [21]:
# Work on the label columns only
label_matrix = test_df[columns]

# Rows where at least one label column equals 1
correctly_labeled_cases = (label_matrix == 1).any(axis=1).sum()

# Rows where every label column equals 0
not_labeled_cases = (label_matrix == 0).all(axis=1).sum()

# Rows where every label column equals -1
marked_with_minus_one_cases = (label_matrix == -1).all(axis=1).sum()

# Report the three tallies, one per line
for message, count in (
    ("Total number of cases correctly labeled:", correctly_labeled_cases),
    ("Total number of cases not labeled:", not_labeled_cases),
    ("Total number of cases marked with -1:", marked_with_minus_one_cases),
):
    print(message, count)

Total number of cases correctly labeled: 6243
Total number of cases not labeled: 57735
Total number of cases marked with -1: 89186


In [22]:
# Rows whose label columns are all -1 cannot be used for evaluation:
# flag them with a boolean mask, then drop them by their index labels.
unscored_rows = test_df[columns].eq(-1).all(axis=1)
test_df = test_df.drop(index=unscored_rows[unscored_rows].index)

In [23]:
# Preview the rows that remain after removing the all -1 rows
test_df

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
5,0001ea8717f6de06,Thank you for understanding. I think very high...,0,0,0,0,0,0
7,000247e83dcc1211,:Dear god this site is horrible.,0,0,0,0,0,0
11,0002f87b16116a7f,"""::: Somebody will invariably try to add Relig...",0,0,0,0,0,0
13,0003e1cccfd5a40a,""" \n\n It says it right there that it IS a typ...",0,0,0,0,0,0
14,00059ace3e3e9a53,""" \n\n == Before adding a new product to the l...",0,0,0,0,0,0
...,...,...,...,...,...,...,...,...
153150,fff8f64043129fa2,":Jerome, I see you never got around to this…! ...",0,0,0,0,0,0
153151,fff9d70fe0722906,==Lucky bastard== \n http://wikimediafoundatio...,0,0,0,0,0,0
153154,fffa8a11c4378854,==shame on you all!!!== \n\n You want to speak...,0,0,0,0,0,0
153155,fffac2a094c8e0e2,MEL GIBSON IS A NAZI BITCH WHO MAKES SHITTY MO...,1,0,1,0,1,0


In [24]:
# Run the same NLP pipeline as for the training set on comment_text to reprocess the test data

# The patterns are written as raw strings so that regex escapes such as \s and \.
# are not interpreted as (invalid) Python string escapes, and they are compiled
# once here instead of on every loop iteration.
_URL_RE = re.compile(r'((www\.[^\s]+)|(https?://[^\s]+))')
_MENTION_RE = re.compile(r'@[^\s]+')
_HASHTAG_RE = re.compile(r'#([^\s]+)')
_PUNCTUATION_RE = re.compile(r'[\!:\?\-\'\"\\/]')
_WHITESPACE_RE = re.compile(r'[\s]+')


def _clean_comment(text):
    """Normalise one raw comment string before it is handed to spaCy.

    Steps, applied in this order:
      1. lowercase the text;
      2. delete URLs (tokens starting with ``www.`` or ``http(s)://``);
      3. delete ``@mention`` tokens;
      4. strip the leading ``#`` from hashtags, keeping the tag text;
      5. delete the characters ``! : ? - ' " \\ /``;
      6. collapse every run of whitespace into a single space.

    Parameters
    ----------
    text : str
        The raw comment.

    Returns
    -------
    str
        The cleaned comment.
    """
    text = text.lower()
    text = _URL_RE.sub('', text)
    text = _MENTION_RE.sub('', text)
    text = _HASHTAG_RE.sub(r'\1', text)
    text = _PUNCTUATION_RE.sub(r'', text)
    text = _WHITESPACE_RE.sub(' ', text)
    return text


# One entry per row of test_df, in row order
token_nonstop = []
lemmas = []
number_tokens = []
number_sentences = []

for text in test_df["comment_text"].astype("unicode").values:
    # Process the cleaned text with spaCy
    doc = nlp(_clean_comment(text))
    if doc.has_annotation("DEP"):
        # Filter once: alphabetic, non-stop-word tokens feed all three features
        content_tokens = [t for t in doc if t.is_alpha and not t.is_stop]
        token_nonstop.append([t.text for t in content_tokens])
        lemmas.append([t.lemma_ for t in content_tokens])
        number_tokens.append(len(content_tokens))
        number_sentences.append(sum(1 for _ in doc.sents))
    else:
        # No dependency annotation: store placeholders so the lists stay
        # aligned with the rows of test_df
        token_nonstop.append(None)
        lemmas.append(None)
        number_tokens.append(None)
        number_sentences.append(None)

test_df['token_nonstop'] = token_nonstop
test_df['lemmas'] = lemmas
test_df['number_tokens'] = number_tokens
test_df['number_sentences'] = number_sentences

In [25]:
# Preview the frame with the added token_nonstop, lemmas, number_tokens and
# number_sentences columns
test_df

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,token_nonstop,lemmas,number_tokens,number_sentences
5,0001ea8717f6de06,Thank you for understanding. I think very high...,0,0,0,0,0,0,"[thank, understanding, think, highly, revert, ...","[thank, understanding, think, highly, revert, ...",6,2
7,000247e83dcc1211,:Dear god this site is horrible.,0,0,0,0,0,0,"[dear, god, site, horrible]","[dear, god, site, horrible]",4,1
11,0002f87b16116a7f,"""::: Somebody will invariably try to add Relig...",0,0,0,0,0,0,"[somebody, invariably, try, add, religion, mea...","[somebody, invariably, try, add, religion, mea...",36,2
13,0003e1cccfd5a40a,""" \n\n It says it right there that it IS a typ...",0,0,0,0,0,0,"[says, right, type, type, institution, needed,...","[say, right, type, type, institution, need, ca...",42,5
14,00059ace3e3e9a53,""" \n\n == Before adding a new product to the l...",0,0,0,0,0,0,"[adding, new, product, list, sure, relevant, a...","[add, new, product, list, sure, relevant, add,...",24,3
...,...,...,...,...,...,...,...,...,...,...,...,...
153150,fff8f64043129fa2,":Jerome, I see you never got around to this…! ...",0,0,0,0,0,0,"[jerome, got, surprised, looked, example, nomi...","[jerome, get, surprised, look, example, nomine...",36,4
153151,fff9d70fe0722906,==Lucky bastard== \n http://wikimediafoundatio...,0,0,0,0,0,0,"[lucky, heh, famous, kida, envy, congrats]","[lucky, heh, famous, kida, envy, congrat]",6,3
153154,fffa8a11c4378854,==shame on you all!!!== \n\n You want to speak...,0,0,0,0,0,0,"[shame, want, speak, gays, romanians]","[shame, want, speak, gay, romanian]",5,1
153155,fffac2a094c8e0e2,MEL GIBSON IS A NAZI BITCH WHO MAKES SHITTY MO...,1,0,1,0,1,0,"[mel, gibson, nazi, bitch, makes, shitty, movi...","[mel, gibson, nazi, bitch, make, shitty, movie...",12,2


In [26]:
# Write the processed test set to 'ready_test.csv' without the index column.
# NOTE(review): token_nonstop and lemmas hold Python lists; presumably they are
# written as their string representation and need parsing on reload — confirm.
test_df.to_csv('ready_test.csv', index=False)