# Install Libraries

In [1]:
!pip install sklearn -q
!pip install spacy -q
!python -m spacy download en_core_web_sm

  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for sklearn (setup.py) ... [?25l[?25hdone
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting en-core-web-sm==3.4.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.4.1/en_core_web_sm-3.4.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m90.5 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [2]:
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from sklearn.pipeline import Pipeline
from sklearn import metrics

import string
import spacy
np.random.seed(42)

In [3]:
data = pd.read_csv("/content/drive/MyDrive/Datasets/toxic.csv", nrows=1000)
data.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [4]:
# Drop unnecessary columns
data.drop(["id",'severe_toxic', 'obscene', "threat", "insult", "identity_hate" ], axis=1)

Unnamed: 0,comment_text,toxic
0,Explanation\nWhy the edits made under my usern...,0
1,D'aww! He matches this background colour I'm s...,0
2,"Hey man, I'm really not trying to edit war. It...",0
3,"""\nMore\nI can't make any real suggestions on ...",0
4,"You, sir, are my hero. Any chance you remember...",0
...,...,...
995,""" Hi, Writingrights, Welcome to Wikipedia! \n...",0
996,It is common knowledge that Karaims (but not K...,0
997,", 12 April 2006 (UTC)\nThen rewrite and expand...",0
998,"""I was trying to inject some humour (as eviden...",0


In [5]:
nlp = spacy.load("en_core_web_sm")

In [6]:
stop_words = nlp.Defaults.stop_words
print(stop_words)

{'out', 'what', 'here', 'hereupon', 'our', 'moreover', 'now', '’ve', 'indeed', 'someone', 'their', 'above', 'ten', 'a', 'all', 'side', 'anyhow', 'becoming', 'too', 'one', 'done', 'hundred', 'alone', 'can', 'several', 'are', 'were', 'onto', 'twelve', 'bottom', 'through', 'hence', 'cannot', 'yourself', 'do', 'so', 'my', 'put', 'thru', 'them', 'give', 'anywhere', 'these', 'together', 'then', 'front', 'either', 'sometime', 'nine', 'still', 'another', 'therein', 'further', 'perhaps', 'made', 'serious', 'who', 'already', 'few', 'please', 'towards', 'very', '’re', 'its', 'will', 'say', 'with', 'between', 'was', 'rather', 'no', 'otherwise', 'part', 'six', 'whereby', 'least', 'throughout', 'each', 'else', 'hers', 'last', 'from', 'along', 'three', 'two', 'both', 'beside', 'used', 'show', 'some', 'afterwards', 'those', 'due', 'am', 'per', 'never', 'whole', 'anyway', 'much', 'his', '‘re', 'ca', 'fifty', 'thereupon', 'would', 'anything', 'although', 'about', 'nevertheless', 'it', 'wherever', 'eight

In [7]:
punctuations = string.punctuation
print(punctuations)

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


In [8]:
#creating our tokenizer functions
def spacy_tokenizer(sentence):
  doc = nlp(sentence)

  print(doc)

  # Lemmatizing each token and converting each token into lowercase
  mytokens = [word.lemma_.lower().strip() for word in doc]

  print(mytokens)

  #Removing stop words

  mytokens = [word for word in mytokens if word not in stop_words and word not in punctuations]

  return mytokens

In [9]:
sentence = "I am eating apple ?"
spacy_tokenizer(sentence)

I am eating apple ?
['i', 'be', 'eat', 'apple', '?']


['eat', 'apple']

In [10]:
count_vector = CountVectorizer(tokenizer = spacy_tokenizer, min_df=1)
#tfidf_vector = TfidfVectorizer(tokenizer = spacy_tokenizer)

In [11]:
count_vector.fit_transform(["I am eating apple, I like apple", "I am playing cricket"]).toarray()
print()
#tfidf_vector.fit_transform(["I am eating apple, I like apple", "I am playing cricket"]).toarray()

i am eating apple, i like apple
['i', 'be', 'eat', 'apple', ',', 'i', 'like', 'apple']
i am playing cricket
['i', 'be', 'play', 'cricket']



In [12]:
count_vector.get_feature_names_out()

array(['apple', 'cricket', 'eat', 'like', 'play'], dtype=object)

In [13]:
count_vector.vocabulary_

{'eat': 2, 'apple': 0, 'like': 3, 'play': 4, 'cricket': 1}

# Model Building

In [14]:
from sklearn.model_selection import train_test_split

X = data["comment_text"] #the features we want to analyze
y = data["toxic"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=42)

In [15]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

X_train_vectors = count_vector.fit_transform(X_train)
X_test_vectors = count_vector.transform(X_test)

classifier = LogisticRegression()

there are many broken nostrils in this pancreas
['there', 'be', 'many', 'broken', 'nostril', 'in', 'this', 'pancrea']
"
i fear jossi may be correct. if watts (who, btw, died in 1973) said this about rawat, he probably meant it as praise; i.e. the emphasis would have been on ""sacred"", judging from my knowledge of watts' oeuvre. he was rather anti-intellectual himself. 466 "
['"', '', 'i', 'fear', 'jossi', 'may', 'be', 'correct', '.', 'if', 'watt', '(', 'who', ',', 'btw', ',', 'die', 'in', '1973', ')', 'say', 'this', 'about', 'rawat', ',', 'he', 'probably', 'mean', 'it', 'as', 'praise', ';', 'i.e.', 'the', 'emphasis', 'would', 'have', 'be', 'on', '"', '"', 'sacred', '"', '"', ',', 'judge', 'from', 'my', 'knowledge', 'of', 'watts', "'", 'oeuvre', '.', 'he', 'be', 'rather', 'anti', '-', 'intellectual', 'himself', '.', '466', '"']
"  and if he were a mere ""fellow editor,"" i wouldn't have any problem with him.  but since he seems to let the power of being an admin go to his head, i have 

In [16]:
X_train_vectors.shape

(700, 6529)

In [17]:
classifier.fit(X_train_vectors, y_train)
predicted = classifier.predict(X_test_vectors)


In [18]:

print("Logisitc Regression Accuracy:", metrics.accuracy_score(y_test, predicted))
print("Logisitc Regression Precision:", metrics.precision_score(y_test, predicted))
print("Logisitc Regression Recall:", metrics.recall_score(y_test, predicted))
print("Logisitc Regression F1 score:", metrics.f1_score(y_test, predicted))

Logisitc Regression Accuracy: 0.92
Logisitc Regression Precision: 1.0
Logisitc Regression Recall: 0.22580645161290322
Logisitc Regression F1 score: 0.3684210526315789
