<a href="https://colab.research.google.com/github/KTx735/Comment-Classifier/blob/main/Comment_Classifier_Final_Project_SLP_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Info

-- train.csv - the training set, contains comments with their binary labels

-- test.csv - the test set, you must predict the toxicity probabilities for these comments. To deter hand labeling, the test set contains some comments which are not included in scoring.

-- sample_submission.csv - a sample submission file in the correct format

-- test_labels.csv - labels for the test data; value of -1 indicates it was not used for scoring; (Note: file added after competition close!)


# Useful Resources

https://www.geeksforgeeks.org/snowball-stemmer-nlp/

https://stackoverflow.com/questions/51420032/using-saved-sklearn-model-to-make-prediction

https://docs.python.org/3/library/pickle.html

https://scikit-learn.org/stable/modules/generated/sklearn.multiclass.OneVsRestClassifier.html

# Import

In [None]:
import pandas as pd
import numpy as np
import re
import string
import pickle
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
from nltk.stem.snowball import SnowballStemmer

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# NLP resources shared by the preprocessing helpers: the English Snowball
# stemmer and NLTK's English stop-word list.
sn = SnowballStemmer(language='english')
stopwords_english = stopwords.words('english')

# Train and Test


In [None]:
# Load every competition CSV from the working directory; the unpacking order
# matches the order of the file names.
train_data, test_data, test_labels, sample_sub = (
    pd.read_csv(csv_name)
    for csv_name in ('train.csv', 'test.csv', 'test_labels.csv', 'sample_submission.csv')
)

In [None]:
# Sanity-check the number of rows loaded from each CSV.
print("Train Data:", len(train_data))
print("Test Data:", len(test_data))
print("Test Labels:", len(test_labels))

Train Data: 159571
Test Data: 153164
Test Lables: 153164


In [None]:
# Summary statistics (count, mean, std, quartiles) for the numeric columns.
train_data.describe()

Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate
count,159571.0,159571.0,159571.0,159571.0,159571.0,159571.0
mean,0.095844,0.009996,0.052948,0.002996,0.049364,0.008805
std,0.294379,0.099477,0.223931,0.05465,0.216627,0.09342
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0


In [None]:
# Share of rows flagged with each label, as a percentage of the training set
comments = train_data.drop(columns=['id', 'comment_text'])
for label, flags in comments.items():
    print(f"Percent of {label}s: ", round(100 * flags.mean(), 2), "%")

Percent of toxics:  9.58 %
Percent of severe_toxics:  1.0 %
Percent of obscenes:  5.29 %
Percent of threats:  0.3 %
Percent of insults:  4.94 %
Percent of identity_hates:  0.88 %


# Clean Text

In [None]:
# Contraction / slang expansions applied, in this order, to lower-cased text.
# Order matters: specific forms ("won't", "can't") must run before the
# generic "n't" rule.
_TEXT_REPLACEMENTS = (
    (r"i'm", "i am"),
    (r"\r", ""),
    (r"he's", "he is"),
    (r"she's", "she is"),
    (r"it's", "it is"),
    (r"that's", "that is"),
    (r"what's", "what is"),
    (r"where's", "where is"),
    (r"how's", "how is"),
    (r"'ll", " will"),
    (r"'ve", " have"),
    (r"'re", " are"),
    (r"'d", " would"),
    (r"won't", "will not"),
    (r"can't", "cannot"),
    (r"n't", " not"),
    # Dropped-g slang ("goin'" -> "going"). The lookahead keeps possessives
    # such as "john's" from being rewritten to "johngs".
    (r"n'(?!\w)", "ng"),
    (r"'bout", "about"),
    (r"'til", "until"),
)

# Deletes every ASCII punctuation character in a single pass.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def clean_text(text):
    """Lower-case ``text``, expand common contractions and strip noise.

    Steps, in order:
      1. lower-case the text;
      2. apply the expansions in ``_TEXT_REPLACEMENTS`` (this also drops
         carriage returns);
      3. remove every ASCII punctuation character;
      4. replace each remaining non-word character (newlines, symbols) with a space;
      5. delete every token containing a digit, together with the whitespace
         that follows it.

    Args:
        text: The raw comment as a ``str``.

    Returns:
        The cleaned, lower-cased string.
    """
    text = text.lower()
    for pattern, replacement in _TEXT_REPLACEMENTS:
        text = re.sub(pattern, replacement, text)
    text = text.translate(_PUNCTUATION_TABLE)
    text = re.sub(r"\W", " ", text)
    # Raw string: "\S" and "\d" are regex classes, not Python string escapes.
    text = re.sub(r"\S*\d\S*\s*", "", text)
    return text

In [None]:
# Normalise every training comment in place, then preview the first rows.
train_data['comment_text'] = train_data['comment_text'].apply(clean_text)
train_data.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,explanation why the edits made under my userna...,0,0,0,0,0,0
1,000103f0d9cfb60f,daww he matches this background colour i am se...,0,0,0,0,0,0
2,000113f07ec002fd,hey man i am really not trying to edit war it ...,0,0,0,0,0,0
3,0001b41b1c6bb37e,more i cannot make any real suggestions on im...,0,0,0,0,0,0
4,0001d958c54c6e35,you sir are my hero any chance you remember wh...,0,0,0,0,0,0


In [None]:
# Build the stop-word lookup once. Rebuilding it from the NLTK corpus for
# every single token made stemming the corpus needlessly slow.
_STOPWORD_SET = frozenset(stopwords_english)


def stemmer(text):
    """Drop English stop words from ``text`` and stem the remaining tokens.

    Args:
        text: Whitespace-separated text (the notebook passes the output of
            ``clean_text``).

    Returns:
        The Snowball-stemmed tokens joined by single spaces.
    """
    return ' '.join(
        sn.stem(word) for word in text.split() if word not in _STOPWORD_SET
    )

In [None]:
# Stem the cleaned comments in place and peek at the result.
train_data['comment_text'] = train_data['comment_text'].apply(stemmer)
train_data['comment_text'].head()

0    explan edit made usernam hardcor metallica fan...
1    daww match background colour seem stuck thank ...
2    hey man realli tri edit war guy constant remov...
3    cannot make real suggest improv wonder section...
4                           sir hero chanc rememb page
Name: comment_text, dtype: object

GPU runtime = 28 min 58 sec

In [None]:
# Features are the preprocessed comment strings; targets are the label
# columns (everything except the id and the text itself).
x = train_data['comment_text']
y = train_data.drop(columns=['id', 'comment_text'])

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=45,
)

In [None]:
# Word-level TF-IDF over unigrams, bigrams and trigrams with sublinear
# term-frequency scaling.
word_vectorizer = TfidfVectorizer(
    strip_accents='unicode',
    analyzer='word',
    token_pattern=r'\w{1,}',
    ngram_range=(1, 3),
    stop_words='english',
    sublinear_tf=True)

# fit_transform learns the vocabulary/IDF weights and vectorises the training
# split in one pass, instead of tokenising the corpus twice (fit, then
# transform).
train_word_features = word_vectorizer.fit_transform(x_train)

In [None]:
# Persist the fitted vectorizer, then reload it as `vectorize` for inference.
# Context managers guarantee the file handles are flushed and closed.
with open('vectorizer.sav', 'wb') as vectorizer_file:
    pickle.dump(word_vectorizer, vectorizer_file)

# NOTE: pickle.load can execute arbitrary code; only load files this notebook wrote.
with open('vectorizer.sav', 'rb') as vectorizer_file:
    vectorize = pickle.load(vectorizer_file)

In [None]:
# Vectorise both splits with the vocabulary learned from the training data.
X_train_transformed, X_test_transformed = (
    word_vectorizer.transform(split) for split in (x_train, x_test)
)

In [None]:
def make_test_predictions(df, classifier):
    """Classify the first comment in ``df`` as offensive or normal.

    The comment text is cleaned with ``clean_text``, stemmed with ``stemmer``
    and vectorised with the reloaded ``vectorize`` object before scoring.

    Args:
        df: DataFrame with a ``comment_text`` column. It is not modified.
        classifier: Fitted model exposing ``predict_proba``.

    Returns:
        ``"Offensive Comment"`` when the per-label probabilities of the first
        row sum to at least 1, otherwise ``"Normal Comment"``.
    """
    # Preprocess into a new Series so the caller's DataFrame stays untouched.
    prepared = df.comment_text.apply(clean_text).apply(stemmer)
    features = vectorize.transform(prepared)
    # Score with the classifier that was passed in, not a notebook global
    # that may not exist yet when this function is called.
    probabilities = classifier.predict_proba(features)
    # Only the first row is inspected; the decision rule is the sum of its
    # label probabilities compared against 1.
    total = sum(probabilities[0])
    if total >= 1:
        return "Offensive Comment"
    return "Normal Comment"

In [None]:
# Wrap an L2-regularised logistic regression in a one-vs-rest classifier and
# fit it on the vectorised training split.
logistic_regression = LogisticRegression(
    C=10,
    penalty='l2',
    solver='liblinear',
    random_state=45,
)
classifier = OneVsRestClassifier(logistic_regression)
classifier.fit(X_train_transformed, y_train)

OneVsRestClassifier(estimator=LogisticRegression(C=10, random_state=45,
                                                 solver='liblinear'))

In [None]:
# Score the held-out split; precision and recall are micro-averaged.
# NOTE(review): with multi-label targets accuracy_score is presumably subset
# accuracy (every label of a row must match) — confirm against scikit-learn docs.
pred = classifier.predict(X_test_transformed)

accuracy = accuracy_score(y_test, pred)
precision = precision_score(y_test, pred, average='micro')
recall = recall_score(y_test, pred, average='micro')

print(f'Accuracy {accuracy:.4}')
print(f'Precision {precision:.4}')
print(f'Recall {recall:.4}')

Accuracy 0.9194
Precision 0.7924
Recall 0.6753


In [None]:
# Persist the fitted classifier, then reload it as `loaded_model` for inference.
# Context managers guarantee the file handles are flushed and closed.
with open('classifier.sav', 'wb') as model_file:
    pickle.dump(classifier, model_file)

# NOTE: pickle.load can execute arbitrary code; only load files this notebook wrote.
with open('classifier.sav', 'rb') as model_file:
    loaded_model = pickle.load(model_file)

In [None]:
def text_analyzer(text):
    """Print ``text`` followed by the verdict ``make_test_predictions`` gives it.

    The comment is wrapped in a one-row DataFrame (with a hard-coded id of 565)
    because ``make_test_predictions`` reads a ``comment_text`` column.
    """
    print("Comment:", text)
    single_row = pd.DataFrame({'id': [565], 'comment_text': [text]})
    print(make_test_predictions(single_row, loaded_model))

In [None]:
# Classify the raw test comment stored at index label 1358.
text_analyzer(test_data.loc[1358, 'comment_text'])

Comment: Thank you so much for the encouragement. I would surely be cautious but if I am sure I may remove the school that is not affiliated to the Society.   •
Normal Comment


In [None]:
# Hand-written example to check the classifier on an obviously abusive comment.
text_analyzer("You are a piece of shit")

Comment: You are a piece of shit
Offensive Comment


In [None]:
# Classify the raw test comment stored at index label 154.
text_analyzer(test_data.loc[154, 'comment_text'])

Comment: == Don't feed the troll == 

 Responding with taunting is exactly the wrong way to respond.  Don't feed the troll.  Remove the post without comment, or don't do anything at all.
Normal Comment


In [None]:
# Classify the raw test comment stored at index label 58.
text_analyzer(test_data.loc[58, 'comment_text'])

Comment: == Can you work your magic? == 

 Hi.  I was wondring if you had time to help out with Wikipedia:Featured article candidates/W. E. B. Du Bois/archive1.  Any input would be appreciated:  a quick scan or a full review.  I've noticed your work before and I respect your skills. Thanks.
Normal Comment


In [None]:
# Hand-written example to check the classifier on an explicit threat.
text_analyzer("I will fucking kill you!")

Comment: I will fucking kill you!
Offensive Comment
