# Installing model from hugging face


In [1]:
from transformers import pipeline, Conversation
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from collections import Counter
import numpy as np


In [3]:
# Load train empathetic dialogues dataset
dataset = pd.read_csv("train_preprocessed.csv")

In [4]:
# Data cleaning
dataset=dataset.dropna()
dataset


Unnamed: 0,comment_text,id,identity_hate,insult,obscene,set,severe_toxic,threat,toxic,toxicity
0,explanation why the edits made under my userna...,0000997932d777bf,0.0,0.0,0.0,train,0.0,0.0,0.0,0.0
1,d aww he matches this background colour i m s...,000103f0d9cfb60f,0.0,0.0,0.0,train,0.0,0.0,0.0,0.0
2,hey man i m really not trying to edit war it...,000113f07ec002fd,0.0,0.0,0.0,train,0.0,0.0,0.0,0.0
3,more i can t make any real suggestions on im...,0001b41b1c6bb37e,0.0,0.0,0.0,train,0.0,0.0,0.0,0.0
4,you sir are my hero any chance you remember...,0001d958c54c6e35,0.0,0.0,0.0,train,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...
159566,and for the second time of asking when your ...,ffe987279560d7ff,0.0,0.0,0.0,train,0.0,0.0,0.0,0.0
159567,you should be ashamed of yourself that is a ho...,ffea4adeee384e90,0.0,0.0,0.0,train,0.0,0.0,0.0,0.0
159568,spitzer umm theres no actual article for pros...,ffee36eab5c267c9,0.0,0.0,0.0,train,0.0,0.0,0.0,0.0
159569,and it looks like it was actually you who put ...,fff125370e4aaaf3,0.0,0.0,0.0,train,0.0,0.0,0.0,0.0


In [5]:
# Separate features and target labels
X = dataset['comment_text']
y = dataset['toxic']

In [6]:
vect = CountVectorizer()
vect.fit(X)

In [7]:
print("Vocab size: {}".format(len(vect.vocabulary_)))
print("Vocab content:\n {}".format(vect.vocabulary_))

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



# Train Test Split


In [8]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.2, random_state=42)

## Text Vectorization

In [9]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

tfidf_vec = TfidfVectorizer(min_df = 10, token_pattern = r'[a-zA-Z]+')
X_train_bow = tfidf_vec.fit_transform(X_train) # fit train
X_test_bow = tfidf_vec.transform(X_test) # transform test

In [10]:
print(X_train_bow.shape)
print(X_test_bow.shape)

(127656, 19539)
(31915, 19539)


# SMV

In [None]:
# For Calculating model smv score
from sklearn import svm

model_svm = svm.SVC(C=8.0, kernel='linear')
model_svm.fit(X_train_bow, y_train)

In [None]:
print(model_svm.score(X_test_bow, y_test))


In [None]:
from sklearn.metrics import ConfusionMatrixDisplay

ConfusionMatrixDisplay.from_estimator(model_svm, X_test_bow, y_test, normalize='all')

## Decision Tree

In [None]:
# For Calculating model dec score
from sklearn.tree import DecisionTreeClassifier

model_dec = DecisionTreeClassifier(max_depth=10, random_state=0)
model_dec.fit(X_train_bow, y_train)

In [None]:
print(model_dec.score(X_test_bow, y_test))


In [None]:
ConfusionMatrixDisplay.from_estimator(model_dec, X_test_bow.toarray(), y_test, normalize='all')

## Naive Bayes

In [None]:
# For calculating model gnb score
from sklearn.naive_bayes import GaussianNB
model_gnb = GaussianNB()
model_gnb.fit(X_train_bow.toarray(), y_train)

In [None]:
print(model_gnb.score(X_test_bow.toarray(), y_test))


In [None]:
ConfusionMatrixDisplay.from_estimator(model_gnb, X_test_bow.toarray(), y_test, normalize='all')

## Logistic Regression

In [None]:
# For Calculating model lg score
from sklearn.linear_model import LogisticRegression

model_lg = LogisticRegression()
model_lg.fit(X_train_bow, y_train)

In [None]:
print(model_lg.score(X_test_bow, y_test))

In [None]:
ConfusionMatrixDisplay.from_estimator(model_lg, X_test_bow.toarray(), y_test, normalize='all')

# Tesing with self-created comment

In [None]:
## try a whole new self-created comment:)
new_review =['This movie is so so',
             'This movie looks good',
             'Love it and hate it',
             'I want to see it one more time',
             'Hate it and love it']
new_review_bow = tfidf_vec.transform(new_review)

model_svm.predict(new_review_bow)


In [None]:
res = tfidf_vec.transform(["damn you are such a looser"])
model_svm.predict(res)

# Toxic recognize program using HUGGING FACE model

In [None]:
# Running API from hugging face|
import requests

API_URL = "https://api-inference.huggingface.co/models/JungleLee/bert-toxic-comment-classification"
headers = {"Authorization": "Bearer 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"}

def query(payload):
	response = requests.post(API_URL, headers=headers, json=payload)
	return response.json()

output = query({
	"inputs": "I like you. I love you",
})

In [None]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained("JungleLee/bert-toxic-comment-classification")
model = AutoModelForSequenceClassification.from_pretrained("JungleLee/bert-toxic-comment-classification")

In [None]:
# Use a pipeline as a high-level helper
from transformers import pipeline

classifier = pipeline("text-classification", model="JungleLee/bert-toxic-comment-classification")


In [None]:
def toxic_recognize(input):
  check = classifier(input)
  return check

In [None]:
def comment_verify():
  comment = input("Input your comment: ")
  result = toxic_recognize(comment)
  return result

In [None]:
comment_verify()