<a href="https://colab.research.google.com/github/MartaCampagnoli/HateSpeechDetection/blob/main/No%20Output%20Notebooks/Classification_Part1_MultiClass_NoOutput.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#importing libraries
from gensim.models import Word2Vec
from google.colab import files
import io
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn import metrics
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re
import string
from string import punctuation
import xgboost as xgb
import gensim.downloader
import nltk
# Fetch the NLTK resources used below: the English stopword list and the
# Punkt tokenizer data that word_tokenize relies on.
nltk.download('stopwords')
nltk.download('punkt')
# Newer NLTK releases load the tokenizer from 'punkt_tab' instead of the
# pickled 'punkt' package; fetch both so old and new versions work.
nltk.download('punkt_tab')

In [None]:
# Custom stopword list: NLTK's English stopwords minus a few words that are
# deliberately kept (negations and third-person pronouns).
# NOTE: this rebinds `stopwords`, the name imported from nltk.corpus.
exceptions = ["no", "not", "don't", "they", "them"]
stopwords = nltk.corpus.stopwords.words('english')
stop = list(filter(lambda w: w not in exceptions, stopwords))
# English Snowball stemmer used by the preprocessing step
stemmer = nltk.SnowballStemmer("english")

In [None]:
#preprocessing function
def preprocess(text):
    """Tokenize *text*, drop stopwords and stem the remaining tokens.

    Parameters
    ----------
    text : str
        Raw document text. A non-string iterable of strings is concatenated
        first, as the previous implementation did.

    Returns
    -------
    str
        The stemmed, stopword-free tokens joined by single spaces.
    """
    if not isinstance(text, str):
        text = ''.join(text)
    tokens = word_tokenize(text)
    # NLTK's English stopwords are lowercase, so compare case-insensitively;
    # otherwise capitalised stopwords ("The", "And", ...) slip through.
    kept = [word for word in tokens if word.lower() not in stop]
    return ' '.join(stemmer.stem(word) for word in kept)

In [None]:
# Open Colab's file-upload widget; the result is later indexed by filename.
uploaded = files.upload() #sixcat.csv

In [None]:
# Parse the uploaded file's content as CSV. The upload must be stored under
# the exact key 'sixcat.csv', otherwise the lookup fails.
df = pd.read_csv(io.BytesIO(uploaded['sixcat.csv'])) #sixcat

In [None]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split stable
train, test = train_test_split(df, test_size=0.30, random_state=42)

In [None]:
# Features come from the 'text' column, labels from the 'target' column
X_train = train['text']
X_test = test['text']
y_train = train['target']
y_test = test['target']

In [None]:
# Run the preprocessing function over every document in both splits
X_train = X_train.apply(preprocess)
X_test = X_test.apply(preprocess)

In [None]:
#tf idf
# Fit the vectorizer on the training split only, then reuse the fitted
# vectorizer to transform the test split.
tf_idf = TfidfVectorizer()
X_train_tf = tf_idf.fit_transform(X_train)
X_test_tf = tf_idf.transform(X_test)

In [None]:
# Whitespace-tokenised training documents, used as the word2vec corpus
sentences = list(map(str.split, X_train))

## XGBoost TFIDF

In [None]:
# XGBoost with default hyper-parameters on the TF-IDF features
# NOTE(review): recent XGBoost versions expect integer class labels
# 0..n_classes-1 — confirm that 'target' is encoded that way.
model = xgb.XGBClassifier()
model.fit(X_train_tf, y_train)

In [None]:
# Score the held-out split: per-class report, then the confusion matrix
y_pred = model.predict(X_test_tf)
print(classification_report(y_test, y_pred))
ConfusionMatrixDisplay.from_predictions(y_test, y_pred, cmap='summer')

## Logistic Regression TFIDF

In [None]:
# Logistic regression on the TF-IDF features.
# The 'sag' solver shuffles the data while optimising, so seed it to make the
# reported scores reproducible, like the seeded split and SGD models here.
model = LogisticRegression(solver='sag', verbose=1, random_state=42)
model.fit(X_train_tf, y_train)

In [None]:
# Evaluate the model on the held-out TF-IDF features:
# per-class metrics first, then the confusion matrix plot.
y_pred = model.predict(X_test_tf)
print(classification_report(y_test, y_pred))
ConfusionMatrixDisplay.from_predictions(y_test, y_pred, cmap = 'summer')

## SGD TFIDF

In [None]:
#sgd
# SGDClassifier with a fixed seed, trained on the TF-IDF features
sgd = SGDClassifier(random_state=5)
sgd.fit(X_train_tf, y_train)
y_pred = sgd.predict(X_test_tf)

# Per-class metrics and confusion matrix on the held-out split
print(classification_report(y_test, y_pred))
ConfusionMatrixDisplay.from_predictions(y_test, y_pred, cmap = 'summer')

## Word2vec: training the embeddings

In [None]:
# Train 300-dimensional word2vec embeddings on the training documents only
# (context window of 2, words with fewer than 10 occurrences ignored).
w2v_model = Word2Vec(
    sentences,
    vector_size=300,
    window=2,
    min_count=10,
    workers=4,
)

In [None]:
def vectorize(sentence):
    """Embed *sentence* as the mean of its in-vocabulary word vectors.

    Parameters
    ----------
    sentence : str
        Whitespace-separated tokens.

    Returns
    -------
    numpy.ndarray
        1-D vector of length ``w2v_model.wv.vector_size``; all zeros when no
        token of the sentence is in the model's vocabulary.
    """
    wv = w2v_model.wv
    words_vecs = [wv[word] for word in sentence.split() if word in wv]
    if not words_vecs:
        # Size the fallback from the model rather than hard-coding 300, so it
        # stays stackable with real vectors if vector_size is ever changed.
        return np.zeros(wv.vector_size)
    return np.array(words_vecs).mean(axis=0)

In [None]:
# One averaged embedding per document, stacked into a feature array
X_trainw2c = np.array(list(map(vectorize, X_train)))
X_testw2c = np.array(list(map(vectorize, X_test)))

In [None]:
#xgboost
# Default-parameter XGBoost trained on the averaged self-trained word2vec
# features.
model = xgb.XGBClassifier()
model.fit(X_trainw2c, y_train)

In [None]:
# Score the held-out split: per-class report, then the confusion matrix
y_pred = model.predict(X_testw2c)
print(classification_report(y_test, y_pred))
ConfusionMatrixDisplay.from_predictions(y_test, y_pred, cmap='summer')

In [None]:
#sgd
# SGDClassifier with a fixed seed, trained on the averaged self-trained
# word2vec features.
sgd = SGDClassifier(random_state=5)
sgd.fit(X_trainw2c, y_train)
y_pred = sgd.predict(X_testw2c)

# Per-class metrics and confusion matrix on the held-out split
print(classification_report(y_test, y_pred))
ConfusionMatrixDisplay.from_predictions(y_test, y_pred, cmap = 'summer')

## Word2Vec: pretrained embeddings

In [None]:
# Print the names of every pretrained model listed by gensim-data
print(list(gensim.downloader.info()['models']))

In [None]:
# Load two pretrained embedding sets through gensim's downloader: Google News
# word2vec and fastText wiki-news subword vectors (both 300-dimensional,
# going by their model names).
w2vvectors =  gensim.downloader.load('word2vec-google-news-300')
textfast = gensim.downloader.load('fasttext-wiki-news-subwords-300')

In [None]:
def _mean_embedding(sentence, keyed_vectors):
    """Average the vectors of the tokens of *sentence* found in *keyed_vectors*."""
    words_vecs = [keyed_vectors[word] for word in sentence.split()
                  if word in keyed_vectors]
    if not words_vecs:
        # Size the fallback from the embeddings instead of a literal 300 so it
        # always matches the length of the real vectors.
        return np.zeros(keyed_vectors.vector_size)
    return np.array(words_vecs).mean(axis=0)


def vectorizew2c(sentence):
    """Embed *sentence* with the pretrained word2vec vectors (``w2vvectors``).

    Parameters
    ----------
    sentence : str
        Whitespace-separated tokens.

    Returns
    -------
    numpy.ndarray
        Mean of the in-vocabulary word vectors, or a zero vector of the same
        length when no token is in the vocabulary.
    """
    return _mean_embedding(sentence, w2vvectors)


def vectorizetextfast(sentence):
    """Embed *sentence* with the pretrained fastText vectors (``textfast``).

    Parameters
    ----------
    sentence : str
        Whitespace-separated tokens.

    Returns
    -------
    numpy.ndarray
        Mean of the in-vocabulary word vectors, or a zero vector of the same
        length when no token is in the vocabulary.
    """
    return _mean_embedding(sentence, textfast)

In [None]:
# Pretrained word2vec (Google News) embeddings with XGBoost

In [None]:
# One averaged pretrained-word2vec embedding per document
X_trainvec = np.array(list(map(vectorizew2c, X_train)))
X_testvec = np.array(list(map(vectorizew2c, X_test)))

In [None]:
# Default-parameter XGBoost trained on the averaged pretrained word2vec
# features.
model = xgb.XGBClassifier()
model.fit(X_trainvec, y_train)

In [None]:
# Score the held-out split: per-class report, then the confusion matrix
y_pred = model.predict(X_testvec)
print(classification_report(y_test, y_pred))
ConfusionMatrixDisplay.from_predictions(y_test, y_pred, cmap='summer')

In [None]:
# Pretrained fastText (wiki-news subwords) embeddings with XGBoost

In [None]:
# One averaged pretrained-fastText embedding per document
X_traintext = np.array(list(map(vectorizetextfast, X_train)))
X_testtext = np.array(list(map(vectorizetextfast, X_test)))

In [None]:
# XGBoost on the averaged fastText features, then evaluation on the held-out
# split: per-class report followed by the confusion matrix.
model = xgb.XGBClassifier()
model.fit(X_traintext, y_train)
y_pred = model.predict(X_testtext)
print(classification_report(y_test, y_pred))
ConfusionMatrixDisplay.from_predictions(y_test, y_pred, cmap='summer')