In [15]:
import pandas as pd
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import spacy
import numpy as np
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Activation
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import Conv1D, GlobalMaxPooling1D, MaxPooling1D
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
from tensorflow.keras.callbacks import EarlyStopping
tf.random.set_seed(123)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/martynasgiedraitis/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [16]:
df=pd.read_csv('labeled_data.csv')
df = df[df['class'] != 1]
df.head()
len(df)
df['class'] = df['class'].replace(2, 1)
df['class'].unique()
df.head()
#0 - hate speech, 1 - neither

splits = {'train': 'data/train-00000-of-00001.parquet'}
test_data = pd.read_parquet("hf://datasets/abdulrub/hate_speech_dataset/" + splits["train"])
test_data.columns=["tweet","class"]
test_data["class"] = 1 - test_data["class"]


In [None]:
def preprocess_text(text):
    all_stopwords = stopwords.words('english')
    all_stopwords.remove('not') #ar reikia?
    svarus_tekstas=[]
    for tweetas in text:
        tweet=str(tweetas).lower()
        tweet=re.sub(r"@[A-Za-z0-9_]+", " ", tweetas)
        tweet=re.sub('RT', ' ', tweet)
        tweet=re.sub(r"https?://[A-Za-z0-9./]+", " ", tweet)
        tweet= re.sub(r"https?", " ", tweet)
        tweet=re.sub(r"[^a-zA-Z]", " ", tweet) #viskas kas ne raide
        tweet=tweet.split()
        ps=PorterStemmer()
        tweet=[ps.stem(word) for word in tweet if not word in set(all_stopwords) if len(word)>2]
        tweet=' '.join(tweet)
        svarus_tekstas.append(tweet)
    return svarus_tekstas

df = df[['class', 'tweet']]
df.dropna(inplace=True)
df['tweet']=preprocess_text(df['tweet'])
df.head()
df = df.sample(frac=1,random_state=123).reset_index(drop=True)
print(df['class'].value_counts())

test_data.dropna(inplace=True)
test_data["tweet"]=preprocess_text(test_data["tweet"])
test_data= test_data.sample(frac=1,random_state=123).reset_index(drop=True)
print(test_data['class'].value_counts())


class
1    4163
0    1430
Name: count, dtype: int64
class
0    2000
1    2000
Name: count, dtype: int64


In [18]:
max_feature=20000 #kiek dazniausiu unikaliu zodziu imama
max_text_length=50 #maksimalus zodziu skaicius viename tweet'e
d=300 #kiek komponentu turi zodzio vektorius, typically more dimensions = greater quality encoding, but there will be some limit beyond which you'll get diminishing returns. We typically use 200 or 300.
x_tokenizer = Tokenizer(num_words=max_feature) #tokenizatorius
x_tokenizer.fit_on_texts(df['tweet']) #skaiciuoja kiek kievienas zodis pasikartoja, sukuria indeksa zodziui
X = x_tokenizer.texts_to_sequences(df['tweet']) #kiekvienas tweetas paverciamas i skaiciu seka, rezultatas yra list of list
X = pad_sequences(X, maxlen=max_text_length, padding='post', truncating='post') #paduoda i viena masyva, prideda nulius iki max_text_length
print(X)
# word 2 vec semantiniai vektoriai zodziams, panasus zodziai arti tiesineje vektoriu erdvejee, leidzia suprasti zodziu reiksmes
nlp = spacy.load("en_core_web_md")
index2word = {v: k for k, v in x_tokenizer.word_index.items() if v < max_feature} #sukuriamas zodzio ir jo indekso zodynas
X_embedded = []
for sak in X:
    vektoriai = []
    for zodis in sak:
        if zodis == 0:
            vektoriai.append(np.zeros(d)) #paddingui
            continue
        word=index2word[zodis] #gaunamas zodis is indekso
        vektoriai.append(nlp(word).vector) #gaunamas zodzio vektorius
    X_embedded.append(vektoriai)
X_embedded = np.array(X_embedded)
print(X_embedded)

y = df['class'].values

X_train, X_val, y_train, y_val = train_test_split(
    X_embedded, y, test_size=0.2, random_state=123, stratify=y
)

print("X_train shape:", X_train.shape)
print("X_val shape:", X_val.shape)



#test aibe:
X_test = x_tokenizer.texts_to_sequences(test_data['tweet'])
X_test = pad_sequences(X_test, maxlen=max_text_length, padding='post', truncating='post')
X_test_embedded = []
for sak in X_test:
    vektoriai = []
    for zodis in sak:
        if zodis == 0:
            vektoriai.append(np.zeros(d))
            continue
        word = index2word.get(zodis, None)
        if word is None:
            vektoriai.append(np.zeros(d))   
        else:
            vektoriai.append(nlp(word).vector)
    X_test_embedded.append(vektoriai)

X_test_embedded = np.array(X_test_embedded)
y_test = test_data['class'].values

[[  13 1638  166 ...    0    0    0]
 [  95  514  929 ...    0    0    0]
 [  29  235  155 ...    0    0    0]
 ...
 [ 918  139 2567 ...    0    0    0]
 [ 452 2432 1497 ...    0    0    0]
 [ 182  334   43 ...    0    0    0]]
[[[-0.97632003 -0.49831    -0.36779001 ...  0.23633    -0.48117
    0.17076001]
  [-0.63909    -0.40753999  0.49182999 ...  0.20452    -0.34534001
    0.14707001]
  [-0.65437001  0.1222      0.087843   ...  0.21882001 -0.063339
   -0.11628   ]
  ...
  [ 0.          0.          0.         ...  0.          0.
    0.        ]
  [ 0.          0.          0.         ...  0.          0.
    0.        ]
  [ 0.          0.          0.         ...  0.          0.
    0.        ]]

 [[-0.72733998  0.018833   -0.14196999 ... -0.083337    0.096124
   -0.027416  ]
  [-0.66346997 -0.43516001 -0.57458001 ...  0.59784001 -0.28667
    0.57213998]
  [-0.76090002 -0.10327     0.13736001 ...  0.03336     0.030155
    0.98853999]
  ...
  [ 0.          0.          0.         ...  0. 

In [19]:
from sklearn.utils import class_weight
svoriai_modeliams= class_weight.compute_class_weight(
    class_weight='balanced',
    classes=np.unique(df['class']),
    y=df['class']
)
svoriai_modeliams = dict(enumerate(svoriai_modeliams))
class_weights = class_weight.compute_class_weight(
    class_weight='balanced',
    classes=np.unique(y_train),
    y=y_train
)
class_weights_dict = dict(enumerate(class_weights))
print(class_weights_dict) 


{0: 1.9554195804195804, 1: 0.6717717717717717}


In [20]:
model=Sequential()
model.add(Dropout(0.2, input_shape=(max_text_length, d)))
model.add(Conv1D(64,2,padding='valid',activation='relu'))
model.add(MaxPooling1D()) #sumazina zodziu skaiciu per puse
model.add(Conv1D(64,2,padding='valid',activation='relu'))
model.add(MaxPooling1D())

model.add(Conv1D(32,2,padding='valid',activation='relu'))
model.add(MaxPooling1D())
model.add(Conv1D(32,2,padding='valid',activation='relu'))
model.add(GlobalMaxPooling1D())

model.add(Dense(16, activation='relu'))
model.add(Dense(16, activation='relu'))
model.add(Dropout(0.2))

model.add(Dense(2, activation='softmax'))
model.summary()

model.compile(loss='sparse_categorical_crossentropy',optimizer='adam',metrics=['accuracy'])

history = model.fit(X_train,y_train, class_weight=class_weights_dict, validation_data=(X_val, y_val), 
                    epochs=100, callbacks=EarlyStopping(monitor='val_loss', patience=6,restore_best_weights=True), batch_size=64)

test_loss, test_acc = model.evaluate(X_test_embedded, y_test)
print("Test accuracy:", test_acc)
label_names = {0: "hate speech", 1: "neutral"}
y_pred_probs = model.predict(X_test_embedded)
y_pred = np.argmax(y_pred_probs, axis=1)
for i in range(100):  
    print(f"Sakinys: {test_data['tweet'].iloc[i]}")
    print(f"Prognozuota klasė: {label_names[y_pred[i]]}")
    print(f"Tikroji klasė:     {label_names[y_test[i]]}")
    print("-" * 60)

  super().__init__(**kwargs)


Epoch 1/100
[1m70/70[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 24ms/step - accuracy: 0.7698 - loss: 0.4851 - val_accuracy: 0.8803 - val_loss: 0.3265
Epoch 2/100
[1m70/70[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 18ms/step - accuracy: 0.8813 - loss: 0.3151 - val_accuracy: 0.8883 - val_loss: 0.2953
Epoch 3/100
[1m70/70[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 17ms/step - accuracy: 0.8932 - loss: 0.2741 - val_accuracy: 0.8865 - val_loss: 0.3033
Epoch 4/100
[1m70/70[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 17ms/step - accuracy: 0.9032 - loss: 0.2430 - val_accuracy: 0.8954 - val_loss: 0.2908
Epoch 5/100
[1m70/70[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 17ms/step - accuracy: 0.9099 - loss: 0.2344 - val_accuracy: 0.8901 - val_loss: 0.3023
Epoch 6/100
[1m70/70[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 18ms/step - accuracy: 0.9146 - loss: 0.2190 - val_accuracy: 0.8954 - val_loss: 0.2875
Epoch 7/100
[1m70/70[0m [

In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from imblearn.pipeline import Pipeline as Pipeline
from collections import Counter
from sklearn.svm import LinearSVC

svoriai=dict(zip(df['class'].unique(),svoriai_modeliams.values()))
lr = Pipeline([
    ("tfidf", TfidfVectorizer(analyzer="char", ngram_range=(2,5))),
    ("clf", LogisticRegression(
        solver="liblinear",
        class_weight=svoriai,
        max_iter=2000
    ))
])
lr.fit(df['tweet'], df['class'])
test_data_accuracy = lr.score(test_data['tweet'], test_data['class'])
print("Test data accuracy:", test_data_accuracy)


Test data accuracy: 0.68275


In [22]:
svm_pipeline = Pipeline([
    ("tfidf", TfidfVectorizer(
        analyzer="char",
        ngram_range=(3,5)
    )),
    ("clf", LinearSVC(
        class_weight=svoriai,
    ))
])
svm_pipeline.fit(df['tweet'], df['class'])
test_accuracy = svm_pipeline.score(test_data['tweet'], test_data['class'])
print("Training accuracy:", test_accuracy)


Training accuracy: 0.66925


In [23]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
print("Device:", device)

model_name = "Hate-speech-CNERG/dehatebert-mono-english"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name,num_labels=2 )
model.to(device)
model.eval()


texts = df["tweet"].astype(str).tolist()

label_map = {
    0: 1, 
    1: 0
}


def classify_batch(text_list, batch_size=64):
    predictions = []

    for i in range(0, len(text_list), batch_size):
        batch_texts = text_list[i:i+batch_size]


        enc = tokenizer(
            batch_texts,
            padding=True,
            truncation=True,
            max_length=96,
            return_tensors="pt"
        ).to(device)

        with torch.no_grad():
            logits = model(**enc).logits
            preds = torch.argmax(logits, dim=1).cpu().tolist()


        preds = [label_map.get(p, 1) for p in preds]
        predictions.extend(preds)

    return predictions


test_accuracy = accuracy_score(test_data["class"], classify_batch(test_data["tweet"].astype(str).tolist(), batch_size=64))
print("Test accuracy:", test_accuracy)

Device: mps
Test accuracy: 0.58225
