In [None]:
import pandas as pd
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import spacy
import numpy as np
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Activation
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import Conv1D, GlobalMaxPooling1D, MaxPooling1D
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
from tensorflow.keras.callbacks import EarlyStopping
tf.random.set_seed(123)
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from imblearn.pipeline import Pipeline as Pipeline
from collections import Counter
from sklearn.svm import LinearSVC
from sklearn.utils import class_weight
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# --- Training data ----------------------------------------------------------
# Rows labelled 1 are dropped and label 2 is renamed to 1, leaving a binary
# problem with labels {0, 1}.
# NOTE(review): presumably 0 = hate speech, 1 = offensive, 2 = neither in the
# source CSV — confirm against the dataset description.
df = pd.read_csv('labeled_data.csv')
# .copy() makes the filtered frame independent of the one that was read, so the
# column assignment below does not raise SettingWithCopyWarning.
df = df[df['class'] != 1].copy()
df['class'] = df['class'].replace(2, 1)

# --- External test data -----------------------------------------------------
splits = {'train': 'data/train-00000-of-00001.parquet'}
test_data = pd.read_parquet("hf://datasets/abdulrub/hate_speech_dataset/" + splits["train"])
# NOTE(review): assumes the parquet file has exactly two columns, text first and
# label second (a different column count raises here) — confirm.
test_data.columns = ["tweet", "class"]
# Flip the 0/1 label so it uses the same encoding as `df`.
# NOTE(review): assumes the source labels are 0/1 with the opposite meaning — confirm.
test_data["class"] = 1 - test_data["class"]


In [None]:
def preprocess_text(text):
    """Clean and normalise an iterable of tweets.

    Every item is converted to ``str`` and then processed in this order:
    @-mentions and the stand-alone retweet marker ``RT`` are blanked out, the
    text is lower-cased, URLs and leftover ``http``/``https`` fragments are
    blanked out, every non-letter character becomes a space, and the remaining
    tokens are filtered (English stopwords and tokens of length <= 2 are
    dropped) and stemmed with the Porter stemmer.

    Parameters
    ----------
    text : iterable
        Tweets to clean (e.g. a pandas Series or a list). Items that are not
        strings are passed through ``str()`` first.

    Returns
    -------
    list of str
        One space-joined string of stemmed tokens per input item, in input order.
    """
    # Built once per call: a set gives O(1) membership tests, and neither the
    # stopword list nor the stemmer depends on the tweet being processed.
    all_stopwords = set(stopwords.words('english'))
    ps = PorterStemmer()

    svarus_tekstas = []
    for tweetas in text:
        tweet = str(tweetas)
        tweet = re.sub(r"@[A-Za-z0-9_]+", " ", tweet)
        # Remove the retweet marker while the text is still case-sensitive;
        # the word boundaries keep words such as "START" intact.
        tweet = re.sub(r"\bRT\b", " ", tweet)
        # Lower-case before stopword filtering: the stopword list is lower-case,
        # so capitalised stopwords would otherwise slip through.
        tweet = tweet.lower()
        tweet = re.sub(r"https?://[A-Za-z0-9./]+", " ", tweet)
        tweet = re.sub(r"https?", " ", tweet)
        tweet = re.sub(r"[^a-zA-Z]", " ", tweet)
        tokens = [
            ps.stem(word)
            for word in tweet.split()
            if word not in all_stopwords and len(word) > 2
        ]
        svarus_tekstas.append(' '.join(tokens))
    return svarus_tekstas

# --- Training frame ---------------------------------------------------------
# Keep only label + text and drop incomplete rows. The non-inplace dropna()
# returns a fresh frame, so the column assignment below does not operate on a
# slice of the previous `df`.
df = df[['class', 'tweet']].dropna()
df['tweet'] = preprocess_text(df['tweet'])
# Shuffle with a fixed seed so the row order is reproducible.
df = df.sample(frac=1, random_state=123).reset_index(drop=True)
print(df['class'].value_counts())

# --- External test frame ----------------------------------------------------
test_data = test_data.dropna()
# Sample first, clean second: the seeded sample picks rows by position, so the
# same rows are selected, but only those rows have to be preprocessed.
# min() keeps the cell working when fewer than 1000 rows are available.
test_data = test_data.sample(n=min(1000, len(test_data)), random_state=123).reset_index(drop=True)
test_data["tweet"] = preprocess_text(test_data["tweet"])
print(test_data['class'].value_counts())


In [None]:
max_feature = 20000      # vocabulary cap handed to the Keras Tokenizer
max_text_length = 50     # every tweet is padded / truncated to this many tokens
d = 300                  # embedding width; must match the spaCy vector length

# The tokenizer is fitted on the training tweets only.
x_tokenizer = Tokenizer(num_words=max_feature)
x_tokenizer.fit_on_texts(df['tweet'])

nlp = spacy.load("en_core_web_md")

# index -> word, restricted to the indices below the num_words cap.
index2word = {v: k for k, v in x_tokenizer.word_index.items() if v < max_feature}

# One row per token index, computed once per vocabulary word instead of once
# per token occurrence. Row 0 stays all-zero and represents padding.
# If the spaCy vectors are not `d` wide, the row assignment raises immediately.
# float32 matches the dtype of the spaCy vectors and halves the memory of the
# embedded arrays compared with float64.
embedding_matrix = np.zeros(
    (min(max_feature, len(x_tokenizer.word_index) + 1), d), dtype=np.float32
)
for idx, word in index2word.items():
    embedding_matrix[idx] = nlp(word).vector


def embed_texts(texts):
    """Turn cleaned tweets into padded index sequences and their embeddings.

    Parameters
    ----------
    texts : iterable of str
        Preprocessed tweets.

    Returns
    -------
    tuple of (numpy.ndarray, numpy.ndarray)
        The padded index array of shape (len(texts), max_text_length) and the
        embedded array of shape (len(texts), max_text_length, d).
    """
    seqs = x_tokenizer.texts_to_sequences(texts)
    seqs = pad_sequences(seqs, maxlen=max_text_length, padding='post', truncating='post')
    # Fancy indexing looks up every token's vector in one vectorised step.
    return seqs, embedding_matrix[seqs]


X, X_embedded = embed_texts(df['tweet'])
# Shapes instead of full array dumps keep the cell output readable.
print("X shape:", X.shape)
print("X_embedded shape:", X_embedded.shape)

y = df['class'].values

X_train, X_val, y_train, y_val = train_test_split(
    X_embedded, y, test_size=0.2, random_state=123, stratify=y
)

print("X_train shape:", X_train.shape)
print("X_val shape:", X_val.shape)

# The external test set goes through exactly the same tokenizer and lookup.
X_test, X_test_embedded = embed_texts(test_data['tweet'])
y_test = test_data['class'].values

In [None]:
def _balanced_weights(labels):
    """Return the 'balanced' class-weight array for `labels`, ordered by np.unique(labels)."""
    return class_weight.compute_class_weight(
        class_weight='balanced',
        classes=np.unique(labels),
        y=labels,
    )


# Weights over the full training frame, keyed by position in the sorted class list.
svoriai_modeliams = {i: w for i, w in enumerate(_balanced_weights(df['class']))}

# Weights over the training split only, in array and dict form.
class_weights = _balanced_weights(y_train)
class_weights_dict = {i: w for i, w in enumerate(class_weights)}
print(class_weights_dict)


In [None]:
# 1-D CNN over the pre-embedded token sequences.
model = Sequential()
# Explicit Input instead of passing `input_shape=` to the first layer.
model.add(tf.keras.Input(shape=(max_text_length, d)))
model.add(Dropout(0.2))

# Two conv/pool stages with 64 filters ...
model.add(Conv1D(64, 2, padding='valid', activation='relu'))
model.add(MaxPooling1D())
model.add(Conv1D(64, 2, padding='valid', activation='relu'))
model.add(MaxPooling1D())

# ... followed by two conv layers with 32 filters and a global max-pool.
model.add(Conv1D(32, 2, padding='valid', activation='relu'))
model.add(MaxPooling1D())
model.add(Conv1D(32, 2, padding='valid', activation='relu'))
model.add(GlobalMaxPooling1D())

model.add(Dense(16, activation='relu'))
model.add(Dense(16, activation='relu'))
model.add(Dropout(0.2))

# Two softmax outputs paired with sparse categorical cross-entropy, so the
# integer labels in y_train / y_val are used directly.
model.add(Dense(2, activation='softmax'))
model.summary()

model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Stop once val_loss has not improved for 6 epochs and roll back to the best weights.
early_stopping = EarlyStopping(monitor='val_loss', patience=6, restore_best_weights=True)

history = model.fit(
    X_train, y_train,
    class_weight=class_weights_dict,
    validation_data=(X_val, y_val),
    epochs=100,
    callbacks=[early_stopping],  # `callbacks` is documented as a list of callbacks
    batch_size=64,
)

test_loss, test_acc = model.evaluate(X_test_embedded, y_test)
print("Test accuracy:", test_acc)

In [None]:


# Predicted class = index of the larger of the two softmax outputs.
y_pred_probs = model.predict(X_test_embedded)
y_pred = np.argmax(y_pred_probs, axis=1)

# Pin the label order so the matrix is always 2x2 and lines up with
# display_labels (0 -> "hate speech", 1 -> "neutral"), even if one class is
# missing from both y_test and y_pred.
cm = confusion_matrix(y_test, y_pred, labels=[0, 1])

disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=["hate speech", "neutral"])
fig, ax = plt.subplots(figsize=(6,6))
disp.plot(cmap='Blues', ax=ax)
ax.set_xlabel("Prognozuota reikšmė")    # "Predicted value"
ax.set_ylabel("Tikroji reikšmė")        # "True value"
ax.set_title("Klasifikavimo matrica")   # "Classification matrix"
plt.show()

In [None]:


# Pair every balanced weight with its class label.
# `svoriai_modeliams` holds the weights in the order of np.unique(df['class'])
# (sorted), so the keys must use that same order. Series.unique() returns
# labels in order of appearance, which after shuffling can attach each weight
# to the wrong class.
svoriai = dict(zip(np.unique(df['class']).tolist(), svoriai_modeliams.values()))

# Character n-gram TF-IDF features feeding a class-weighted logistic regression.
lr = Pipeline([
    ("tfidf", TfidfVectorizer(analyzer="char", ngram_range=(2,5))),
    ("clf", LogisticRegression(
        solver="liblinear",
        class_weight=svoriai,
        max_iter=2000,
        random_state=123,  # fixed seed, matching the seed used elsewhere in the notebook
    ))
])
lr.fit(df['tweet'], df['class'])
test_data_accuracy = lr.score(test_data['tweet'], test_data['class'])
print("Test data accuracy:", test_data_accuracy)


In [None]:
# Character n-gram TF-IDF features feeding a class-weighted linear SVM.
svm_pipeline = Pipeline([
    ("tfidf", TfidfVectorizer(
        analyzer="char",
        ngram_range=(3,5)
    )),
    ("clf", LinearSVC(
        # "balanced" derives the weights from the labels passed to fit(), the
        # same df['class'] the precomputed balanced weights were based on, so
        # this cell does not depend on how the `svoriai` dict was keyed.
        class_weight="balanced",
        random_state=123,  # fixed seed, matching the seed used elsewhere in the notebook
    ))
])
svm_pipeline.fit(df['tweet'], df['class'])
test_accuracy = svm_pipeline.score(test_data['tweet'], test_data['class'])
print("Testing accuracy:", test_accuracy)


In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("\nDevice:", device)

model_name = "bert-base-uncased"

tokenizer = AutoTokenizer.from_pretrained(model_name)
# NOTE(review): from here on `model` refers to the BERT model, not the Keras CNN
# built earlier under the same name.
# NOTE(review): no fine-tuning happens in the visible cells; presumably the
# classification head of this checkpoint is freshly initialised — confirm that
# evaluating it as-is is intended.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2,
)  # closing parenthesis was missing, which made the whole cell a syntax error
model.to(device)
model.eval()  # inference mode

# Plain Python lists of the test texts and their labels.
test_texts = test_data["tweet"].astype(str).tolist()
test_labels = test_data["class"].tolist()


def classify_batch(text_list, batch_size=32, max_length=96):
    """Predict a class index for every text, processing the list in batches.

    Relies on the module-level `tokenizer`, `model` and `device`.

    Parameters
    ----------
    text_list : list of str
        Texts to classify.
    batch_size : int, default 32
        Number of texts tokenised and passed through the model at once.
    max_length : int, default 96
        Token limit passed to the tokenizer; longer inputs are truncated.

    Returns
    -------
    list of int
        Arg-max class index per input text, in input order. Empty for empty input.

    Raises
    ------
    ValueError
        If `batch_size` is smaller than 1.
    """
    if batch_size < 1:
        # A negative step would make range() empty and silently return no predictions.
        raise ValueError("batch_size must be a positive integer")

    preds_all = []
    for start in range(0, len(text_list), batch_size):
        batch = text_list[start:start + batch_size]

        # Pad to the longest text in the batch and move the tensors to `device`.
        enc = tokenizer(
            batch,
            padding=True,
            truncation=True,
            max_length=max_length,
            return_tensors="pt"
        ).to(device)

        # No gradients are needed for inference.
        with torch.no_grad():
            logits = model(**enc).logits

        preds_all.extend(torch.argmax(logits, dim=1).cpu().tolist())

    return preds_all


# Score the BERT predictions against the labels of the sampled test set.
test_preds = classify_batch(test_texts)
test_acc = accuracy_score(y_true=test_labels, y_pred=test_preds)
print("Test Accuracy:", test_acc)