<a href="https://colab.research.google.com/github/MaeSantos/CCMACLRL_EXERCISES_COM232ML/blob/main/Exercise4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [49]:
import pandas as pd
import numpy as np
import re
import nltk
import string

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, classification_report
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [50]:
# Fetch the NLTK data packages this notebook needs: the 'stopwords' corpus
# read by stopwords.words(), plus the 'wordnet' and 'omw-1.4' packages
# (presumably required by WordNetLemmatizer — verify for the installed NLTK).
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [51]:
# CSV file name of each dataset split, keyed by split name.
splits = dict(
    train='unique_train_dataset.csv',
    validation='unique_validation_dataset.csv',
    test='unique_test_dataset.csv',
)

In [52]:
# Read the three CSV splits from the Hugging Face dataset repository,
# in train -> validation -> test order.
df_train, df_validation, df_test = (
    pd.read_csv("hf://datasets/mapsoriano/2016_2022_hate_speech_filipino/" + splits[split_name])
    for split_name in ("train", "validation", "test")
)

# Report (rows, columns) of each split.
print("Train size:", df_train.shape)
print("Validation size:", df_validation.shape)
print("Test size:", df_test.shape)

Train size: (21773, 2)
Validation size: (2800, 2)
Test size: (2810, 2)


In [53]:
# Preview the first 10 rows of the training split.
df_train.head(10)

Unnamed: 0,text,label
0,Presidential candidate Mar Roxas implies that ...,1
1,Parang may mali na sumunod ang patalastas ng N...,1
2,Bet ko. Pula Ang Kulay Ng Posas,1
3,[USERNAME] kakampink,0
4,Bakit parang tahimik ang mga PINK about Doc Wi...,1
5,"""Ang sinungaling sa umpisa ay sinungaling hang...",1
6,Leni Kiko,0
7,Nahiya si Binay sa Makati kaya dito na lang sa...,1
8,Another reminderHalalan,0
9,[USERNAME] Maybe because VP Leni Sen Kiko and ...,0


In [54]:
# Column dtypes, non-null counts and memory usage of the training split.
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21773 entries, 0 to 21772
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    21773 non-null  object
 1   label   21773 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 340.3+ KB


In [55]:
# Number of missing values in each column of the training split.
df_train.isna().sum()

Unnamed: 0,0
text,0
label,0


In [56]:
# Number of training rows that exactly repeat an earlier row.
df_train.duplicated().sum()

np.int64(0)

In [57]:
# Class balance: how many training rows carry each label value.
df_train['label'].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
1,10994
0,10779


In [58]:
# Remove exact duplicate rows from every split.
# The result is reassigned instead of mutating with inplace=True, which pandas
# discourages; row labels are left as-is (no index reset).
df_train = df_train.drop_duplicates()
df_validation = df_validation.drop_duplicates()
df_test = df_test.drop_duplicates()

In [59]:
# Drop every row that has a missing value in any column, for all three splits.
# The result is reassigned instead of mutating with inplace=True, which pandas
# discourages; row labels are left as-is (no index reset).
df_train = df_train.dropna()
df_validation = df_validation.dropna()
df_test = df_test.dropna()

In [60]:
# English stop words, stored as a set so the per-token membership test in
# preprocess_text is a hash lookup.
# NOTE(review): only the English list is loaded, yet the corpus also contains
# Filipino text, whose function words are therefore kept — confirm this is intended.
stop_words = set(stopwords.words('english'))
# Single WordNet lemmatizer instance reused for every token.
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    """Normalize one raw document into a space-joined string of lemmatized tokens.

    Steps: lowercase, delete every character that is not an ASCII letter
    (a-z) or whitespace, split on whitespace, drop tokens found in the
    module-level ``stop_words`` set, and lemmatize the remaining tokens with
    the module-level ``lemmatizer``.

    Parameters
    ----------
    text : str or None
        Raw text. Any non-string value (``None``, a float ``NaN`` coming from
        pandas, ...) is treated as an empty document instead of raising
        ``AttributeError`` on ``.lower()``.

    Returns
    -------
    str
        Cleaned tokens joined by single spaces; ``""`` when nothing remains.
    """
    if not isinstance(text, str):
        return ""

    # Characters are deleted, not replaced by a space: "[USERNAME]" becomes
    # "username", "a-b" becomes "ab", and non-ASCII letters are removed.
    # NOTE(review): punctuation is stripped before the stop-word check, so a
    # contraction such as "don't" is looked up as "dont" — confirm the
    # stop-word list still matches such tokens.
    letters_only = re.sub(r'[^a-z\s]', '', text.lower())

    return " ".join(
        lemmatizer.lemmatize(word)
        for word in letters_only.split()
        if word not in stop_words
    )

# Add a clean_text column to each split by running preprocess_text on every raw text.
df_train['clean_text'] = df_train['text'].map(preprocess_text)
df_validation['clean_text'] = df_validation['text'].map(preprocess_text)
df_test['clean_text'] = df_test['text'].map(preprocess_text)

In [61]:
# Learn the TF-IDF vocabulary and weights from the training text only, then
# project the validation and test text onto that same fitted vectorizer.
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(df_train['clean_text'])
X_val, X_test = (
    vectorizer.transform(frame['clean_text'])
    for frame in (df_validation, df_test)
)

# Target labels of each split, in the same row order as the feature matrices.
y_train, y_val, y_test = (
    frame['label'] for frame in (df_train, df_validation, df_test)
)


In [62]:
# Multinomial Naive Bayes with default hyperparameters, fitted on the
# training TF-IDF features and labels.
nb_model = MultinomialNB()
nb_model.fit(X_train, y_train)

In [63]:
def evaluate_model(model, X, y, dataset_name):
    """Print a confusion matrix and four summary metrics for ``model`` on one dataset.

    Parameters
    ----------
    model
        Fitted estimator exposing ``predict(X)``.
    X
        Feature matrix passed to ``model.predict``.
    y
        True labels, compared against the predictions.
    dataset_name : str
        Label printed as the heading of the report.

    Returns
    -------
    None
        The report is written to standard output only.
    """
    predictions = model.predict(X)

    print(f"{dataset_name} Result")
    print("Confusion Matrix:\n", confusion_matrix(y, predictions))

    # (heading, scorer) pairs, printed in this order; each scorer uses its
    # scikit-learn default arguments.
    metric_table = (
        ("\nAccuracy:", accuracy_score),
        ("Precision:", precision_score),
        ("Recall:", recall_score),
        ("F1 Score:", f1_score),
    )
    for heading, scorer in metric_table:
        print(heading, scorer(y, predictions))

In [64]:
# Report the fitted model's metrics on the validation split.
evaluate_model(nb_model, X_val, y_val, "Validation Set")

Validation Set Result
Confusion Matrix:
 [[1093  292]
 [ 171 1244]]

Accuracy: 0.8346428571428571
Precision: 0.8098958333333334
Recall: 0.8791519434628975
F1 Score: 0.8431040325313454


In [65]:
# Report the fitted model's metrics on the test split.
evaluate_model(nb_model, X_test, y_test, "Test Set")

Test Set Result
Confusion Matrix:
 [[1113  299]
 [ 163 1235]]

Accuracy: 0.8355871886120997
Precision: 0.8050847457627118
Recall: 0.8834048640915594
F1 Score: 0.8424283765347885


In [66]:
def predict_sentence(sentence):
    """Classify a single raw sentence with the fitted vectorizer and model.

    The sentence goes through ``preprocess_text``, is transformed by the
    module-level ``vectorizer``, and is scored by the module-level ``nb_model``.

    Parameters
    ----------
    sentence : str
        Raw, uncleaned text.

    Returns
    -------
    str
        ``"Hate Speech"`` when the predicted label equals 1, otherwise
        ``"Non-Hate Speech"``.
    """
    features = vectorizer.transform([preprocess_text(sentence)])
    predicted_class = nb_model.predict(features)[0]
    if predicted_class == 1:
        return "Hate Speech"
    return "Non-Hate Speech"

In [67]:
# Preview the training split again, now with the clean_text column added by preprocessing.
df_train.head(10)

Unnamed: 0,text,label,clean_text
0,Presidential candidate Mar Roxas implies that ...,1,presidential candidate mar roxas implies govt ...
1,Parang may mali na sumunod ang patalastas ng N...,1,parang may mali na sumunod ang patalastas ng n...
2,Bet ko. Pula Ang Kulay Ng Posas,1,bet ko pula ang kulay ng posas
3,[USERNAME] kakampink,0,username kakampink
4,Bakit parang tahimik ang mga PINK about Doc Wi...,1,bakit parang tahimik ang mga pink doc willie o...
5,"""Ang sinungaling sa umpisa ay sinungaling hang...",1,ang sinungaling sa umpisa ay sinungaling hangg...
6,Leni Kiko,0,leni kiko
7,Nahiya si Binay sa Makati kaya dito na lang sa...,1,nahiya si binay sa makati kaya dito na lang sa...
8,Another reminderHalalan,0,another reminderhalalan
9,[USERNAME] Maybe because VP Leni Sen Kiko and ...,0,username maybe vp leni sen kiko whole team buo...


In [68]:
# Smoke test: classify one hand-picked word with predict_sentence and print the result.
test_text = "Panget"
print("\nResult:")
print(test_text, "->", predict_sentence(test_text))


Result:
Panget -> Hate Speech


In [69]:
# Pair every test text with the model's predicted label (same row order as X_test).
submission_df = df_test[['text']].assign(label=nb_model.predict(X_test))

# Write the predictions without the index column.
submission_df.to_csv('submission.csv', index=False)

print("\nSubmission file 'submission.csv' has been created.")
submission_df.head()


Submission file 'submission.csv' has been created.


Unnamed: 0,text,label
0,Binay: Patuloy ang kahirapan dahil sa maling p...,1
1,SA GOBYERNONG TAPAT WELCOME SA BAGUO ANG LAHAT...,0
2,wait so ur telling me Let Leni Lead mo pero NY...,1
3,[USERNAME]wish this is just a nightmare that ...,0
4,doc willie ong and isko sabunutan po,0
