In [2]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import ComplementNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import CategoricalNB
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd

In [122]:
df = pd.read_csv('data/hatespeech_dataset.csv')

In [123]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53745 entries, 0 to 53744
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   free_text  53743 non-null  object
 1   label_id   53745 non-null  int64 
 2   dataset    53745 non-null  object
dtypes: int64(1), object(2)
memory usage: 1.2+ MB


In [124]:
null = df[df['free_text'].isnull()]

In [125]:
null

Unnamed: 0,free_text,label_id,dataset
10950,,1,vihsd
20880,,0,vihsd


In [126]:
# Drop every row that has a missing value in any column. Per the df.info() output above,
# only `free_text` contains nulls (2 rows), so this removes exactly those rows.
df = df.dropna()

In [127]:
clean = df[df['label_id']==0]


In [128]:
offensive = df[df['label_id']==1]


In [129]:
hate = df[df['label_id']==2]


In [130]:
len(clean)

46237

In [131]:
clean.head()

Unnamed: 0,free_text,label_id,dataset
0,Em ƒë∆∞·ª£c l√†m fan c·ª©ng lu√¥n r·ªìi n√® ‚ù§Ô∏è reaction q...,0,vihsd
2,ƒê·∫≠u VƒÉn C∆∞·ªùng gi·ªù gi·ªëng th·∫±ng sida h∆°n √†,0,vihsd
4,T·ª´ l√Ω thuy·∫øt ƒë·∫øn th·ª±c h√†nh l√† c·∫£ 1 c√¢u chuy·ªán ...,0,vihsd
5,Coronavirus is manmade,0,vihsd
6,ƒê·ªë ch√∫ng m nh·∫≠n ra ai,0,vihsd


In [132]:
len(offensive)

3283

In [133]:
offensive.head()

Unnamed: 0,free_text,label_id,dataset
7,L√∫p l√∫p nh∆∞ ch√≥ .,1,vihsd
18,D·∫°y b∆°i cho c√°. B∆°i th√¨ ƒë∆∞∆°ng nhi√™n n√≥ b∆°i ƒë∆∞·ª£...,1,vihsd
38,√ù th·ª©c c√≤n √≠t h∆°n c·∫£ s·ªë ti·ªÅn trong t√∫i t,1,vihsd
47,xxx video üîû,1,vihsd
58,ƒê·∫•u kh·∫©u - Chim l·ª£n üëç,1,vihsd


In [134]:
len(hate)

4223

In [135]:
hate.head()

Unnamed: 0,free_text,label_id,dataset
1,ƒê√∫ng l√† b·ªçn m·∫Øt h√≠p l√≤ xo th·ª•t :))) b√™n vi·ªát n...,2,vihsd
3,C√îN ƒê·ªí C·ª§C S√öC V√î NH√ÇN T√çNH ƒê·ªÄ NGHI VN. NH√Ä N∆Ø...,2,vihsd
9,"Th·∫ø m√† m√¨nh n√≥i m·∫•y th·∫±ng b·∫Øc k√¨, b·ªçn ƒë√≥ l·∫°i b...",2,vihsd
16,Lo·∫°i n√†y cho d·ª±a c·ªôt th√¥i ch·ª© ƒë·ªô th·∫ø n√†o,2,vihsd
19,·ª¶a ch·ª© b·ªØa xin l·ªói kh√≥c t·∫ø om x√≤m m√† gi·ªù ch·ª≠i ...,2,vihsd


In [136]:
df.iat[2,0]

'ƒê·∫≠u VƒÉn C∆∞·ªùng gi·ªù gi·ªëng th·∫±ng sida h∆°n √†'

In [137]:
def lower_text(text:str):
    """Return a lowercase copy of ``text``."""
    lowered = text.lower()
    return lowered

In [138]:
import re

In [139]:
# Code points treated as emoji / pictographic symbols.
#
# The previous pattern ended with one catch-all range, U+24C2..U+1F251. That range
# also spans CJK ideographs, kana, Hangul, fullwidth forms, box-drawing characters
# and many other scripts, so ordinary non-emoji text was silently deleted. The
# ranges below keep the symbol blocks that actually contain emoji and leave
# letters of every script untouched.
#
# Built once when the cell runs instead of on every call.
_EMOJI_PATTERN = re.compile(
    "["
    "\u24C2"                  # circled M
    "\u25A0-\u25FF"           # geometric shapes
    "\u2600-\u26FF"           # miscellaneous symbols
    "\u2700-\u27BF"           # dingbats (includes the heart U+2764)
    "\u2934\u2935"            # curved arrows
    "\u2B00-\u2BFF"           # miscellaneous symbols and arrows (stars, squares)
    "\u3030\u303D\u3297\u3299"  # emoji that live in the CJK symbol blocks
    "\uFE00-\uFE0F"           # variation selectors (emoji presentation marker)
    "\U0001F000-\U0001F251"   # mahjong/cards, enclosed alphanumerics, flags
    "\U0001F300-\U0001F64F"   # symbols & pictographs, emoticons
    "\U0001F680-\U0001FAFF"   # transport, alchemical, geometric ext., arrows-C,
                              # supplemental symbols, chess, pictographs ext.-A
    "]+",
    flags=re.UNICODE,
)


def remove_emoji(text):
    """Return ``text`` with emoji and pictographic symbols removed.

    Each run of matching characters is deleted outright (replaced with the empty
    string), so whitespace that surrounded an emoji is left as-is.

    Args:
        text: The string to clean.

    Returns:
        A new string without the characters matched by ``_EMOJI_PATTERN``.
        Letters of any script (Latin with Vietnamese diacritics, CJK, Hangul, ...)
        are preserved.
    """
    return _EMOJI_PATTERN.sub('', text)

In [140]:
def preprocess(text:str):
    """Normalise one comment: lowercase it, remove emoji, then trim outer whitespace.

    Whitespace inside the string is not collapsed, so removing an emoji that sat
    between two spaces leaves both spaces in place.
    """
    without_emoji = remove_emoji(lower_text(text))
    return without_emoji.strip()

In [141]:
preprocess('m fan ng n√® ‚ù§Ô∏è reaction')

'm fan ng n√®  reaction'

In [142]:
# Clean every comment in place; `preprocess` is passed directly, no wrapper needed.
df['free_text'] = df['free_text'].apply(preprocess)

In [145]:
# Persist the preprocessed frame.
# NOTE(review): no `index=False` here, so the row index is written as an unnamed first
# column, unlike the train/test/val exports further down, which pass index=False.
# Confirm which form downstream readers expect before changing it.
df.to_csv('data/clean_hatespeech.csv')

In [146]:
# Re-derive the per-class views from the preprocessed frame
# (label_id 0 -> clean, 1 -> offensive, 2 -> hate).
clean, offensive, hate = (
    df[df['label_id'] == label] for label in (0, 1, 2)
)


In [147]:
# Shuffle all rows with a fixed seed.
# NOTE(review): this rebinds `df` to its own shuffled copy, so re-running the cell
# without re-running the cells above permutes already-shuffled rows and changes the
# split below. It also looks redundant, since train_test_split shuffles by default.
# Confirm before removing.
df = df.sample(frac=1, random_state=42)

In [148]:
# 60% training / 40% hold-out; the hold-out is halved into test and validation below.
# NOTE(review): the split is not stratified although the classes are heavily imbalanced
# (46237 / 3283 / 4223 rows for labels 0 / 1 / 2 per the counts above). Consider
# `stratify=df['label_id']`; doing so changes every downstream count and metric.
train_df, _test_df = train_test_split(df, test_size=0.4, random_state=42)


In [149]:
len(train_df)

32245

In [150]:
len(_test_df)

21498

In [151]:
# Halve the hold-out set into equally sized test and validation frames.
test_df, val_df = train_test_split(_test_df, test_size=0.5, random_state=42)


In [152]:
len(test_df)

10749

In [153]:
len(val_df)

10749

In [154]:
# Per-class views of the training split (label_id 0 -> clean, 1 -> offensive, 2 -> hate).
train_clean, train_offensive, train_hate = (
    train_df[train_df['label_id'] == label] for label in (0, 1, 2)
)


In [155]:
len(train_clean)

27708

In [156]:
len(train_offensive)

1969

In [157]:
len(train_hate)

2568

In [211]:
from sklearn.feature_extraction.text import TfidfTransformer


In [228]:
# Bag-of-words and TF-IDF stages.
# NOTE(review): neither object is referenced again in the visible cells; the model
# below is fit with a separately created `count_vect`. Confirm whether a TF-IDF
# step was intended or whether this cell is a leftover.
count_vectorizer = CountVectorizer()
tfidf_transformer = TfidfTransformer()



In [229]:
train_df

Unnamed: 0,free_text,label_id,dataset
11453,@linda bui l√™n m·ªü clip c√¥ minh hi·∫øu n·∫•u v√†ng ƒë...,0,vihsd
11299,"no no, ch·ªâ c√≥ b·∫Øc k√¨ th√¥i nh√©, nam k k√¨",0,vihsd
48561,"""h√†ng order h·ªôp h·ªôp c√≤n h·ªôp l√™_th·∫£o""",0,vlsp
6302,h∆∞∆°ng him c√πng c·∫£nh ng·ªô ph√πng kh·∫Øc ho√†ng linh,0,vihsd
43423,"""l·ªõp ti·∫øng h√†n c√≤n nh·∫≠n h·ªçc_sinh kh√¥ng""",0,vlsp
...,...,...,...
1544,t·∫°i ch·ªã nhung dung dƒÉng dung d·∫ª m√† m√¨nh ph·∫£i ƒë...,0,vihsd
28563,c·∫£m ƒë·ªông r·ªõt n∆∞·ªõc m·∫Øt:v,0,vihsd
53342,"""vi·∫øt v·ªÅ nha trang n√≥i nhi·ªÅu v·ªÅ phan_thi·∫øt""",0,vlsp
40095,"""th·ª≠ l·∫ßn b·∫≠t ng·ª≠a""",0,vlsp


In [233]:
X_train = train_df.free_text

In [234]:
X_train.shape

(32245,)

In [235]:
y_train = train_df.label_id

In [236]:
y_train.values.shape

(32245,)

In [238]:
X_test = test_df.free_text

In [239]:
y_test = test_df.label_id

In [240]:
# Learn the vocabulary from the training text only and encode it as a document-term
# count matrix. The same fitted vectorizer is reused to transform the test split
# and single strings later on.
count_vect = CountVectorizer()
X_counts = count_vect.fit_transform(X_train)

In [241]:
X_counts.shape

(32245, 25303)

In [242]:
# Fit a Multinomial Naive Bayes classifier (default hyperparameters) on the term counts.
nb_clf = MultinomialNB()
nb_clf.fit(X_counts, y_train.values)

In [243]:
X_test_counts = count_vect.transform(X_test)

In [244]:
X_test_counts.shape

(10749, 25303)

In [245]:
y_pred = nb_clf.predict(X_test_counts)

In [246]:
y_pred.shape

(10749,)

In [247]:
y_pred.shape

(10749,)

In [248]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

In [249]:
# Score the Multinomial Naive Bayes predictions on the test split, reported as percentages.
# NOTE(review): the label strings carry a stray ')'; left untouched so the printed
# output stays identical.
poly_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
poly_f1 = f1_score(y_true=y_test, y_pred=y_pred, average='weighted')
print('Mul_Bayes acc): ', format(poly_accuracy*100, '.2f'))
print('Mul_Bayes f1): ', format(poly_f1*100, '.2f'))

Mul_Bayes acc):  88.35
Mul_Bayes f1):  86.64


In [201]:
train_df.to_csv('train_hatespeech.csv', index=False)

In [202]:
test_df.to_csv('test_hatespeech.csv', index=False)

In [203]:
val_df.to_csv('val_hatespeech.csv', index=False)


In [250]:
import joblib

In [251]:
# Persist the fitted classifier. The vectorizer is dumped separately further down
# ('count_vectorizer.joblib'); both files are needed to score new text.
model_file = 'bayes.model'
joblib.dump(nb_clf, model_file)

['bayes.model']

In [252]:
loaded_model = joblib.load(model_file)

In [253]:
text = 'ƒë·ªãt c√°i l·ªìn m·∫π m√†y'

In [254]:
text_vectorized = count_vect.transform([text])

In [255]:
loaded_model.predict(text_vectorized)

array([2])

In [256]:
# Persist the fitted CountVectorizer so new text can be encoded with the vocabulary
# learned from the training split.
vectorizer_filename = "count_vectorizer.joblib"
joblib.dump(count_vect, vectorizer_filename)

['count_vectorizer.joblib']

In [257]:
train_hate.head(10)

Unnamed: 0,free_text,label_id,dataset
22499,con me gia mat day vo van hoa,2,vihsd
49,anh ba l√°o nh√°o l√† t√∫ xi√™n gi·ªù,2,vihsd
8921,may tr∆∞·ªõc ƒëay ninh no chui li√™n oanh m...,2,vihsd
10567,gi·ªù m·ªõi nghe ....tr·ªùi ƒë·∫•t ∆°i c√°i mi·ªáng n√≥ d∆° b...,2,vihsd
14092,th·∫±ng trung c√≥ con *** to l·∫Øm,2,vihsd
7177,@tiktok official h√πa theo l√† ch√≥ v·∫≠y m√†y c≈©ng ...,2,vihsd
10510,"h·ªçc ·ªü tr∆∞·ªùng th·∫ßy, c√¥ gi√°o d·∫°y hay l·∫Øm y√™u t·ªï ...",2,vihsd
26398,"t·ª´ ng√†y chung l√™n thay nhanh , xong chung cho ...",2,vihsd
5180,khung qua linh oi t√¥i sau ma khung nua,2,vihsd
2770,b√°o ch√≠ g√¨ m√† n√≥i s·∫°o l·ªô li·ªÖu. n·∫£n,2,vihsd
