In [1]:
import joblib
import numpy as np
import pandas as pd
import re
import string

from imblearn.over_sampling import SMOTE
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

In [2]:
# Load the tab-separated corpus; later cells read its 'text' and 'label' columns.
data = pd.read_csv("sinkaf/data/troff-v1.0.tsv",  sep='\t')

# Translation table that deletes every character in string.punctuation
# (ASCII punctuation only); used by remove_punctuation below.
table = str.maketrans('', '', string.punctuation)

def remove_user_names(s):
    """Strip @-mentions from a string.

    Every '@' followed by one or more non-whitespace characters is removed.
    Surrounding whitespace is left untouched, and a lone '@' is kept.

    Args:
        s: Input text.

    Returns:
        The text with all @-mentions deleted.
    """
    # Raw string: '\s' inside a normal literal is an invalid escape sequence
    # and triggers a warning on current Python versions.
    return re.sub(r'@[^\s]+', '', s)

def remove_numbers(s):
    """Return ``s`` with every ASCII digit (0-9) deleted."""
    digit_pattern = re.compile('[0-9]')
    return digit_pattern.sub('', s)

def remove_punctuation(s):
    """Delete punctuation from each whitespace-separated word of ``s``.

    Words are translated through the module-level ``table`` and re-joined
    with single spaces, so runs of whitespace collapse to one space.
    """
    cleaned_words = (word.translate(table) for word in s.split())
    return " ".join(cleaned_words)

def n_stemmer(s, n):
    """Crude stemmer: keep only the first ``n`` characters of every word.

    Args:
        s: Input text; split on whitespace.
        n: Number of leading characters to keep per word; must be > 0.

    Returns:
        The truncated words joined with single spaces.

    Raises:
        Exception: If ``n`` is not greater than zero.
    """
    if not (n > 0):
        raise Exception("n must be a positive integer!")
    return " ".join(word[:n] for word in s.split())

def pre_process(s, n=5):
    """Normalise raw text for vectorisation.

    Steps, in order: lowercase, drop @-mentions, drop digits, drop
    punctuation, then truncate every word to its first ``n`` characters.
    """
    text = s.lower()
    text = remove_user_names(text)
    text = remove_numbers(text)
    text = remove_punctuation(text)
    return n_stemmer(text, n)

In [3]:
# Normalise every comment and truncate its words to 5-character stems.
data['text'] = data['text'].apply(pre_process, n=5)

In [4]:
# Binary training target: True for every row whose 'label' is anything other than 'non'.
data['label_sinkaf'] = data['label'].ne('non')

In [5]:
# Class balance: ~6.8K offensive comments
# versus ~28K non-offensive comments.
data['label_sinkaf'].value_counts()

False    28439
True      6845
Name: label_sinkaf, dtype: int64

In [6]:
# Turkish stop-word list: read as a headerless CSV from the trstop repository
# and keep its first column as a NumPy array.
stop_words_tr = pd.read_csv(
    "https://raw.githubusercontent.com/ahmetax/trstop/master/dosyalar/turkce-stop-words",
    header=None,
)[0].to_numpy()

In [7]:
# Undersample the non-offensive (majority) class down to the size of the
# offensive class so that both classes are equally represented.
X_true = data.loc[data['label_sinkaf'], "text"]
# Sample size is derived from the data instead of being hard-coded, and the
# sample is seeded so that Restart & Run All reproduces the same subset.
X_false = data.loc[~data['label_sinkaf'], "text"].sample(n=len(X_true), random_state=42)
X_undersampled = pd.concat([X_false, X_true])
# 1-D target (0.0 = non-offensive, 1.0 = offensive), in the same order as the
# concat above. A 1-D vector avoids sklearn's column-vector conversion warning.
y_undersampled = np.concatenate([np.zeros(len(X_false)), np.ones(len(X_true))])


X_train, X_test, y_train, y_test = train_test_split(
    X_undersampled, y_undersampled, random_state=42
)

# NOTE(review): the stop-word list is not passed through pre_process, while the
# text has been truncated to 5-character stems, so longer stop words can never
# match a token. Confirm whether that is intended.
vectorizer = CountVectorizer(min_df=2, max_df=0.99, stop_words=frozenset(stop_words_tr))

# Fit the vocabulary on the training split only, then reuse it for the test split.
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

# Vocabulary size. vocabulary_ is available on every scikit-learn version,
# unlike get_feature_names(), which has been removed.
print(len(vectorizer.vocabulary_))

8408


In [8]:
# Train a multinomial naive Bayes classifier on the undersampled word counts.
clf = MultinomialNB().fit(X_train, y_train)
clf

MultinomialNB()

In [9]:
# With undersampling: roughly 88% train accuracy and 75% test accuracy.
train_pred = clf.predict(X_train)
test_pred = clf.predict(X_test)

print("Train acc:\t{0:.3f}".format(clf.score(X_train, y_train)))
print("Test acc:\t{0:.3f}".format(clf.score(X_test, y_test)))

# Per-class precision / recall on the training split, then the test split.
print(classification_report(y_train, train_pred))
print(classification_report(y_test, test_pred))

# Rows are true classes, columns are predicted classes (test split).
print(confusion_matrix(y_test, test_pred))

Train acc:	0.874
Test acc:	0.747
              precision    recall  f1-score   support

         0.0       0.87      0.88      0.87      5149
         1.0       0.87      0.87      0.87      5118

    accuracy                           0.87     10267
   macro avg       0.87      0.87      0.87     10267
weighted avg       0.87      0.87      0.87     10267

              precision    recall  f1-score   support

         0.0       0.75      0.74      0.74      1696
         1.0       0.74      0.76      0.75      1727

    accuracy                           0.75      3423
   macro avg       0.75      0.75      0.75      3423
weighted avg       0.75      0.75      0.75      3423

[[1247  449]
 [ 416 1311]]


In [10]:
# Use the whole (imbalanced) data set.
X = data['text']
y = data['label_sinkaf']
# Seeded so that the split, and every score computed from it, is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

vectorizer = CountVectorizer(min_df=2, max_df=0.99, stop_words=frozenset(stop_words_tr))

# Fit the vocabulary on the training split only, then reuse it for the test split.
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

# Vocabulary size. vocabulary_ is available on every scikit-learn version,
# unlike get_feature_names(), which has been removed.
print(len(vectorizer.vocabulary_))

13232


In [11]:
# Oversample the minority class with SMOTE until both classes have the same
# number of training rows (sampling_strategy=1 means a 1:1 ratio).
smote = SMOTE(sampling_strategy=1, random_state=42)
# fit_resample replaces the fit_sample alias, which has been removed from
# imbalanced-learn.
X_train, y_train = smote.fit_resample(X_train, y_train)

y_train.value_counts()

True     21320
False    21320
Name: label_sinkaf, dtype: int64

In [12]:
# Train a multinomial naive Bayes classifier on the SMOTE-balanced word counts.
clf = MultinomialNB().fit(X_train, y_train)
clf

MultinomialNB()

In [13]:
# Oversampling the minority class did not work well: bag-of-words vectors are
# sparse, so SMOTE-style synthetic sample generation gives poor results.
train_pred = clf.predict(X_train)
test_pred = clf.predict(X_test)

print("Train acc:\t{0:.3f}".format(clf.score(X_train, y_train)))
print("Test acc:\t{0:.3f}".format(clf.score(X_test, y_test)))

# Per-class precision / recall on the resampled training split, then the test split.
print(classification_report(y_train, train_pred))
print(classification_report(y_test, test_pred))

# Rows are true classes, columns are predicted classes (test split).
print(confusion_matrix(y_test, test_pred))

Train acc:	0.705
Test acc:	0.807
              precision    recall  f1-score   support

       False       0.65      0.87      0.75     21320
        True       0.81      0.54      0.65     21320

    accuracy                           0.71     42640
   macro avg       0.73      0.71      0.70     42640
weighted avg       0.73      0.71      0.70     42640

              precision    recall  f1-score   support

       False       0.91      0.84      0.88      7119
        True       0.50      0.67      0.57      1702

    accuracy                           0.81      8821
   macro avg       0.71      0.76      0.72      8821
weighted avg       0.83      0.81      0.82      8821

[[5980 1139]
 [ 560 1142]]


In [14]:
# Build the final model's training data:
# undersample the non-offensive class and train on all of the balanced data.
X_true = data.loc[data['label_sinkaf'], "text"]
# Sample size comes from the data rather than a hard-coded count, and the
# sample is seeded so the saved model can be reproduced.
X_false = data.loc[~data['label_sinkaf'], "text"].sample(n=len(X_true), random_state=42)
X_under = pd.concat([X_false, X_true])
# 1-D boolean target built from this cell's own samples (False = non-offensive,
# True = offensive), in the same order as the concat above. It no longer
# depends on a variable left over from an earlier cell.
y_under = np.concatenate([
    np.zeros(len(X_false), dtype=bool),
    np.ones(len(X_true), dtype=bool),
])

vectorizer = CountVectorizer(min_df=2, max_df=0.99, stop_words=frozenset(stop_words_tr))
# No held-out split here: the vocabulary is fitted on the full balanced set.
X_train = vectorizer.fit_transform(X_under)

In [15]:
# Fit the final classifier on the whole balanced set and report how well it
# reproduces its own training labels (there is no held-out data in this cell).
clf = MultinomialNB().fit(X_train, y_under)
train_pred = clf.predict(X_train)

print("Train acc:\t{0:.3f}".format(clf.score(X_train, y_under)))
print(classification_report(y_under, train_pred))

Train acc:	0.866
              precision    recall  f1-score   support

       False       0.87      0.87      0.87      6845
        True       0.87      0.87      0.87      6845

    accuracy                           0.87     13690
   macro avg       0.87      0.87      0.87     13690
weighted avg       0.87      0.87      0.87     13690



In [16]:
# Offensive or not? A few hand-written sentences as a sanity check.
test = [
    "cok iyi",
    "bi git",
    "bi siktir git",
    "bi defol",
    "mukemmel bir insansin",
]

# Apply the same pre-processing as the training text before vectorising.
test_processed = vectorizer.transform(
    [pre_process(sentence, 5) for sentence in test]
)
clf.predict(test_processed)

array([False,  True,  True,  True, False])

In [17]:
# Print floats in fixed-point rather than scientific notation.
np.set_printoptions(suppress=True)

# Probability of clf.classes_[1] for each sample sentence
# (presumably the True / offensive class; confirm against clf.classes_).
clf.predict_proba(test_processed)[:,1]

array([0.2612792 , 0.81272191, 0.99488194, 0.87251235, 0.29236189])

In [18]:
# Persist the fitted vectorizer and classifier to disk with joblib.
# Both are needed at inference time: the vectorizer holds the vocabulary
# that the classifier's feature columns refer to.
joblib.dump(vectorizer, "sinkaf/data/vectorizer.joblib")
joblib.dump(clf, "sinkaf/data/clf.joblib")


['sinkaf/data/clf.joblib']