In [1]:
import numpy as np
import pandas as pd
import re
import string

from imblearn.over_sampling import SMOTE
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

In [2]:
# Load the tab-separated corpus into `data`; the cells below read its
# `text` and `label` columns.
data = pd.read_csv("sinkaf/data/troff-v1.0.tsv",  sep='\t')

# Translation table mapping every character in string.punctuation to None,
# so that str.translate(table) deletes those characters.
table = str.maketrans(dict.fromkeys(string.punctuation))

def remove_user_names(s):
    """Strip @-mentions from ``s``.

    Every ``@`` together with the run of non-whitespace characters that
    follows it is deleted. Surrounding whitespace is left untouched, and a
    lone ``@`` with nothing after it is kept.

    Args:
        s: the text to clean.

    Returns:
        ``s`` with all ``@name`` tokens removed.
    """
    # Raw string: in a normal string literal '\s' is an invalid escape
    # sequence, which recent Python versions warn about at compile time.
    return re.sub(r'@[^\s]+', '', s)

def remove_numbers(s):
    """Return ``s`` with every ASCII digit (0-9) deleted."""
    digit_pattern = re.compile('[0-9]')
    return digit_pattern.sub('', s)

def remove_punctuation(s):
    """Delete punctuation from each whitespace-separated token of ``s``.

    The string is split on whitespace, each token is passed through the
    module-level translation ``table``, and the tokens are re-joined with a
    single space. A token made up only of deleted characters becomes an
    empty string but still takes part in the join.
    """
    return " ".join(token.translate(table) for token in s.split())

def n_stemmer(s, n):
    """Crude prefix "stemmer": keep only the first ``n`` characters of each word.

    ``s`` is split on whitespace, every word is truncated to at most ``n``
    characters, and the results are re-joined with single spaces.

    Args:
        s: the text to stem.
        n: number of leading characters to keep per word; must be > 0.

    Returns:
        The stemmed text.

    Raises:
        ValueError: if ``n`` is not greater than zero. ``ValueError`` is a
            subclass of ``Exception``, so existing ``except Exception``
            handlers keep working.
    """
    # Guard clause; `not n > 0` (rather than `n <= 0`) keeps the original
    # accept/reject decision for every input.
    if not n > 0:
        raise ValueError("n must be a positive integer!")
    return " ".join(word[:n] for word in s.split())

def pre_process(s, n=5):
    """Normalise a raw comment for vectorisation.

    Steps, in order: lower-case, remove @-mentions, remove digits, remove
    punctuation, then truncate every word to its first ``n`` characters.
    """
    # NOTE(review): str.lower() is locale-independent, so 'I' becomes 'i'
    # rather than the Turkish dotless 'ı' -- confirm this is intended.
    cleaned = s.lower()
    for step in (remove_user_names, remove_numbers, remove_punctuation):
        cleaned = step(cleaned)
    return n_stemmer(cleaned, n)

# Normalise every comment in place, keeping a 5-character prefix per word.
data['text'] = data['text'].apply(pre_process, args=(5,))

In [3]:
# Create the label vector used to train the model:
# True for every row whose `label` is anything other than 'non', else False.
data['label_sinkaf'] = data['label'] != 'non'

In [4]:
# ~6.8K offensive comments     (label_sinkaf == True)
# ~28K  non-offensive comments (label_sinkaf == False)
data['label_sinkaf'].value_counts()

False    28439
True      6845
Name: label_sinkaf, dtype: int64

In [5]:
# Download the Turkish stop-word list (a headerless single-column file) and
# keep its first column as a NumPy array.
stop_words_tr = (
    pd.read_csv(
        "https://raw.githubusercontent.com/ahmetax/trstop/master/dosyalar/turkce-stop-words",
        header=None,
    )
    .loc[:, 0]
    .to_numpy()
)

In [6]:
# Undersample the non-offensive (majority) class down to the size of the
# offensive (minority) class, then split and vectorise.
SEED = 42  # fixed so that Restart & Run All reproduces the same sample/split

X_true = data[data['label_sinkaf']]["text"]
# Draw exactly as many non-offensive rows as there are offensive ones instead
# of hard-coding the class size.
X_false = data[~data['label_sinkaf']].sample(len(X_true), random_state=SEED)["text"]
X_undersampled = pd.concat([X_false, X_true])
# 1-D labels in the same order as the concat above (0.0 = non-offensive,
# 1.0 = offensive). A (N, 1) column vector makes scikit-learn emit a
# DataConversionWarning when the classifier is fitted.
y_undersampled = np.concatenate([np.zeros(len(X_false)), np.ones(len(X_true))])

X_train, X_test, y_train, y_test = train_test_split(
    X_undersampled, y_undersampled, random_state=SEED
)

# NOTE(review): the stop words are used as downloaded, while the text has been
# truncated to 5-character prefixes by pre_process, so stop words longer than
# 5 characters can never match a token -- confirm whether that is intended.
vectorizer = CountVectorizer(min_df=2, max_df=0.99, stop_words=frozenset(stop_words_tr))

# Fit the vocabulary on the training split only; the test split is just transformed.
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

# Vocabulary size. `vocabulary_` exists on every scikit-learn version, whereas
# `get_feature_names()` has been removed from recent releases.
print(len(vectorizer.vocabulary_))

8379


In [7]:
# Fit a multinomial Naive Bayes classifier on the current X_train / y_train.
# The fit call is the cell's last expression, so the fitted estimator is displayed.
clf = MultinomialNB()
clf.fit(X_train, y_train)

MultinomialNB()

In [8]:
# With undersampling: roughly 88% train accuracy and 75% test accuracy.
train_accuracy = clf.score(X_train, y_train)
test_accuracy = clf.score(X_test, y_test)
print("Train acc:\t{0:.3f}".format(train_accuracy))
print("Test acc:\t{0:.3f}".format(test_accuracy))

# Per-class precision/recall, first for the training split, then the test split.
for split_features, split_labels in ((X_train, y_train), (X_test, y_test)):
    print(classification_report(split_labels, clf.predict(split_features)))

Train acc:	0.876
Test acc:	0.749
              precision    recall  f1-score   support

         0.0       0.88      0.88      0.88      5124
         1.0       0.88      0.88      0.88      5143

    accuracy                           0.88     10267
   macro avg       0.88      0.88      0.88     10267
weighted avg       0.88      0.88      0.88     10267

              precision    recall  f1-score   support

         0.0       0.76      0.74      0.75      1721
         1.0       0.74      0.76      0.75      1702

    accuracy                           0.75      3423
   macro avg       0.75      0.75      0.75      3423
weighted avg       0.75      0.75      0.75      3423



In [9]:
# Use the whole (imbalanced) data set this time.
X = data['text']
y = data['label_sinkaf']
# Seeded so the split -- and every number reported below -- is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

vectorizer = CountVectorizer(min_df=2, max_df=0.99, stop_words=frozenset(stop_words_tr))

# Fit the vocabulary on the training split only; the test split is just transformed.
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

# Vocabulary size. `vocabulary_` exists on every scikit-learn version, whereas
# `get_feature_names()` has been removed from recent releases.
print(len(vectorizer.vocabulary_))

13266


In [10]:
# Synthesise extra minority-class samples until both classes have the same
# size (sampling_strategy=1). Seeded so the generated samples are reproducible.
smote = SMOTE(sampling_strategy=1, random_state=42)
# `fit_sample` was an alias that has been removed from recent imbalanced-learn
# releases; `fit_resample` is the supported method.
X_train, y_train = smote.fit_resample(X_train, y_train)

# Class balance after oversampling.
y_train.value_counts()

True     21342
False    21342
Name: label_sinkaf, dtype: int64

In [11]:
# Fit a fresh multinomial Naive Bayes classifier on the current X_train / y_train.
# The fit call is the cell's last expression, so the fitted estimator is displayed.
clf = MultinomialNB()
clf.fit(X_train, y_train)

MultinomialNB()

In [12]:
# Oversampling the minority class did not work well here.
# Author's observation: bag-of-words vectors are sparse, so SMOTE-style
# synthetic sample generation gives poor results.
train_accuracy = clf.score(X_train, y_train)
test_accuracy = clf.score(X_test, y_test)
print("Train acc:\t{0:.3f}".format(train_accuracy))
print("Test acc:\t{0:.3f}".format(test_accuracy))

# Per-class precision/recall, first for the training split, then the test split.
for split_features, split_labels in ((X_train, y_train), (X_test, y_test)):
    print(classification_report(split_labels, clf.predict(split_features)))

Train acc:	0.713
Test acc:	0.806
              precision    recall  f1-score   support

       False       0.66      0.88      0.75     21342
        True       0.82      0.55      0.66     21342

    accuracy                           0.71     42684
   macro avg       0.74      0.71      0.71     42684
weighted avg       0.74      0.71      0.71     42684

              precision    recall  f1-score   support

       False       0.92      0.84      0.87      7097
        True       0.50      0.69      0.58      1724

    accuracy                           0.81      8821
   macro avg       0.71      0.76      0.73      8821
weighted avg       0.84      0.81      0.82      8821



In [13]:
# Build the final model's training set from the undersampled data
# (no train/test split here: every selected row is used for fitting).
SEED = 42  # fixed so that Restart & Run All selects the same rows

X_true = data[data['label_sinkaf']]["text"]
# Draw exactly as many non-offensive rows as there are offensive ones instead
# of hard-coding the class size.
X_false = data[~data['label_sinkaf']].sample(len(X_true), random_state=SEED)["text"]
X_undersampled = pd.concat([X_false, X_true])
# 1-D boolean labels in the same order as the concat above
# (False = non-offensive, True = offensive). A (N, 1) column vector makes
# scikit-learn emit a DataConversionWarning when the classifier is fitted.
y_undersampled = np.concatenate([
    np.zeros(len(X_false), dtype=bool),
    np.ones(len(X_true), dtype=bool),
])

vectorizer = CountVectorizer(min_df=2, max_df=0.99, stop_words=frozenset(stop_words_tr))
X_train = vectorizer.fit_transform(X_undersampled)

In [14]:
# Fit the final classifier; fit() returns the estimator itself.
clf = MultinomialNB().fit(X_train, y_undersampled)

# Only training-set metrics are available here, because this cell evaluates
# on the same rows the model was fitted on.
train_predictions = clf.predict(X_train)
print("Train acc:\t{0:.3f}".format(clf.score(X_train, y_undersampled)))
print(classification_report(y_undersampled, train_predictions))

Train acc:	0.870
              precision    recall  f1-score   support

       False       0.87      0.87      0.87      6845
        True       0.87      0.87      0.87      6845

    accuracy                           0.87     13690
   macro avg       0.87      0.87      0.87     13690
weighted avg       0.87      0.87      0.87     13690



In [15]:
# Offensive or not? Spot-check the model on a few hand-written sentences.

test = [
    "cok iyi",
    "bi git",
    "bi siktir git",
    "bi defol",
    "mukemmel bir insansin",
]

# Apply the same pre-processing used for the training text before vectorising.
test_processed = vectorizer.transform(
    list(map(lambda sentence: pre_process(sentence, 5), test))
)
clf.predict(test_processed)

array([False,  True,  True,  True, False])

In [16]:
# Print floats in fixed-point form instead of scientific notation.
np.set_printoptions(suppress=True)

# Second column of predict_proba; columns follow clf.classes_, so for the
# False/True labels used to fit clf this is the probability of True (offensive).
clf.predict_proba(test_processed)[:,1]

array([0.25272762, 0.74295274, 0.99612049, 0.81632094, 0.46115515])

In [17]:
import joblib

# Persist the fitted vectorizer and classifier under sinkaf/data/.
# The last dump call is the cell's final expression, so the list of written
# file names it returns is displayed.
joblib.dump(vectorizer, "sinkaf/data/vectorizer.joblib")
joblib.dump(clf, "sinkaf/data/clf.joblib")


['sinkaf/data/clf.joblib']