In [37]:
#: Preprocessing corpus

import re
import string
from sklearn.base import BaseEstimator, TransformerMixin

# punctuation table
table = str.maketrans('', '', string.punctuation)

class Preprocessor(BaseEstimator, TransformerMixin):

    def remove_user_names(self, s):
        return re.sub('@[^\s]+','',s)

    def remove_numbers(self, s):
        return re.sub('[0-9]','',s)

    def remove_punctuation(self, s):
        res = [w.translate(table) for w in s.split()]
        return " ".join(res)

    def n_stemmer(self, s, n):
        if(n>0):
            res = [x[:n] for x in s.split()]
            return " ".join(res)
        raise Exception("n must be a positive integer!")

    def pre_process(self, s, n=10):
        return self.n_stemmer(self.remove_punctuation(self.remove_numbers(self.remove_user_names(s.lower()))), n)

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        return X.apply(lambda x: self.pre_process(x))

    def __init__(self):
        return

In [38]:
#: Try different models
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report

stop_words_tr = pd.read_csv("https://raw.githubusercontent.com/ahmetax/trstop/master/dosyalar/turkce-stop-words", header=None)
stop_words_tr = stop_words_tr[0].to_numpy()

def score_model(X, y, X_test, y_test, estimator, **kwargs):
    """
    Test various estimators.
    """
    y = LabelEncoder().fit_transform(y)

    model = Pipeline([
        ('preprocess', Preprocessor()),
        ('counts', CountVectorizer(min_df=5, max_df=0.99, stop_words = frozenset(stop_words_tr))),
        ('tf_idf', TfidfTransformer()),
        ('estimator', estimator)
    ])

    # Instantiate the classification model and visualizer
    model.fit(X, y, **kwargs)

    expected  = y
    predicted = model.predict(X)

    expected_test = LabelEncoder().fit_transform(y_test)
    predicted_test = model.predict(X_test)

    # Compute and return F1 (harmonic mean of precision and recall)
    print("\n- {}:\n\nTrain:\n{}\nTest:\n{}".format(estimator.__class__.__name__, \
        classification_report(expected, predicted), classification_report(expected_test, predicted_test)))

In [39]:
#: load data
data = pd.read_csv("sinkaf/data/troff-v1.0.tsv",  sep='\t')
data.head()

Unnamed: 0,id,timestamp,text,label
0,973568937593065472,1520952977415,@USER06095 Hırsız demişken Tuncay sizin şu 1.2...,grp
1,973568937723035648,1520952977446,Ne bileyim sen hastayım deyince bende veterine...,ind
2,973568937911873536,1520952977491,Akşam eve gittiğimizde yorgunluğuma iyi gelece...,grp
3,973568939925090304,1520952977971,Kook’un sesini 18381 kez dinledikten sonra eğe...,prof
4,973568940667539457,1520952978148,@USER05270 @USER04816 o macta adam 6 7 tane ne...,grp


In [40]:
#: create labels
data['label_sinkaf'] = data['label'] != 'non'

data['label_sinkaf'].value_counts()
# 6.8K offensive - kufurlu yorum
# 28K  non-offensive - kufursuz yorum

False    28439
True      6845
Name: label_sinkaf, dtype: int64

In [35]:
#: split the data & find best model
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(data['text'], data['label_sinkaf'])
 
from sklearn.svm import LinearSVC, NuSVC, SVC
from sklearn.metrics import f1_score, classification_report
from sklearn.pipeline import Pipeline
from sklearn.ensemble import BaggingClassifier, ExtraTreesClassifier, RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression, SGDClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer


models = [
    SVC(gamma='auto'), LinearSVC(),
    SGDClassifier(max_iter=100, tol=1e-3), KNeighborsClassifier(),
    LogisticRegression(solver='lbfgs'), LogisticRegressionCV(cv=3),
    BaggingClassifier(), ExtraTreesClassifier(n_estimators=300),
    RandomForestClassifier(n_estimators=300), MultinomialNB()
]

for model in models:
    score_model(X_train, y_train, X_test, y_test, model)

SVC:
	 Train:
              precision    recall  f1-score   support

           0       0.80      1.00      0.89     21294
           1       0.00      0.00      0.00      5169

    accuracy                           0.80     26463
   macro avg       0.40      0.50      0.45     26463
weighted avg       0.65      0.80      0.72     26463
, 
	Test:
              precision    recall  f1-score   support

           0       0.81      1.00      0.90      7145
           1       0.00      0.00      0.00      1676

    accuracy                           0.81      8821
   macro avg       0.40      0.50      0.45      8821
weighted avg       0.66      0.81      0.72      8821

LinearSVC:
	 Train:
              precision    recall  f1-score   support

           0       0.92      0.99      0.96     21294
           1       0.94      0.66      0.78      5169

    accuracy                           0.93     26463
   macro avg       0.93      0.83      0.87     26463
weighted avg       0.93      0.

KeyboardInterrupt: 

In [78]:
# create vectorizer (min 5 words, turkish stop words removed)
# vectorizer = CountVectorizer(min_df=5, max_df=0.99, stop_words=frozenset(stop_words_tr))
vectorizer = TfidfVectorizer(min_df=5, max_df=0.99, stop_words=frozenset(stop_words_tr))

# UMAP embeddings to see the data in 2D
# Data'yi iki boyutta gormek icin UMAP
X = vectorizer.fit_transform(data['text'])
y = data['label_sinkaf']

embedding = umap.UMAP(n_components=2, metric='hellinger').fit(X)


# For interactive plotting use
f = umapplot.interactive(embedding, labels=y, hover_data=hover_df, point_size=3)

show(f)

In [80]:
clf = MultinomialNB()
clf.fit(X_train, y_train)

MultinomialNB()

In [81]:
#undersampling ile %88 train, %75 test basarisi
print("Train acc:\t{0:.3f}".format(clf.score(X_train, y_train)))
print("Test acc:\t{0:.3f}".format(clf.score(X_test, y_test)))

print(classification_report(y_train, clf.predict(X_train)))
print(classification_report(y_test, clf.predict(X_test)))

Train acc:	0.930
Test acc:	0.849
              precision    recall  f1-score   support

       False       0.94      0.97      0.96     21322
        True       0.87      0.75      0.81      5141

    accuracy                           0.93     26463
   macro avg       0.91      0.86      0.88     26463
weighted avg       0.93      0.93      0.93     26463

              precision    recall  f1-score   support

       False       0.88      0.94      0.91      7117
        True       0.66      0.45      0.53      1704

    accuracy                           0.85      8821
   macro avg       0.77      0.70      0.72      8821
weighted avg       0.84      0.85      0.84      8821



In [9]:
# Tum veriyi kullanma
X = data['text']
y = data['label_sinkaf']
X_train, X_test, y_train, y_test = train_test_split(X,y)

vectorizer = CountVectorizer(min_df=2, max_df=0.99, stop_words=frozenset(stop_words_tr))

X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

print(len(vectorizer.get_feature_names()))

13266


In [10]:
# az olan degerlerden veri uretimi
smote = SMOTE(sampling_strategy=1)
X_train, y_train = smote.fit_sample(X_train, y_train)

y_train.value_counts()

True     21342
False    21342
Name: label_sinkaf, dtype: int64

In [11]:
clf = MultinomialNB()
clf.fit(X_train, y_train)

MultinomialNB()

In [12]:
# Az olan orneklerin oversamplingi basarisiz
# BoW ile uretilen vektorler sparse oldugu icin, SMOTE tarzi teknikler
# ile veri uretimi basarisiz sonuclanmaktadir
print("Train acc:\t{0:.3f}".format(clf.score(X_train, y_train)))
print("Test acc:\t{0:.3f}".format(clf.score(X_test, y_test)))

print(classification_report(y_train, clf.predict(X_train)))
print(classification_report(y_test, clf.predict(X_test)))

Train acc:	0.713
Test acc:	0.806
              precision    recall  f1-score   support

       False       0.66      0.88      0.75     21342
        True       0.82      0.55      0.66     21342

    accuracy                           0.71     42684
   macro avg       0.74      0.71      0.71     42684
weighted avg       0.74      0.71      0.71     42684

              precision    recall  f1-score   support

       False       0.92      0.84      0.87      7097
        True       0.50      0.69      0.58      1724

    accuracy                           0.81      8821
   macro avg       0.71      0.76      0.73      8821
weighted avg       0.84      0.81      0.82      8821



In [13]:
# Final model olusturulmasi
# Aza orneklenen veri kullanilarak model olusturulmasi
X_false = data[data['label_sinkaf'] == False].sample(6845)["text"]
X_true = data[data['label_sinkaf'] == True]["text"]
X_undersampled = pd.concat([X_false, X_true])
y_undersampled = np.concatenate([np.zeros((6845,1)), np.ones((6845,1))])
y_undersampled = y_undersampled != 0 

vectorizer = CountVectorizer(min_df=2, max_df=0.99, stop_words=frozenset(stop_words_tr))
X_train = vectorizer.fit_transform(X_undersampled)

In [14]:
clf = MultinomialNB()
clf.fit(X_train, y_undersampled)

print("Train acc:\t{0:.3f}".format(clf.score(X_train, y_undersampled)))
print(classification_report(y_undersampled, clf.predict(X_train)))

Train acc:	0.870
              precision    recall  f1-score   support

       False       0.87      0.87      0.87      6845
        True       0.87      0.87      0.87      6845

    accuracy                           0.87     13690
   macro avg       0.87      0.87      0.87     13690
weighted avg       0.87      0.87      0.87     13690



In [15]:
# Offensive? - Kufur mu?

test = ["cok iyi", 
        "bi git", 
        "bi siktir git", 
        "bi defol",
        "mukemmel bir insansin"]

test_processed = vectorizer.transform([pre_process(sentence, 5) for sentence in test])
clf.predict(test_processed)

array([False,  True,  True,  True, False])

In [16]:
np.set_printoptions(suppress=True)

clf.predict_proba(test_processed)[:,1]

array([0.25272762, 0.74295274, 0.99612049, 0.81632094, 0.46115515])

In [17]:
import joblib

#dump it (higher), dump it (higher!)
joblib.dump(vectorizer, "sinkaf/data/vectorizer.joblib")
joblib.dump(clf, "sinkaf/data/clf.joblib")


['sinkaf/data/clf.joblib']