<a href="https://colab.research.google.com/github/Hamxea/Bidirectional-Encoder-Representations-from-Transformers-Turkish-Text-Classification/blob/main/cmp711project_baseline_nlp_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Embedding
import pandas as pd
from tensorflow.python.keras.layers import GRU, Bidirectional
from tensorflow.python.keras.utils import np_utils
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate

from google.colab import drive

import tensorflow as tf

In [2]:
# Default data location used when Google Drive cannot be mounted; it is
# replaced by the Drive "dataset" folder when the mount below succeeds.
data_path =  "/dataset/"

try:
    drive.mount('/content/drive')
    data_path = "/content/drive/My Drive/dataset/"

except Exception as err:
    # Catch Exception rather than using a bare `except:` so that
    # KeyboardInterrupt / SystemExit still propagate, and report the actual
    # reason so a failed mount is not mistaken for "not running in Colab".
    print("You are not working in Colab at the moment :(")
    print("Drive mount failed (%r); falling back to data_path=%r" % (err, data_path))

Mounted at /content/drive


In [3]:
# Load the semicolon-separated corpus from data_path; later cells read its
# 'category' and 'text' columns.
df = pd.read_csv(data_path + 'turkish_text_data.csv', sep=';', encoding='utf-8')

In [4]:
# Drop rows whose text is duplicated (keeping the first occurrence), shuffle
# the rows and renumber the index. The shuffle is seeded so that a fresh
# Restart & Run All reproduces the same row order, and therefore the same
# documents behind the fixed sample indices used in later cells.
df = (
    df.drop_duplicates(subset=['text'], keep='first')
      .sample(frac=1, random_state=42)
      .reset_index(drop=True)
)

# Per-category document counts and overall shape after de-duplication.
print(df.groupby('category').count())
print(df.shape)

           text
category       
dunya       677
ekonomi     690
kultur      567
saglik      632
siyaset     690
spor        636
teknoloji   647
(4539, 2)


In [5]:
# Preview the first ten rows of the shuffled frame.
df.head(10)

Unnamed: 0,category,text
0,kultur,erzurum da kültür sanat erzurum devlet_tiyatro...
1,siyaset,chp li aygün olarak gitmedim ki chp tunceli mi...
2,teknoloji, mwc 2013  e damga vuran ürünler huawei den ...
3,saglik,diş çürüğü erken doğum ve düşüğe neden olabili...
4,spor,şanssızlık değil hesap hatası ! başkan_aysal a...
5,siyaset,ak_partili aksu nun hayatı belgesel oluyor ak_...
6,dunya,yahudilerin torunlarına vatandaşlık hakkı ispa...
7,dunya,afganistan  da 12 taliban öldürüldü afganista...
8,saglik,canlıdan nakil kadavranın 3 5 katı internation...
9,ekonomi,uçağa ayakta yolcu geliyor ingiliz ryanair hav...


### Clean the training data: remove punctuation

In [46]:
import string
import re

# Deletion table: every ASCII punctuation character maps to None, so
# str.translate removes it (it is deleted, not replaced by a space).
punctDict = dict.fromkeys(string.punctuation)
transString = str.maketrans(punctDict)

# On str patterns, \w and \s are Unicode-aware, so Turkish letters
# (ç, ğ, ı, ö, ş, ü) count as word characters and are preserved. The previous
# ASCII-only pattern '[^a-zA-Z]+ ' deleted a word-final Turkish letter together
# with the following space (e.g. "hayatı belgesel" -> "hayatbelgesel") and
# glued the neighbours of numbers / double spaces together.
_non_word_re = re.compile(r'[^\w\s]')
_whitespace_re = re.compile(r'\s+')


def clean_text(text):
    """Return ``text`` with punctuation removed and whitespace normalised.

    Steps:
      1. Delete every character in ``string.punctuation``.
      2. Replace any remaining character that is neither a word character nor
         whitespace (non-ASCII punctuation, control characters such as
         ``\\x92``) with a single space, so neighbouring words are not merged.
      3. Collapse runs of whitespace to one space and strip both ends.

    Letters (including Turkish ones) and digits are kept.
    """
    without_punct = text.translate(transString)
    spaced = _non_word_re.sub(' ', without_punct)
    return _whitespace_re.sub(' ', spaced).strip()


totalContentCleaned = [clean_text(sen) for sen in df['text']]

df['text'] = totalContentCleaned

In [6]:
# Number of distinct categories; baseline_model uses it as the number of
# softmax output units.
output_dim = df.category.unique().size

target = df['category'].values.tolist()
data = df['text'].values.tolist()

# Map category names to integer ids, then to one-hot rows as required by the
# categorical cross-entropy loss.
encoder = LabelEncoder()
encoded_target = encoder.fit_transform(target)
# Use the public tf.keras API instead of the private
# tensorflow.python.keras.utils.np_utils helper.
y_train = tf.keras.utils.to_categorical(encoded_target)
X_train = data

# num_words caps the vocabulary used when texts are converted to sequences;
# fit_on_texts itself still records every word in word_index.
num_words = 12000
tokenizer = Tokenizer(num_words=num_words)

tokenizer.fit_on_texts(data)
print("Total vocab size:", len(tokenizer.word_index))


Total vocab size: 107887


In [7]:
# The Tokenizer stores every word in word_index / index_word during
# fit_on_texts; only when texts_to_sequences is called is the vocabulary
# restricted to the top num_words entries.
# (The former bare look-ups `tokenizer.index_word[12000]` and `X_train[800]`
# displayed nothing mid-cell and could raise on a smaller corpus, so they
# were removed.)
X_train_tokens = tokenizer.texts_to_sequences(X_train)

# Token ids of one sample document.
print(X_train_tokens[800])

[1228, 1773, 6544, 1, 2304, 6, 143, 69, 116, 7325, 6, 442, 35, 2304, 1, 6544, 6, 17, 1228, 6231, 4, 765, 1136, 73, 227, 1559, 16, 641, 511, 7, 11, 7648, 13, 4153, 604, 205, 15, 22, 7325, 17, 1228, 83, 2304, 1, 6544, 17, 1228, 6231, 4, 3115, 61, 4, 274, 6752, 2413, 9, 3406, 49, 990, 73, 227, 283, 1559, 2978, 125, 7325, 17, 1228, 6231, 758, 104, 465, 4, 2273, 2, 1228, 1773, 736, 8, 40, 12, 7283, 68, 1315, 561, 39, 1759, 1661, 6544, 17, 201, 1228, 6231, 1, 2304, 21, 32, 1228, 6231, 641, 511, 7, 11, 7648, 13, 4153, 604, 205, 379, 9344, 71, 7325, 17, 40, 781, 379, 22, 2304, 789, 135, 2, 4267, 4241, 1, 2420, 52, 64, 1559, 69, 7648, 118, 349, 7325, 11, 29, 107, 2, 379, 22, 7325, 162, 2266, 5, 565, 25, 2225, 733, 2103, 7, 1919, 8367, 604, 46, 3115, 3676, 52, 64, 1559, 118, 349, 336, 170, 30, 2, 575, 4501, 867, 2, 494, 251, 38, 1244, 9, 67, 1071, 187, 3054, 4, 1242, 1559, 3, 318, 825, 908, 86, 537, 1730, 22, 88, 1398, 623, 1267, 132, 263, 1244, 5, 9, 5, 403, 433, 157, 9, 67, 8342, 596, 174, 297

In [8]:
# Number of tokens in every tokenised document.
num_tokens = np.array([len(tokens) for tokens in X_train_tokens])

# Determine max_tokens: mean + 2 standard deviations of the document lengths,
# truncated to an integer.
max_tokens = int(np.mean(num_tokens) + (2 * np.std(num_tokens)))

# A document is covered when its length is <= max_tokens: a sequence of exactly
# max_tokens tokens still fits into a window of maxlen=max_tokens, so the
# comparison has to be inclusive.
print("What percentage of the total does max_token contain: %", int(100 * np.sum(num_tokens <= max_tokens) / len(num_tokens)))

What percentage of the total does max_token contain: % 95


In [9]:
# Bring every token sequence to length max_tokens (pad_sequences defaults are
# used for the padding / truncation side).
X_train_pad = pad_sequences(X_train_tokens, maxlen=max_tokens)

# Reverse lookup (token id -> word) built from the tokenizer's word -> id map.
# The former bare expressions (`X_train_pad.shape`, `X_train_pad[800]`, ...)
# displayed nothing mid-cell and could raise IndexError on a corpus with fewer
# than 801 documents, so they were removed.
idx = tokenizer.word_index
inverse_map = {token_id: word for word, token_id in idx.items()}

def tokens_to_string(tokens):
    """Turn a sequence of token ids back into a space-separated string.

    Ids equal to 0 are skipped; every other id is looked up in the
    notebook-level ``inverse_map`` (a missing id raises ``KeyError``).
    """
    return ' '.join(inverse_map[token_id] for token_id in tokens if token_id != 0)

# Decode one sample document back to text. Only the last expression of a cell
# is displayed, so the earlier bare expressions (which showed nothing) were
# removed.
tokens_to_string(X_train_tokens[800])

'halka arz vakıfbank ve ziraat \x92 te ama yok halkbank \x92 tan sonra ziraat ve vakıfbank \x92 ın halka arzı da gündeme alındı başbakan yardımcısı babacan \x93 fakat bunlar için çok acele en azından birkaç ay \x94 dedi halkbank ın halka ardından ziraat ve vakıfbank ın halka arzı da gündemde istanbul da düzenlenen financial times türkiye zirvesi ne katılan başbakan yardımcısı ali babacan sorular üzerine halkbank ın halka arzı sonrasında diğer kamu da olası bir halka arz süreci ile ilgili olarak önümüzde iki tane konu var bunlardan birisi vakıfbank ın ikinci halka arzı ve ziraat in ilk halka arzı fakat bunlar için çok acele en azından birkaç ay lazım piyasanın önce halkbank ın ilgili etmesi lazım dedi ziraat bankası nda bir danışmanlık firması ve çalışmalarının ifade eden babacan ama acele şu anda halkbank çok yeni bunun bir lazım dedi halkbank ta hazine de kalan yüzde 51 lik pay için stratejik satışın birkaç yıl gündemde olmayacağını ifade eden babacan şu anda dünyada böyle büyük bir s

In [25]:
def baseline_model():
    """Build and compile the bidirectional-GRU baseline classifier.

    Reads the notebook-level globals ``num_words`` (embedding input
    dimension), ``max_tokens`` (input sequence length) and ``output_dim``
    (number of softmax units).

    Returns:
        A compiled ``Sequential`` model: Embedding -> BiGRU(250, sequences)
        -> Dropout(0.3) -> BiGRU(250) -> Dense softmax, using categorical
        cross-entropy, the Adam optimizer and the accuracy metric.
    """
    embedding_size = 100

    # GRU and Bidirectional come from the public tf.keras namespace so that
    # every layer belongs to the same Keras implementation as Sequential,
    # Embedding, Dropout and Dense (imported from tensorflow.keras). Mixing in
    # the private tensorflow.python.keras classes can make Sequential reject
    # the layers on TensorFlow releases where the two packages differ.
    model = Sequential()

    model.add(Embedding(input_dim=num_words,
                        output_dim=embedding_size,
                        input_length=max_tokens,
                        name='embedding_layer'))

    model.add(tf.keras.layers.Bidirectional(
        tf.keras.layers.GRU(units=250, return_sequences=True)))
    model.add(Dropout(0.3))
    model.add(tf.keras.layers.Bidirectional(tf.keras.layers.GRU(units=250)))
    model.add(Dense(output_dim, activation='softmax'))

    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

In [11]:
# Number of target categories, i.e. the width of the model's softmax layer.
print(output_dim)

7


In [26]:
# Two-fold cross-validation of the baseline model on the padded sequences.
estimator = KerasClassifier(build_fn=baseline_model, epochs=4, batch_size=128, verbose=1)
# Seed the shuffle so the fold assignment is the same on every run; without
# random_state each execution evaluates on different splits.
kfold = KFold(n_splits=2, shuffle=True, random_state=42)
results = cross_val_score(estimator, X_train_pad, y_train, cv=kfold)
print("Accuracy: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
Accuracy: 59.48% (1.64%)


In [None]:
# Precision, recall and F1 for the 7-class problem.
#  * 'precision' / 'f1' are binary-average scorers and raise on multiclass
#    targets, so the macro-averaged variants are used instead.
#  * The scorers compare against the estimator's predicted class labels, so the
#    integer labels (encoded_target) are passed rather than the one-hot y_train.
#    NOTE(review): this relies on KerasClassifier accepting integer labels
#    with a categorical cross-entropy loss; confirm for the installed wrapper.
#  * cross_validate computes all metrics in one pass, so they are measured on
#    the same folds and the model is trained once per fold instead of once per
#    metric. `recall` is now defined for the cells below that print it.
cv_scores = cross_validate(
    estimator, X_train_pad, encoded_target, cv=kfold,
    scoring=['precision_macro', 'recall_macro', 'f1_macro'],
)
precision = cv_scores['test_precision_macro']
recall = cv_scores['test_recall_macro']
f1 = cv_scores['test_f1_macro']

In [None]:
# Summary: accuracy as mean (std) in percent, then for each remaining metric
# its mean followed by the per-fold values.
print("Accuracy: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))
for metric_label, fold_scores in (('Recall', recall), ('Precision', precision), ('F1', f1)):
    print(metric_label, np.mean(fold_scores), fold_scores)

In [None]:
# Mean (std) of the per-fold recall in percent. The label previously read
# "Accuracy" although the values printed are the recall statistics.
print("Recall: %.2f%% (%.2f%%)" % (recall.mean()*100, recall.std()*100))