In [38]:
# necessary imports
import re
import json
import numpy as np
import pandas as pd
from datasets import load_dataset
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Bidirectional, LSTM, Dense, Embedding, SpatialDropout1D, BatchNormalization, Dropout

## Constants

In [39]:
VOCAB_SIZE = 16385  # Pre-defined vocabulary size from the tokenizer documentation

non_latin_letters = {'ç': 'c', 'ə': 'e', 'ı': 'i', 'ğ': 'g', 'ö': 'o', 'ş': 's', 'ü': 'u', 'ch': 'c', 'sh': 's',
                     'gh': 'g'}

common_words = [
    'men', 'sen', 'o', 'biz', 'siz', 'onlar', 'ne', 'kim', 'hara', 'niye', 'nece', 'hansi', 'ne vaxt', 'sonra',
    'eger', 'heqiqeten', 'lakin', 'cunki', 'bu', 'ki', 'butun', 've', 'ya', 'veya', 'amma', 'yoxsa', 'ancag', 'sadece',
    'qisa', 'uzun', 'kicik', 'boyuk', 'ora', 'bura', 'sag', 'sol', 'salam', 'bele', 'cox', 'az', 'e', 'bir', 'her'
]

suffixes = [
    'lar', 'ler', 'larin', 'lerin', 'mis', 'mak', 'mek', 'liq', 'luq',
    'acaq', 'eceq', 'ma', 'm', 'am', 'em', 'ar', 'er', 'araq', 'ereq', 'arak', 'erek', 'ca', 'ce', 'ci', 'cu', 'da',
    'de', 'dan', 'den', 'di', 'diq', 'dir', 'du', 'duq', 'dur', 'duk', 'dur', 'ib', 'ici', 'il', 'inci', 'uncu',
    'istan', 'is', 'in', 'la', 'le', 'las', 'les', 'liq', 'luq', 'luk', 'maq', 'mek', 'mis', 'mus', 'n', 'nci', 'ncu',
    's', 'ub', 'ucu', 'ul', 'ustan', 'us', 'ub', 'ucu', 'y',
    'cil', 'dar', 'der', 'an', 'en', 'gec', 'kar', 'kes', 'ken', 'la', 'le', 'las', 'les', 'lik', 't', 'ma', 'me',
    'nan', 'nen', 'ova', 'ov', 'sen', 'san', 'siniz', 'sul', 'sunas',
]

emojis = [
    '👍', '👎', '👌', '🙃', '😉', '❤️', '🖤', '💔', '💕', '💖', '💗', '💘', '💙', '💚', '💛', '💜', '💝', '💞', '💟',
    '💠', '🤗', '🤔', '🤣', '🤤', '🤥', '🤦', '🤧', '🤨', '🤩', '🤪', '🤫', '🤬', '🤭', '🤮', '🤯', '🤰', '🤱', '🤲',
    '🎉', '😡', '🥰'
]


## Preprocessing Text

In [40]:
def replace_non_latins(text: str) -> str:
    """
    Replace non-latin azerbaijani letters with their latin counterparts
    """
    pattern = re.compile('|'.join(map(re.escape, non_latin_letters.keys())))
    return pattern.sub(lambda m: non_latin_letters[m.group(0)], text)


def remove_punctuation(text: str) -> str:
    """
    Remove punctuation from the text
    """
    # Replace hyphens with spaces to avoid concatenation of words
    text = text.replace('-', ' ')
    punctuation = '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'
    return text.translate(str.maketrans('', '', punctuation))


def remove_common_words(text: str) -> str:
    """
    Remove common words from the text
    """
    return ' '.join([word for word in text.split() if word not in common_words])


def preprocess_text(text: str) -> str:
    """
    Preprocess the text by converting it to lowercase, replacing non-latin letters with their latin counterparts,
    removing non-alphabetic characters and removing common words
    :param text: input text
    :return: preprocessed text
    """
    text = text.lower()
    text = replace_non_latins(text)
    text = remove_punctuation(text)
    text = remove_common_words(text)
    return text if text else None


def tokenize_text(text: str) -> np.ndarray:
    """
    Tokenize the text by removing grammatical suffixes, considering roots and lexical suffixes as separate tokens
    :param text: input text
    :return: array of tokens
    """
    tokens = []
    for word in text.split():
        morphemes = []  # Morphemes of the word
        for suffix in suffixes:
            if word.endswith(suffix):
                # Remove grammatical suffix
                word = word[:-len(suffix)]
        for emoji in emojis:
            if emoji in word:
                morphemes.append(emoji)
                word = word.replace(emoji, '')
        if word and word not in common_words:
            morphemes.append(word)  # Add root as a separate token

        # Reverse the order of morphemes to get the correct order
        morphemes = morphemes[::-1]
        tokens.extend(morphemes)

    return np.array(tokens)


## Tokenizer

In [41]:
class Tokenizer:
    def __init__(self):
        self.vocab_size = 0
        self.vocab = {}
        self.corpus = []

    def fit(self, texts):
        """
        Fit the tokenizer on the given texts
        :param texts: text corpus
        :type texts: iterable of strings
        :return: None
        """
        for text in texts:
            tokens = tokenize_text(text)
            self.corpus.append(tokens)
            for token in tokens:
                if token not in self.vocab.keys():
                    self.vocab[token] = self.vocab_size + 1
                    self.vocab_size += 1

    def transform(self, texts) -> list:
        """
        Transform the given texts to tokenized form
        :param texts: input texts
        :return: tokenized texts
        """
        tokenized_texts = []
        for text in texts:
            tokens = tokenize_text(text)
            tokenized_text = [self.vocab[token] for token in tokens if token in self.vocab.keys()]
            tokenized_texts.append(tokenized_text)
        return tokenized_texts

    def transform_single(self, text: str) -> np.ndarray:
        """
        Transform a single text to tokenized form
        :param text: input text
        :return: tokenized text
        """
        tokens = tokenize_text(text)
        tokenized_text = [self.vocab[token] for token in tokens if token in self.vocab.keys()]
        return np.array(tokenized_text)

    def token_counts(self):
        """
        Count the number of occurrences of each token in the corpus
        """
        counts = {token: 0 for token in self.vocab.keys()}
        for tokens in self.corpus:
            for token in tokens:
                try:
                    counts[token] += 1
                except KeyError:
                    pass  # Ignore tokens not in the vocabulary
        return counts

    def _reindex_vocab(self):
        """
        Reindex the vocabulary after removing tokens
        """
        new_vocab = {}
        for i, token in enumerate(self.vocab.keys()):
            new_vocab[token] = i + 1
        self.vocab = new_vocab

    def remove_rare_tokens(self, min_count: int):
        """
        Remove tokens with occurrences less than the given minimum count from the vocabulary
        :param min_count: the minimum count for the tokens
        :return: None
        """
        token_counts = self.token_counts()
        for token, count in token_counts.items():
            if count < min_count:
                del self.vocab[token]
                self.vocab_size -= 1
        self._reindex_vocab()

    def keep_top_k(self, k: int):
        """
        Keep only the top k tokens in the vocabulary
        :param k: number of tokens to keep
        :return: None
        """
        token_counts = self.token_counts()
        sorted_tokens = sorted(token_counts, key=token_counts.get, reverse=True)
        for token in sorted_tokens[k:]:
            del self.vocab[token]
            self.vocab_size -= 1
        self._reindex_vocab()

    def save(self, path: str):
        """
        Save the vocabulary to the given path
        """
        with open(path, 'w') as f:
            f.write(json.dumps(self.vocab))

    def load(self, path: str):
        """
        Load the vocabulary from the given path
        """
        with open(path, 'r') as f:
            self.vocab = json.loads(f.read())
            self.vocab_size = len(self.vocab)
            self._reindex_vocab()
        return self

## Load the dataset

In [42]:
dataset = load_dataset("hajili/azerbaijani_review_sentiment_classification")

In [43]:
train = dataset['train']
test = dataset['test']

In [44]:
train = pd.DataFrame(train).dropna()
test = pd.DataFrame(test).dropna()

In [45]:
train.head()

Unnamed: 0,content,score,upvotes
0,Çox qəşəy,5,0
1,Men niye nomre ile qeydiyatdan kece bilmirem.....,1,0
2,Salam. Mən yukluyə bilmirəm. Necə kömək edə bi...,5,0
3,cox gözəl,5,1
4,Xaiş edirem bu problemi hell edinde həftədə 2 ...,5,0


## Preprocess the text

In [46]:
train['content'] = train['content'].apply(preprocess_text)
test['content'] = test['content'].apply(preprocess_text)

In [47]:
train = train.dropna()
test = test.dropna()

In [48]:
train.head()

Unnamed: 0,content,score,upvotes
0,qesey,5,0
1,nomre ile qeydiyatdan kece bilmiremkecirem mes...,1,0
2,yukluye bilmirem komek ede bilersiz,5,0
3,gozel,5,1
4,xais edirem problemi hell edinde heftede 2 def...,5,0


## Tokenize the content

In [49]:
tokenizer = Tokenizer()
tokenizer.fit(train['content'].values)
tokenizer.fit(train['content'].values)
tokenizer.vocab_size

49350

In [50]:
tokenizer.keep_top_k(1000)
tokenizer.vocab_size

1000

In [51]:
tokenizer.transform(train['content'].values)

[[1],
 [2, 3, 4, 5, 6, 7, 8, 9, 10, 11],
 [12, 13, 14],
 [15],
 [16, 17, 18, 19, 20, 21, 22, 23, 24, 25],
 [],
 [26, 27, 28],
 [29, 30, 31, 32],
 [33, 4, 34, 35, 36, 37, 33, 38, 39, 40, 41],
 [42, 43],
 [44],
 [45, 46],
 [47, 48, 49, 50, 51, 52, 53],
 [54],
 [],
 [55],
 [55],
 [],
 [55, 56],
 [27],
 [55],
 [15, 57, 58, 40, 59, 3, 60, 5, 61],
 [],
 [62, 8],
 [63],
 [64, 65, 66, 64, 67],
 [68, 69, 70, 71, 36, 72, 23, 73, 74, 16],
 [75, 76, 77],
 [15],
 [78, 79, 80, 81],
 [],
 [],
 [],
 [],
 [],
 [8],
 [82, 31],
 [83],
 [84, 63],
 [],
 [55],
 [85],
 [86, 87],
 [88],
 [89, 90, 69, 91, 92, 93],
 [],
 [94, 95, 96],
 [],
 [97],
 [55, 98, 83, 55, 99],
 [100, 101, 102],
 [103, 87],
 [85, 104, 67, 105, 106, 105, 107, 16, 17],
 [],
 [],
 [108, 109, 110, 111, 112, 113, 114, 77, 115, 73, 116, 30],
 [63],
 [],
 [8, 117],
 [55],
 [118],
 [],
 [55],
 [83, 110],
 [],
 [119, 120],
 [121,
  122,
  123,
  124,
  96,
  110,
  125,
  126,
  127,
  128,
  129,
  130,
  51,
  52,
  131,
  121,
  132,
  3,
  1

In [52]:
# length distribution of the tokenized sequences
lengths = [len(x) for x in tokenizer.transform(train['content'].values)]
np.mean(lengths), np.std(lengths), np.max(lengths)

(2.6550639119278725, 4.129762083696163, 156)

In [53]:
target_length = 20

In [54]:
train['content'] = tokenizer.transform(train['content'].values)
# drop empty sequences
train = train[train['content'].apply(len) > 0]
# pad the sequences
train['content'] = train['content'].apply(lambda x: np.array(x[:target_length] + [0]*(target_length-len(x))))

test['content'] = tokenizer.transform(test['content'].values)
# drop empty sequences
test = test[test['content'].apply(len) > 0]
# pad the sequences
test['content'] = test['content'].apply(lambda x: np.array(x[:target_length] + [0]*(target_length-len(x))))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['content'] = test['content'].apply(lambda x: np.array(x[:target_length] + [0]*(target_length-len(x))))


In [55]:
train.head()

Unnamed: 0,content,score,upvotes
0,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",5,0
1,"[2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 0, 0, 0, 0, 0...",1,0
2,"[12, 13, 14, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",5,0
3,"[15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",5,1
4,"[16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 0, 0,...",5,0


In [56]:
tokenizer.save('tokenizer.json')

## Split the dataset into train and validation

In [57]:
val = train.sample(frac=0.2, random_state=42)
train = train.drop(val.index)

In [58]:
train['content'].values.shape, val['content'].values.shape, test['content'].values.shape

((79336,), (19834,), (24807,))

In [59]:
X_train = np.vstack(train['content'].values)
X_val = np.vstack(val['content'].values)
X_test = np.vstack(test['content'].values)

In [60]:
X_train.shape

(79336, 20)

## Normalize the labels

In [61]:
# scale the labels to 0-1 range based on the fact that the original scores are in 1-5 range
def scale_label(label):
    return (label-1)/4

In [62]:
train_labels = train['score'].apply(scale_label).values
val_labels = val['score'].apply(scale_label).values
test_labels = test['score'].apply(scale_label).values

In [63]:
y_train = np.array(train_labels)
y_val = np.array(val_labels)
y_test = np.array(test_labels)

## Train the model

In [64]:
model = Sequential()
model.add(Embedding(1001, 64, input_length=20))
model.add(SpatialDropout1D(0.2))
model.add(Bidirectional(LSTM(32)))
model.add(BatchNormalization())
model.add(Dropout(0.2))
model.add(Dense(1, activation='sigmoid'))

In [65]:
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [67]:
history = model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_val, y_val), verbose=2)

Epoch 1/10
2480/2480 - 56s - loss: 0.2959 - accuracy: 0.8245 - val_loss: 0.2679 - val_accuracy: 0.8370 - 56s/epoch - 23ms/step
Epoch 2/10
2480/2480 - 51s - loss: 0.2751 - accuracy: 0.8355 - val_loss: 0.2656 - val_accuracy: 0.8408 - 51s/epoch - 21ms/step
Epoch 3/10
2480/2480 - 51s - loss: 0.2656 - accuracy: 0.8400 - val_loss: 0.2585 - val_accuracy: 0.8424 - 51s/epoch - 21ms/step
Epoch 4/10
2480/2480 - 52s - loss: 0.2595 - accuracy: 0.8435 - val_loss: 0.2597 - val_accuracy: 0.8435 - 52s/epoch - 21ms/step
Epoch 5/10
2480/2480 - 52s - loss: 0.2537 - accuracy: 0.8459 - val_loss: 0.2608 - val_accuracy: 0.8428 - 52s/epoch - 21ms/step
Epoch 6/10
2480/2480 - 54s - loss: 0.2500 - accuracy: 0.8485 - val_loss: 0.2643 - val_accuracy: 0.8406 - 54s/epoch - 22ms/step
Epoch 7/10
2480/2480 - 54s - loss: 0.2458 - accuracy: 0.8506 - val_loss: 0.2641 - val_accuracy: 0.8439 - 54s/epoch - 22ms/step
Epoch 8/10
2480/2480 - 53s - loss: 0.2447 - accuracy: 0.8518 - val_loss: 0.2678 - val_accuracy: 0.8424 - 53s/ep

In [68]:
# evaluate the model
loss, accuracy = model.evaluate(X_test, y_test, verbose=0)
print('Accuracy: %f' % (accuracy*100))

Accuracy: 84.060949


In [69]:
# save the model
model.save('model.keras')

In [78]:
from tensorflow import keras

tokenizer1 = Tokenizer()
tokenizer1.load('tokenizer.json')
model = keras.models.load_model('model.keras')


def predict_sentiment(sentence: str) -> float:
    """
    Predict the sentiment of the given sentence
    :param sentence: input sentence
    :return: sentiment score
    """
    sentence = preprocess_text(sentence)
    tokens = tokenizer.transform_single(sentence)
    tokens = keras.preprocessing.sequence.pad_sequences([tokens], maxlen=20, padding='post', truncating='post')[0]
    return model.predict(tokens.reshape(1, 20))[0][0]

test_sentence = "Men her sheyden razi qaldim!"
print(predict_sentiment(test_sentence))
test_sentence2 = "Zibil kimi isleyir"
print(predict_sentiment(test_sentence2))
test_sentence3 = "Butun gunu dexlisiz zengler gelir, artiq bezdim"
print(predict_sentiment(test_sentence3))


0.9966212
0.1087859
0.1545349
