# Deep Learning Model

## Preparing Data

In [1]:
from model import Model,DataSet
from transformers import AutoTokenizer
import torch

torch.manual_seed(42)
torch.cuda.manual_seed(42)

from torch.utils.tensorboard import SummaryWriter
from torch.utils.data import DataLoader
from tqdm.auto import tqdm
from torchmetrics import Accuracy,F1Score
from torchmetrics.classification import BinaryAccuracy
import numpy as np
import torch.nn.functional as F
from torchinfo import summary
import pandas as pd
from sklearn.utils import shuffle
from zemberek import TurkishTokenizer
from torchnlp.encoders.text import StaticTokenizerEncoder, stack_and_pad_tensors, pad_tensor
import sklearn


In [2]:
tokenizer = AutoTokenizer.from_pretrained("dbmdz/bert-base-turkish-cased")
tokenizer.padding_side = "left"
tokenizer.pad_token = tokenizer.eos_token
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

0

In [3]:
splits = {'train': 'train.csv', 'test': 'test.csv'}
df_train = pd.read_csv("hf://datasets/winvoker/turkish-sentiment-analysis-dataset/" + splits["train"])
df_test = pd.read_csv("hf://datasets/winvoker/turkish-sentiment-analysis-dataset/" + splits["test"])

In [4]:
df_train_pos = df_train[df_train["label"]=="Positive"].sample(50905,random_state=42)
df_train_neg = df_train[df_train["label"]=="Negative"].sample(50905,random_state=42)
df_train_notr = df_train[df_train["label"]=="Notr"].sample(50905,random_state=42)

In [5]:
import sklearn.preprocessing
import sklearn.utils


df_train = sklearn.utils.shuffle(pd.concat((df_train_pos,df_train_neg,df_train_notr),ignore_index=True),random_state=42)

##### Train

In [6]:
def preprocess(examples):
    model_inputs = tokenizer(examples, max_length=128,padding="max_length",truncation=True,return_tensors="pt")
    return np.array(model_inputs["input_ids"])[0]

In [7]:
encoded_text = df_train["text"].map(preprocess).to_list()

In [8]:
labels = np.array(df_train["label"].to_list())

In [9]:
encoded_labels = []
factorized = pd.factorize(df_train["label"],sort=True)
print(factorized[1])
train_label = F.one_hot(torch.tensor(factorized[0],dtype=torch.long),num_classes=3).to(torch.float32)

Index(['Negative', 'Notr', 'Positive'], dtype='object')


In [10]:
train_text = torch.Tensor(np.array(encoded_text))
train_text = train_text.type(dtype=torch.float32)

##### Test

In [11]:
df_test["text"] = df_test["text"].convert_dtypes(convert_string=True)



In [12]:
def preprocess(examples):
    model_inputs = tokenizer(examples, max_length=128,padding="max_length",truncation=True,return_tensors="pt")
    return np.array(model_inputs["input_ids"])[0]

In [13]:
encoded_text = df_test["text"].map(preprocess).to_list()

In [14]:
labels = np.array(df_test["label"].to_list())

In [15]:
encoded_labels = []
factorized = pd.factorize(df_test["label"],sort=True)
print(factorized[1])
test_label = F.one_hot(torch.tensor(factorized[0],dtype=torch.long),num_classes=3)

Index(['Negative', 'Notr', 'Positive'], dtype='object')


In [16]:
test_text = torch.Tensor(np.array(encoded_text))
test_text = test_text.type(dtype=torch.float32)

##### Prepare Training

In [17]:
model = Model().to("cuda")
summary(model)

If you want to use `BertLMHeadModel` as a standalone, add `is_decoder=True.`


Layer (type:depth-idx)                                            Param #
Model                                                             --
├─MFP: 1-1                                                        --
│    └─LeakyReLU: 2-1                                             --
│    └─ReLU: 2-2                                                  --
│    └─Sigmoid: 2-3                                               --
│    └─Tanh: 2-4                                                  --
│    └─Mish: 2-5                                                  --
│    └─SiLU: 2-6                                                  --
│    └─SELU: 2-7                                                  --
│    └─ELU: 2-8                                                   --
│    └─GELU: 2-9                                                  --
│    └─Softplus: 2-10                                             --
│    └─Linear: 2-11                                               983,808
│    └─Linear: 2-12     

In [18]:
train_dataset = DataSet(train_text,train_label)
test_dataset = DataSet(test_text,test_label)

In [19]:
train_dataloader = DataLoader(dataset=train_dataset,shuffle=True,batch_size=128)
test_dataloader = DataLoader(dataset=test_dataset,shuffle=False,batch_size=128)

In [20]:
optimezer = torch.optim.AdamW(model.parameters(),lr=1e-5)
scheduler = torch.optim.lr_scheduler.LinearLR(optimizer=optimezer,end_factor=1e-8,total_iters=25)
loss = torch.nn.CrossEntropyLoss().to("cuda")

In [21]:
acc = Accuracy("multilabel",num_labels=3).to("cuda")
f1 = F1Score("multilabel",num_labels=3).to("cuda")

In [22]:
tensorboard = SummaryWriter("./log",comment="model")

### Spell funcs

In [23]:
import logging

from zemberek import (
    TurkishSentenceNormalizer,
    TurkishSentenceExtractor,
    TurkishMorphology,
    TurkishSpellChecker,
)

logger = logging.getLogger(__name__)

morphology = TurkishMorphology.create_with_defaults()
normalizer = TurkishSentenceNormalizer(morphology)
extractor = TurkishSentenceExtractor()
spell_checker = TurkishSpellChecker(morphology)

def correct_spells(text: str):

    words = text.split(" ")
    for i, word in enumerate(words):
        try:
            words[i] = spell_checker.suggest_for_word(word)[0]
        except BaseException:
            continue
    return " ".join(words)

2024-07-31 12:02:22,559 - zemberek.morphology.turkish_morphology - INFO
Msg: TurkishMorphology instance initialized in 2.0554051399230957



In [24]:
def normalize_long_text(paragraph: str) -> str:
    result = []
    for sentence in paragraph:
        try:
            result.append(normalizer.normalize(sentence))
        except BaseException:
            continue
    return " ".join(result)

In [25]:
from nltk.corpus import stopwords
import re
import nltk

nltk.download("stopwords")
stops = set(stopwords.words("turkish"))

def clear_stop_words(sentence):
    return [word for word in sentence.split(" ") if word not in stops]

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/musasina/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [26]:
import zeyrek

analyzer = zeyrek.MorphAnalyzer()

def lemmatize_sent(text):
    result = []
    for word in text:
        try:
            result.append(analyzer.lemmatize(word)[0][1][0])
        except BaseException:
            result.append(word)
    return " ".join(result)

In [27]:
def clear_sentence(text:str):
    text = text.casefold()
    text = correct_spells(text)
    text = normalize_long_text(text)
    text = clear_stop_words(text)
    text = lemmatize_sent(text)
    return text

### Train

In [28]:
past_acc = 0
for epoch in tqdm(range(0,25)):
    for i,(x,y) in enumerate(tqdm(train_dataloader)):
        model.train()
        y = y.to("cuda")
        final = model(x)
        losses = loss(final,y)
        model.eval()
        with torch.inference_mode():
            tensorboard.add_scalar(tag="loss",scalar_value=losses.cpu().item(),global_step=(epoch*len(train_dataloader))+i)
        model.train()
        optimezer.zero_grad()
        losses.backward()
        #[loss_back.backward(retain_graph=True) for loss_back in losses]
        #torch.nn.utils.clip_grad_norm_(model.parameters(),1.0,error_if_nonfinite=True)
        optimezer.step()
        if (i+1)%100 == 0:
            model.eval()
            with torch.inference_mode():
                text = "turkcell müşteri hizmetlerinden istediğim verimi aldım"
                tokens = tokenizer([text], max_length=128,padding="max_length",truncation=True,return_tensors="pt")["input_ids"]
                print(model(tokens).argmax(dim=1))
    scheduler.step()
    model.eval()
    with torch.inference_mode():
        results_acc = []
        results_f1 = []
        print("started testing ...")
        for i,(x,y) in enumerate(tqdm(test_dataloader)):
            y = y.to("cuda")
            result = model(x)
            results_acc.append(acc(result,y).cpu().item())
            results_f1.append(f1(result,y).cpu().item())
        print("test acc -> ", np.mean(results_acc))
        print("test f1 -> ", np.mean(results_f1))
    if past_acc < np.mean(results_acc):
        torch.save({"model_state_dict":model.state_dict(),
                "optimezer_state_dict": optimezer.state_dict(),
                "scheduler_state_dict": scheduler.state_dict(),
                "loss": loss.state_dict(),
                "epoch":epoch},"./model.pth")
        past_acc = np.mean(results_acc)

  0%|          | 0/25 [00:00<?, ?it/s]

  0%|          | 0/1194 [00:00<?, ?it/s]

We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


In [None]:
model.eval()
with torch.inference_mode():
    text = "turkcell müşteri hizmetlerinden istediğim verimi aldım"
    tokens = tokenizer([text], max_length=8,padding="max_length",truncation=True,return_tensors="pt")["input_ids"]
    print(model(tokens).argmax(dim=2).view(-1))

RuntimeError: mat1 and mat2 shapes cannot be multiplied (1x80 and 1280x768)

In [None]:
model.eval()
with torch.inference_mode():
    results_acc = []
    results_f1 = []
    print("started testing ...")
    for i,(x,y) in enumerate(tqdm(test_dataloader)):
        y = y.to("cuda")
        results = model(x)
        for result,inner_y in zip(results,y):
            results_acc.append(acc(result,inner_y).cpu().item())
            results_f1.append(f1(result,inner_y).cpu().item())
    print("test acc -> ", np.mean(results_acc))
    print("test f1 -> ", np.mean(results_f1))

started testing ...


  0%|          | 0/192 [00:00<?, ?it/s]

test acc ->  0.27889183438112347
test f1 ->  0.40866448055492904


In [None]:
loadded = torch.load("./model.pth")

In [None]:
model.load_state_dict(loadded["model_state_dict"])
optimezer.load_state_dict(loadded['optimezer_state_dict'])
scheduler.load_state_dict(loadded["scheduler_state_dict"])
loss.load_state_dict(loadded["loss"])
start = loadded["epoch"]