# Deep Learning Model

## Preparing Data

In [1]:
from model import Model,DataSet
from transformers import AutoTokenizer
import torch

torch.manual_seed(42)
torch.cuda.manual_seed(42)

from torch.utils.tensorboard import SummaryWriter
from torch.utils.data import DataLoader
from tqdm.auto import tqdm
from torchmetrics import Accuracy
from torchmetrics.classification import BinaryAccuracy
import numpy as np
import torch.nn.functional as F
from torchinfo import summary
import pandas as pd
from sklearn.utils import shuffle
from zemberek import TurkishTokenizer
from torchnlp.encoders.text import StaticTokenizerEncoder, stack_and_pad_tensors, pad_tensor
import sklearn


In [2]:
# Load the tokenizer published with the "ytu-ce-cosmos/turkish-gpt2" checkpoint.
tokenizer = AutoTokenizer.from_pretrained("ytu-ce-cosmos/turkish-gpt2")
# Pad on the left side of each sequence.
tokenizer.padding_side = "left"
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

##### Train

In [3]:
# Load the training split.
# NOTE(review): absolute, machine-specific path; consider a configurable data directory.
df_train = pd.read_csv("/home/musasina/projects/teknofest/msnet/datasets/train_final.csv")
# Let pandas pick its dedicated string dtype for the text column.
df_train["text"] = df_train["text"].convert_dtypes(convert_string=True)



In [4]:
def preprocess(examples):
    """Tokenize ``examples`` into a fixed-length row of 16 input ids.

    The text is truncated or padded to exactly 16 tokens using the
    module-level ``tokenizer``. Only the first row of the resulting
    ``input_ids`` batch is returned, as a NumPy array.
    """
    encoded = tokenizer(
        examples,
        max_length=16,
        padding="max_length",
        truncation=True,
        return_tensors="pt",
    )
    input_ids = np.array(encoded["input_ids"])
    return input_ids[0]

In [5]:
# Tokenize every training text; the result is a Python list with one array of input ids per row.
encoded_text = df_train["text"].map(preprocess).to_list()

In [6]:
import string
def convert_to_list(repr:str):
    """Extract the ASCII digits of ``repr`` as integers, in reverse order.

    Every character that is one of ``0``-``9`` becomes its own integer
    (so ``"12"`` yields two entries); all other characters are ignored.

    Returns:
        A list of ints, last digit of the input first.
    """
    digit_chars = frozenset(string.digits)
    found = [int(ch) for ch in repr if ch in digit_chars]
    found.reverse()
    return found

In [7]:
# Parse each serialized label string into a list of integer class ids.
# Rows that are already lists are passed through unchanged, so re-running this
# cell does not re-parse previously converted labels (which would turn every
# row into an empty list, because a list of ints contains no digit characters).
df_train["labels"] = df_train["labels"].map(
    lambda value: value if isinstance(value, list) else convert_to_list(value)
)

In [8]:
# Stack the per-row label lists into one NumPy array.
labels = np.array(df_train["labels"].to_list())

In [9]:
# Number of label entries per class id in the training split. These counts feed
# the class weights of the loss further down.
# NOTE(review): the variable names suggest 1 = neutral, 2 = negative, 3 = positive — confirm.
zero_count = (labels==0).sum()
notr_count = (labels==1).sum()
negative_count = (labels==2).sum()
positive_count = (labels==3).sum()

In [10]:
# One-hot encode the training labels: each class id selects the matching row
# of a 4x4 identity matrix.
labels = np.array(df_train["labels"].to_list())
encoded_labels = np.array([np.eye(4)[row] for row in labels])

In [11]:
# Token ids as a float32 tensor (no device is specified for this tensor).
train_text = torch.Tensor(np.array(encoded_text)).type(dtype=torch.float32)
# One-hot labels as an int64 tensor created directly on the GPU.
train_label = torch.as_tensor(encoded_labels,dtype=torch.long,device="cuda")

##### Test

In [None]:
# Load the test split.
# NOTE(review): absolute, machine-specific path; consider a configurable data directory.
df_test = pd.read_csv("/home/musasina/projects/teknofest/msnet/datasets/test_final.csv")
# Let pandas pick its dedicated string dtype for the text column.
df_test["text"] = df_test["text"].convert_dtypes(convert_string=True)



In [None]:
def preprocess(examples):
    """Return the first row of token ids for ``examples``, padded/truncated to 16 tokens.

    This cell redefines ``preprocess`` with the same body as the definition
    in the training-data section of this notebook.
    """
    tokenizer_kwargs = dict(
        max_length=16,
        padding="max_length",
        truncation=True,
        return_tensors="pt",
    )
    batch = tokenizer(examples, **tokenizer_kwargs)
    return np.array(batch["input_ids"])[0]

In [None]:
# Tokenize every test text. This rebinds `encoded_text`, which previously held the training rows.
encoded_text = df_test["text"].map(preprocess).to_list()

In [None]:
import string
def convert_to_list(repr:str):
    """Collect every ASCII digit character of ``repr`` as an int and return them reversed.

    Non-digit characters are skipped; multi-digit numbers contribute one
    entry per digit. This cell redefines ``convert_to_list`` with the same
    behavior as the definition in the training-data section.
    """
    digit_chars = set(string.digits)
    collected = [int(symbol) for symbol in repr if symbol in digit_chars]
    return collected[::-1]

In [None]:
# Parse each serialized label string into a list of integer class ids.
# Rows that are already lists are passed through unchanged, so re-running this
# cell does not re-parse previously converted labels (which would turn every
# row into an empty list, because a list of ints contains no digit characters).
df_test["labels"] = df_test["labels"].map(
    lambda value: value if isinstance(value, list) else convert_to_list(value)
)

In [None]:
# Stack the per-row test label lists into one NumPy array (rebinds `labels`).
labels = np.array(df_test["labels"].to_list())

In [None]:
# Number of label entries per class id in the test split.
# These use their own `test_` names on purpose: the un-prefixed counts come from
# the training split and are used later to derive the class weights of the loss.
# Rebinding them here would make those weights depend on the test data whenever
# the notebook is run top to bottom.
test_zero_count = (labels==0).sum()
test_notr_count = (labels==1).sum()
test_negative_count = (labels==2).sum()
test_positive_count = (labels==3).sum()

In [None]:
# One-hot encode the test labels: each class id selects the matching row of a
# 4x4 identity matrix. Rebinds `labels` and `encoded_labels`.
labels = np.array(df_test["labels"].to_list())
encoded_labels = np.array([np.eye(4)[row] for row in labels])

In [None]:
# Token ids as a float32 tensor (no device is specified for this tensor).
test_text = torch.Tensor(np.array(encoded_text)).type(dtype=torch.float32)
# One-hot labels as an int64 tensor created directly on the GPU.
test_label = torch.as_tensor(encoded_labels,dtype=torch.long,device="cuda")

##### Prepare Training

In [12]:
# Instantiate the project model on the GPU and display its per-layer parameter summary.
model = Model().to("cuda")
summary(model)

Layer (type:depth-idx)                                  Param #
Model                                                   --
├─MFP: 1-1                                              --
│    └─LeakyReLU: 2-1                                   --
│    └─ReLU: 2-2                                        --
│    └─Sigmoid: 2-3                                     --
│    └─Tanh: 2-4                                        --
│    └─Mish: 2-5                                        --
│    └─SiLU: 2-6                                        --
│    └─SELU: 2-7                                        --
│    └─ELU: 2-8                                         --
│    └─GELU: 2-9                                        --
│    └─Softplus: 2-10                                   --
│    └─Linear: 2-11                                     1,049,600
│    └─Linear: 2-12                                     1,049,600
│    └─Linear: 2-13                                     1,049,600
│    └─Linear: 2-14           

In [13]:
# Wrap the (text, label) tensor pairs in the project's DataSet class.
train_dataset = DataSet(train_text,train_label)
test_dataset = DataSet(test_text,test_label)

In [14]:
# Batches of 128 samples; only the training loader shuffles.
train_dataloader = DataLoader(dataset=train_dataset,shuffle=True,batch_size=128)
test_dataloader = DataLoader(dataset=test_dataset,shuffle=False,batch_size=128)

In [15]:
# Total number of label entries across the four classes.
all_count = notr_count + negative_count + positive_count + zero_count

In [16]:
# Class weight = 1 - (share of that class among all label entries),
# so less frequent classes receive larger weights.
zero_weight = 1.0 - zero_count / all_count
notr_weight = 1.0 - notr_count / all_count
negative_weight = 1.0 - negative_count / all_count
positive_weight = 1.0 - positive_count / all_count

In [17]:
# Order matters: position k holds the weight for class id k (0..3).
weights = [zero_weight,notr_weight,negative_weight,positive_weight]

In [18]:
# Show the computed class weights.
print(weights)

[0.8207141086281916, 0.7515735293298346, 0.7408805635874379, 0.6868317984545358]


In [19]:
# Sum of the class weights; the training loop divides the logged loss value by this.
loss_scale_factor = np.sum(weights)

In [20]:
# AdamW over all model parameters with a base learning rate of 1e-5.
optimezer = torch.optim.AdamW(model.parameters(),lr=1e-5)
# Linear learning-rate schedule; the training loop steps it once per epoch.
# `start_factor` is not passed, so PyTorch's default (1/3) applies and the rate
# moves from lr/3 towards lr*1e-5 over 10 scheduler steps.
# NOTE(review): confirm that starting at one third of the base rate is intended.
scheduler = torch.optim.lr_scheduler.LinearLR(optimizer=optimezer,end_factor=1e-5,total_iters=10)
# Cross-entropy loss with one weight per class id, moved to the GPU.
loss = torch.nn.CrossEntropyLoss(weight=torch.Tensor(weights)).to("cuda")

In [None]:
# Multiclass accuracy metric over 4 classes, kept on the GPU.
acc = Accuracy("multiclass",num_classes=4).to("cuda")

In [21]:
# TensorBoard writer; event files are written under ./log.
# NOTE(review): per the SummaryWriter docs, `comment` has no effect when a log
# directory is given explicitly — confirm whether it can be dropped.
tensorboard = SummaryWriter("./log",comment="model")

### Spell-correction and text-cleaning functions

In [22]:
import logging

from zemberek import (
    TurkishSentenceNormalizer,
    TurkishSentenceExtractor,
    TurkishMorphology,
    TurkishSpellChecker,
)

logger = logging.getLogger(__name__)

# Shared zemberek objects. The morphology instance is built once and handed to
# both the sentence normalizer and the spell checker.
morphology = TurkishMorphology.create_with_defaults()
normalizer = TurkishSentenceNormalizer(morphology)
extractor = TurkishSentenceExtractor()
spell_checker = TurkishSpellChecker(morphology)

def correct_spells(text: str):
    """Replace each word of ``text`` with the spell checker's first suggestion.

    Args:
        text: Input string; it is split on single spaces.

    Returns:
        The (possibly corrected) words joined back together with single
        spaces. A word is left unchanged when the checker offers no
        suggestion for it or raises while processing it.
    """
    words = text.split(" ")
    for i, word in enumerate(words):
        try:
            words[i] = spell_checker.suggest_for_word(word)[0]
        except Exception:
            # Best effort: keep the original word. Catching ``Exception`` rather
            # than ``BaseException`` lets KeyboardInterrupt/SystemExit propagate,
            # so a long-running cell can still be interrupted.
            continue
    return " ".join(words)

2024-07-28 19:00:52,920 - zemberek.morphology.turkish_morphology - INFO
Msg: TurkishMorphology instance initialized in 2.204500675201416



In [23]:
def normalize_long_text(paragraph: str) -> str:
    """Normalize ``paragraph`` sentence by sentence and join the results.

    Args:
        paragraph: Text to normalize. A string is first split into sentences
            with the module-level ``extractor``; any other iterable is
            treated as an already-split sequence of sentences.

    Returns:
        The normalized sentences joined with single spaces. A sentence the
        normalizer fails on is kept as-is instead of being dropped, so a
        failure never silently removes text.
    """
    if isinstance(paragraph, str):
        # Iterating a str directly would yield single characters, so split it
        # into sentences first.
        sentences = extractor.from_paragraph(paragraph)
    else:
        sentences = paragraph

    result = []
    for sentence in sentences:
        try:
            result.append(normalizer.normalize(sentence))
        except Exception:
            # Best effort: fall back to the un-normalized sentence.
            # ``Exception`` (not ``BaseException``) so Ctrl-C still interrupts.
            result.append(sentence)
    return " ".join(result)

In [24]:
from nltk.corpus import stopwords
import re
import nltk

# Fetch NLTK's stop-word corpus and keep the Turkish list as a set for fast membership tests.
nltk.download("stopwords")
stops = set(stopwords.words("turkish"))

def clear_stop_words(sentence):
    """Split ``sentence`` on single spaces and drop every token found in ``stops``.

    Returns:
        The remaining tokens as a list, in their original order.
    """
    kept = []
    for token in sentence.split(" "):
        if token in stops:
            continue
        kept.append(token)
    return kept

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/musasina/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [25]:
import zeyrek

# Morphological analyzer used for lemmatization.
analyzer = zeyrek.MorphAnalyzer()

def lemmatize_sent(text):
    """Lemmatize each word of ``text`` and join the lemmas with single spaces.

    Args:
        text: An iterable of words (for example the list returned by
            ``clear_stop_words``). A plain string is also accepted and is
            split on single spaces first; iterating it directly would
            process one character at a time.

    Returns:
        A single string. For every word, the first lemma of the analyzer's
        first result is used; a word is kept unchanged when the analyzer
        returns nothing usable or raises.
    """
    words = text.split(" ") if isinstance(text, str) else text
    result = []
    for word in words:
        try:
            result.append(analyzer.lemmatize(word)[0][1][0])
        except Exception:
            # Best effort: keep the original word. ``Exception`` (not
            # ``BaseException``) so KeyboardInterrupt/SystemExit propagate.
            result.append(word)
    return " ".join(result)

In [26]:
def clear_sentence(text:str):
    """Run the full cleaning pipeline on ``text`` and return the cleaned string.

    Steps, in order: case-fold, spell-correct, normalize, remove stop words,
    lemmatize.

    NOTE(review): ``str.casefold`` is locale-independent, so the Turkish
    dotted/dotless I pair is not folded by Turkish rules — confirm this is
    acceptable for the input data.
    """
    folded = text.casefold()
    corrected = correct_spells(folded)
    normalized = normalize_long_text(corrected)
    tokens = clear_stop_words(normalized)
    return lemmatize_sent(tokens)

### Train

In [27]:
# Train for 10 epochs. After every epoch: step the LR scheduler, write a
# checkpoint to ./model.pth and report accuracy on the test loader.
for epoch in tqdm(range(0,10)):

    model.train()
    for i,(x,y) in enumerate(tqdm(train_dataloader)):
        y = y.to("cuda")
        final = model(x)
        # Labels are used as per-position class-probability (one-hot) targets.
        # Flattening every leading axis into rows and calling the loss once gives
        # the same value as averaging one mean-reduced loss call per sample, and it
        # also works for a batch that contains a single sample.
        # NOTE(review): assumes the class axis is the last axis of both the model
        # output and the labels — confirm against Model/DataSet.
        num_classes = y.shape[-1]
        losses = loss(final.reshape(-1, num_classes), y.reshape(-1, num_classes).float())
        # Log the loss scaled by the sum of the class weights.
        tensorboard.add_scalar(tag="loss",scalar_value=losses.item()/loss_scale_factor,global_step=(epoch*len(train_dataloader))+i)
        optimezer.zero_grad()
        losses.backward()
        optimezer.step()
    scheduler.step()
    # Checkpoint keys are read back verbatim by the loading cell; keep their spelling.
    torch.save({"model_state_dict":model.state_dict(),
            "optimezer_state_dict": optimezer.state_dict(),
            "scheduler_state_dict": scheduler.state_dict(),
            "loss": loss.state_dict(),
            "epoch":epoch},"./model.pth")
    model.eval()
    # Start from a clean metric state so the reported value covers this epoch only.
    acc.reset()
    with torch.inference_mode():
        print("started testing ...")
        for x,y in tqdm(test_dataloader):
            y = y.to("cuda")
            results = model(x)
            preds = results.argmax(dim=2).reshape(-1)
            targets = y.reshape(-1)
            if targets.numel() != preds.numel():
                # One-hot labels: collapse the class axis to class ids so that
                # predictions and targets have one entry per position.
                targets = y.reshape(preds.numel(), -1).argmax(dim=-1)
            acc.update(preds, targets)
        # Accuracy over all test positions (not a mean of per-batch accuracies,
        # which would over-weight a smaller final batch).
        print("test -> ", acc.compute().item())


  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/1403 [00:00<?, ?it/s]

We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


tensor([3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3], device='cuda:0')
tensor([3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3], device='cuda:0')
tensor([3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3], device='cuda:0')
tensor([3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3], device='cuda:0')
tensor([3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3], device='cuda:0')
tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 3, 3, 3], device='cuda:0')
tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 3, 3, 3], device='cuda:0')
tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 3, 3, 3], device='cuda:0')
tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 3, 3, 3], device='cuda:0')
tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 3, 3, 3], device='cuda:0')
tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 3, 3, 3], device='cuda:0')
tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 3, 3, 3], device='cuda:0')
tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 3, 3, 3], device='cuda:0')
tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 

  0%|          | 0/1403 [00:00<?, ?it/s]

tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 3, 3, 3], device='cuda:0')
tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 3, 3, 3], device='cuda:0')
tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 3, 3, 3], device='cuda:0')
tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 3, 3, 3], device='cuda:0')
tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 3, 3, 3], device='cuda:0')
tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 3, 3, 3], device='cuda:0')
tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 3, 3, 3], device='cuda:0')
tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 3, 3, 3], device='cuda:0')
tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 3, 3, 3], device='cuda:0')
tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 3, 3, 3], device='cuda:0')
tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 3, 3, 3], device='cuda:0')
tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 3, 3, 3], device='cuda:0')
tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 3, 3, 3], device='cuda:0')
tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 

  0%|          | 0/1403 [00:00<?, ?it/s]

tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 3, 3, 3], device='cuda:0')
tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 3, 3, 3], device='cuda:0')
tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 3, 3, 3], device='cuda:0')
tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 3, 3, 3], device='cuda:0')
tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 3, 3, 3], device='cuda:0')
tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 3, 3, 3], device='cuda:0')
tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 3, 3, 3], device='cuda:0')
tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 3, 3, 3], device='cuda:0')
tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 3, 3, 3], device='cuda:0')


In [None]:
# Predict the per-position class ids for one example sentence.
text = "turkcell müşteri hizmetlerinden istediğim verimi aldım"
model.eval()
with torch.inference_mode():
    encoded = tokenizer(
        [text],
        max_length=16,
        padding="max_length",
        truncation=True,
        return_tensors="pt",
    )
    tokens = encoded["input_ids"]
    # Highest-scoring class along axis 2, flattened to one id per position.
    print(model(tokens).argmax(dim=2).view(-1))

In [None]:
# Evaluate the current model on the test loader and print overall accuracy.
model.eval()
# Start from a clean metric state so earlier calls do not leak into this result.
acc.reset()
with torch.inference_mode():
    print("started testing ...")
    for x,y in tqdm(test_dataloader):
        y = y.to("cuda")
        results = model(x)
        # The class axis is axis 2, as in the training cell and the
        # single-sentence prediction cell.
        preds = results.argmax(dim=2).reshape(-1)
        targets = y.reshape(-1)
        if targets.numel() != preds.numel():
            # One-hot labels: collapse the class axis to class ids so that
            # predictions and targets have one entry per position.
            targets = y.reshape(preds.numel(), -1).argmax(dim=-1)
        acc.update(preds, targets)
    # Accuracy over all test positions rather than a mean of per-batch values.
    print("test -> ", acc.compute().item())

started testing ...


NameError: name 'test_dataloader' is not defined

In [None]:
# Read back the checkpoint dictionary that the training loop writes to ./model.pth.
loadded = torch.load("./model.pth")

In [None]:
# Restore model, optimizer, scheduler and loss state from the loaded checkpoint.
# The objects must already exist; only their state is replaced here.
model.load_state_dict(loadded["model_state_dict"])
optimezer.load_state_dict(loadded['optimezer_state_dict'])
scheduler.load_state_dict(loadded["scheduler_state_dict"])
loss.load_state_dict(loadded["loss"])
# Index of the epoch after which the checkpoint was saved.
start = loadded["epoch"]