# Deep Learning Model

## Preparing Data

In [1]:
from model_2 import Model,DataSet
from transformers import AutoTokenizer
import torch

torch.manual_seed(42)
torch.cuda.manual_seed(42)

from torch.utils.tensorboard import SummaryWriter
from torch.utils.data import DataLoader
from tqdm.auto import tqdm
from torchmetrics import Accuracy,F1Score
from torchmetrics.classification import BinaryAccuracy
import numpy as np
import torch.nn.functional as F
from torchinfo import summary
import pandas as pd
from sklearn.utils import shuffle
from zemberek import TurkishTokenizer
from torchnlp.encoders.text import StaticTokenizerEncoder, stack_and_pad_tensors, pad_tensor
import sklearn


In [2]:
# Load the pretrained Turkish GPT-2 tokenizer, reuse its end-of-sequence
# token as the padding token, and pad on the right-hand side.
tokenizer = AutoTokenizer.from_pretrained("ytu-ce-cosmos/turkish-gpt2")
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

0

In [3]:
import sklearn.utils


# Read the full training table, then build a class-balanced, shuffled subset.
df = pd.read_csv("/home/musasina/projects/teknofest/msnet/datasets/train_final.csv")


def _sample_sentiment(frame, sentiment_id):
    """Draw 137555 rows whose `sentiment` equals `sentiment_id` (seed 42)."""
    matching = frame[frame["sentiment"] == sentiment_id]
    return matching.sample(137555, random_state=42)


# Sentiment ids as used by the variable names: 1 = neutral, 3 = positive, 2 = negative.
df_notr = _sample_sentiment(df, 1)
df_pos = _sample_sentiment(df, 3)
df_neg = _sample_sentiment(df, 2)

# Concatenate the three samples with a fresh index and shuffle the rows.
df = sklearn.utils.shuffle(
    pd.concat((df_neg, df_notr, df_pos), ignore_index=True),
    random_state=42,
)

In [4]:
# Sanity check: print the row count of each sampled class
# (neutral, positive, negative — in that order).
for class_frame in (df_notr, df_pos, df_neg):
    print(len(class_frame))

137555
137555
137555


In [5]:
import sklearn.model_selection
import sklearn.preprocessing


# Hold out 10% of the shuffled rows as the test split (seeded for repeatability).
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
    df["text"],
    df["labels"],
    test_size=0.1,
    shuffle=True,
    random_state=42,
)

##### Train

In [6]:
# Let pandas pick the best supported dtype for the training texts
# (conversion to the string dtype is explicitly enabled).
X_train = X_train.convert_dtypes(convert_string=True)

In [7]:
def preprocess(examples):
    """Tokenize `examples` and return the first row of token ids.

    The text is padded or truncated to exactly 32 tokens using the
    module-level `tokenizer`.

    Returns:
        A NumPy array holding the first row of `input_ids`
        (presumably shape (32,) for a single input string — confirm).
    """
    encoded = tokenizer(
        examples,
        max_length=32,
        padding="max_length",
        truncation=True,
        return_tensors="pt",
    )
    input_ids = np.array(encoded["input_ids"])
    return input_ids[0]

In [8]:
# Tokenize every training text; the result is a Python list with one
# token-id array per row of X_train.
encoded_text = X_train.map(preprocess).to_list()

In [9]:
import string
def convert_to_list(repr:str):
    """Extract every ASCII digit character of `repr` as a separate int.

    Non-digit characters (brackets, commas, spaces, ...) are skipped, so
    "[1, 0, 2]" becomes [1, 0, 2]. Multi-digit numbers are split into their
    individual digits.
    """
    ascii_digits = frozenset(string.digits)
    return [int(char) for char in repr if char in ascii_digits]

In [10]:
# Parse each label string into a list of digit labels.
# Values that are already lists are passed through unchanged so the cell is
# safe to re-run: convert_to_list applied to a list of ints matches no digit
# characters and would silently replace every label row with [].
y_train = y_train.map(
    lambda value: value if isinstance(value, list) else convert_to_list(value)
)

In [11]:
# Stack the parsed label lists into one NumPy array (one row per training example).
labels = np.array(y_train.to_list())

In [12]:
# Count how many entries of `labels` carry each of the four label ids
# (0 = zero, 1 = neutral, 2 = negative, 3 = positive).
zero_count, notr_count, negative_count, positive_count = (
    (labels == label_id).sum() for label_id in range(4)
)

In [13]:
# One-hot encode the training labels over 4 classes.
# Indexing a single 4x4 identity matrix with the whole integer label array
# yields the same values as looking up each row separately, without the
# Python loop and without rebuilding the identity matrix on every iteration.
labels = np.array(y_train.to_list())
encoded_labels = np.eye(4)[labels]

In [14]:
# One-hot training labels become an int64 tensor placed on the GPU.
train_label = torch.as_tensor(encoded_labels, dtype=torch.long, device="cuda")
# Token ids are stacked into one tensor and stored as float32; no device is requested here.
train_text = torch.Tensor(np.array(encoded_text)).type(dtype=torch.float32)

##### Test

In [15]:
# Same dtype conversion as for the training texts, applied to the held-out texts.
X_test = X_test.convert_dtypes(convert_string=True)



In [16]:
def preprocess(examples):
    """Return the first row of token ids for `examples`, fixed at 32 tokens.

    Uses the module-level `tokenizer` with max-length padding and truncation.
    """
    tokenizer_options = {
        "max_length": 32,
        "padding": "max_length",
        "truncation": True,
        "return_tensors": "pt",
    }
    model_inputs = tokenizer(examples, **tokenizer_options)
    id_matrix = np.array(model_inputs["input_ids"])
    return id_matrix[0]

In [17]:
# Tokenize every test text. This rebinds `encoded_text`, which previously
# held the training encodings.
encoded_text = X_test.map(preprocess).to_list()

In [18]:
import string
def convert_to_list(repr:str):
    """Return the ASCII digits found in `repr`, each converted to an int.

    Every other character is ignored; digits of multi-digit numbers are
    returned one by one.
    """
    digit_chars = set(string.digits)
    return list(map(int, filter(digit_chars.__contains__, repr)))

In [19]:
# Parse each test label string into a list of digit labels.
# Already-parsed lists are kept as they are, so re-running the cell does not
# wipe the labels (convert_to_list on a list of ints returns []).
y_test = y_test.map(
    lambda value: value if isinstance(value, list) else convert_to_list(value)
)

In [20]:
# Stack the parsed test label lists into one NumPy array; this rebinds `labels`.
labels = np.array(y_test.to_list())

In [21]:
# One-hot encode the test labels over 4 classes with a single vectorized
# identity-matrix lookup, replacing the per-row Python loop that rebuilt
# np.eye(4) on every iteration.
labels = np.array(y_test.to_list())
encoded_labels = np.eye(4)[labels]

In [22]:
# One-hot test labels become an int64 tensor placed on the GPU.
test_label = torch.as_tensor(encoded_labels, dtype=torch.long, device="cuda")
# Test token ids are stacked into one tensor and stored as float32; no device is requested here.
test_text = torch.Tensor(np.array(encoded_text)).type(dtype=torch.float32)

##### Prepare Training

In [23]:
# Instantiate the project model (imported from model_2) and move it to the GPU.
model = Model().to("cuda")
# Display the per-layer parameter summary produced by torchinfo.
summary(model)

If you want to use `BertLMHeadModel` as a standalone, add `is_decoder=True.`


Layer (type:depth-idx)                                            Param #
Model                                                             --
├─MFP: 1-1                                                        --
│    └─LeakyReLU: 2-1                                             --
│    └─ReLU: 2-2                                                  --
│    └─Sigmoid: 2-3                                               --
│    └─Tanh: 2-4                                                  --
│    └─Mish: 2-5                                                  --
│    └─SiLU: 2-6                                                  --
│    └─SELU: 2-7                                                  --
│    └─ELU: 2-8                                                   --
│    └─GELU: 2-9                                                  --
│    └─Softplus: 2-10                                             --
│    └─Linear: 2-11                                               393,728
│    └─Linear: 2-12     

In [24]:
# Pair the text and label tensors in the project's DataSet class (imported from model_2).
train_dataset = DataSet(train_text,train_label)
test_dataset = DataSet(test_text,test_label)

In [25]:
# Batches of 16 examples; only the training loader reshuffles, the test order stays fixed.
train_dataloader = DataLoader(dataset=train_dataset,shuffle=True,batch_size=16)
test_dataloader = DataLoader(dataset=test_dataset,shuffle=False,batch_size=16)

In [26]:
# Total number of counted label entries across all four label ids.
all_count = notr_count + negative_count + positive_count + zero_count

In [27]:
# Each class weight is 1 minus that class's share of all counted labels,
# so more frequent classes receive smaller weights.
zero_weight, notr_weight, negative_weight, positive_weight = (
    1.0 - (class_count / all_count)
    for class_count in (zero_count, notr_count, negative_count, positive_count)
)

In [28]:
# Position i holds the weight of label id i (0 = zero, 1 = neutral, 2 = negative, 3 = positive).
weights = [zero_weight,notr_weight,negative_weight,positive_weight]

In [29]:
# Show the four class weights for a quick visual check.
print(weights)

[0.8343169759934087, 0.7590275459210873, 0.7116483698081304, 0.6950071082773737]


In [30]:
# Constant offset, (sum of class weights - 1) / 4, that the training loop
# subtracts from the loss value before logging it to TensorBoard.
# NOTE(review): the rationale for this formula is not evident from the notebook — confirm.
loss_scale_factor = (np.sum(weights)-1)/4

In [31]:
# AdamW over all model parameters with a base learning rate of 1e-5.
optimezer = torch.optim.AdamW(model.parameters(),lr=1e-5)
# Linear learning-rate schedule over 25 scheduler steps, ending at a factor of 1e-8;
# the training loop steps it once per epoch.
# NOTE(review): start_factor is left at the library default, which per the PyTorch
# documentation is 1/3 — so training would begin below the base rate; confirm this is intended.
scheduler = torch.optim.lr_scheduler.LinearLR(optimizer=optimezer,end_factor=1e-8,total_iters=25)
# Class-weighted cross-entropy loss, with its weight tensor moved to the GPU.
loss = torch.nn.CrossEntropyLoss(weight=torch.Tensor(weights)).to("cuda")

In [32]:
# torchmetrics accuracy and F1, both configured for the "multilabel" task
# with 4 labels and kept on the GPU.
acc = Accuracy("multilabel",num_labels=4).to("cuda")
f1 = F1Score("multilabel",num_labels=4).to("cuda")

In [33]:
# TensorBoard writer that logs into ./log.
# NOTE(review): per the PyTorch documentation `comment` is only used to build the
# default log directory name, so it presumably has no effect when a log_dir is passed — confirm.
tensorboard = SummaryWriter("./log",comment="model")

### Spelling and text-cleaning helpers

In [34]:
import logging

from zemberek import (
    TurkishSentenceNormalizer,
    TurkishSentenceExtractor,
    TurkishMorphology,
    TurkishSpellChecker,
)

# Module-level logger for this notebook.
logger = logging.getLogger(__name__)

# Shared zemberek components. The morphology instance is built once and
# reused by both the sentence normalizer and the spell checker.
morphology = TurkishMorphology.create_with_defaults()
normalizer = TurkishSentenceNormalizer(morphology)
# Sentence extractor from zemberek.
extractor = TurkishSentenceExtractor()
spell_checker = TurkishSpellChecker(morphology)

def correct_spells(text: str):
    """Replace each word of `text` with the spell checker's first suggestion.

    Args:
        text: Input text; it is split on single space characters.

    Returns:
        The words re-joined with single spaces. A word is kept unchanged when
        the spell checker raises or returns no suggestion for it.
    """
    words = text.split(" ")
    for i, word in enumerate(words):
        try:
            words[i] = spell_checker.suggest_for_word(word)[0]
        # Catch Exception, not BaseException: KeyboardInterrupt and SystemExit
        # must still be able to stop a long-running cell.
        except Exception:
            continue
    return " ".join(words)

2024-08-01 20:46:10,426 - zemberek.morphology.turkish_morphology - INFO
Msg: TurkishMorphology instance initialized in 2.2803738117218018



In [35]:
def normalize_long_text(paragraph: str) -> str:
    """Normalize a paragraph sentence by sentence and join the results.

    Args:
        paragraph: The text to normalize. A string is first split into
            sentences with the module-level `extractor`; iterating the string
            directly would feed the normalizer one character at a time. Any
            other iterable is treated as an already-split sequence of sentences.

    Returns:
        The normalized sentences joined with single spaces. Sentences whose
        normalization raises are left out of the result.
    """
    if isinstance(paragraph, str):
        sentences = extractor.from_paragraph(paragraph)
    else:
        sentences = paragraph

    result = []
    for sentence in sentences:
        try:
            result.append(normalizer.normalize(sentence))
        # Exception rather than BaseException so the cell stays interruptible.
        except Exception:
            continue
    return " ".join(result)

In [36]:
from nltk.corpus import stopwords
import re
import nltk

# Make sure the NLTK stopword corpus is available locally.
nltk.download("stopwords")
# Turkish stopwords stored as a set for membership tests in clear_stop_words.
stops = set(stopwords.words("turkish"))

def clear_stop_words(sentence):
    """Split `sentence` on single spaces and drop every token found in `stops`.

    Returns:
        A list of the remaining tokens, in their original order.
    """
    kept_words = []
    for token in sentence.split(" "):
        if token in stops:
            continue
        kept_words.append(token)
    return kept_words

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/musasina/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [37]:
import zeyrek

# Shared zeyrek morphological analyzer used by lemmatize_sent.
analyzer = zeyrek.MorphAnalyzer()

def lemmatize_sent(text):
    """Lemmatize every word and return them joined with single spaces.

    Args:
        text: An iterable of words (as produced by clear_stop_words). A plain
            string is also accepted and split on single spaces first, instead
            of being lemmatized one character at a time.

    Returns:
        A string of lemmas separated by single spaces. A word whose
        lemmatization raises is kept as it is.
    """
    words = text.split(" ") if isinstance(text, str) else text

    result = []
    for word in words:
        try:
            # The analyzer returns (word, [lemmas]) pairs; take the first lemma.
            result.append(analyzer.lemmatize(word)[0][1][0])
        # Exception rather than BaseException so the cell stays interruptible.
        except Exception:
            result.append(word)
    return " ".join(result)

In [38]:
def clear_sentence(text:str):
    """Run the full cleaning pipeline on one piece of text.

    Steps, in order: case-fold, spell correction, sentence normalization,
    stopword removal, lemmatization. Returns whatever lemmatize_sent returns.
    """
    corrected = correct_spells(text.casefold())
    normalized = normalize_long_text(corrected)
    content_words = clear_stop_words(normalized)
    return lemmatize_sent(content_words)

### Train

In [39]:
# Training driver: 25 epochs of per-batch optimization, followed each epoch by
# a pass over the test loader and a checkpoint whenever mean test accuracy improves.
# Best mean test accuracy seen so far; decides whether a checkpoint is written.
past_acc = 0
for epoch in tqdm(range(0,25)):
    
    for i,(x,y) in enumerate(tqdm(train_dataloader)):
        model.train()
        y = y.to("cuda")
        final = model(x)
        # Compute the loss one sample at a time, accumulate in place, then
        # divide by the batch size to obtain the batch average.
        losses = loss(final[0],y[0].float())
        for j in range(1,len(y)):
            losses.add_(loss(final[j],y[j].float()))
        losses.div_(len(y))
        model.eval()
        with torch.inference_mode():
            # Log the batch loss minus the constant loss_scale_factor offset;
            # the global step counts batches across epochs.
            tensorboard.add_scalar(tag="loss",scalar_value=losses.cpu().item()-loss_scale_factor,global_step=(epoch*len(train_dataloader))+i)
        model.train()
        optimezer.zero_grad()
        losses.backward()
        #[loss_back.backward(retain_graph=True) for loss_back in losses]
        #torch.nn.utils.clip_grad_norm_(model.parameters(),1.0,error_if_nonfinite=True)
        optimezer.step()
        # Every 200 batches, print the argmax class ids the model predicts for
        # one fixed sample sentence (presumably one id per token position — confirm).
        if (i+1)%200==0:
            model.eval()
            with torch.inference_mode():
                text = "turkcell müşteri hizmetlerinden istediğim verimi aldım fakat daha iyi olabilirdi memnun kalmadım ama kick ten memnun kaldim bana yardımcı oldular"
                tokens = tokenizer([text], max_length=32,padding="max_length",truncation=True,return_tensors="pt")["input_ids"]
                print(model(tokens).argmax(dim=2).view(-1))
    # The learning-rate schedule advances once per epoch.
    scheduler.step()

    # Per-epoch evaluation on the test loader.
    model.eval()
    with torch.inference_mode():
        results_acc = []
        results_f1 = []
        print("started testing ...")
        for i,(x,y) in enumerate(tqdm(test_dataloader)):
            y = y.to("cuda")
            results = model(x)
            for result,inner_y in zip(results,y):
                # Keep only the positions whose one-hot label is not class 0.
                index = (inner_y.argmax(dim=1)!=0).squeeze()
                results_acc.append(acc(result[index],inner_y[index]).cpu().item())
                results_f1.append(f1(result[index],inner_y[index]).cpu().item())
        print("test acc -> ", np.mean(results_acc))
        print("test f1 -> ", np.mean(results_f1))
    # Save model, optimizer, scheduler and loss state plus the epoch number
    # whenever the mean per-sample test accuracy beats the best value so far.
    if past_acc < np.mean(results_acc):
        torch.save({"model_state_dict":model.state_dict(),
            "optimezer_state_dict": optimezer.state_dict(),
            "scheduler_state_dict": scheduler.state_dict(),
            "loss": loss.state_dict(),
            "epoch":epoch},"./model_2.pth")
        past_acc = np.mean(results_acc)

  0%|          | 0/25 [00:00<?, ?it/s]

  0%|          | 0/23213 [00:00<?, ?it/s]

We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


tensor([3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
        3, 3, 3, 3, 3, 3, 3, 3], device='cuda:0')
tensor([3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
        3, 3, 1, 1, 1, 1, 1, 1], device='cuda:0')
tensor([3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
        3, 3, 1, 1, 1, 1, 1, 1], device='cuda:0')
tensor([3, 3, 3, 3, 2, 3, 3, 3, 3, 3, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
        2, 3, 1, 1, 1, 1, 1, 1], device='cuda:0')
tensor([3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 1,
        3, 3, 1, 1, 1, 1, 1, 1], device='cuda:0')
tensor([3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 1, 1, 1, 1,
        1, 3, 1, 1, 1, 1, 1, 1], device='cuda:0')
tensor([3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 1, 1, 1, 1,
        1, 3, 1, 1, 1, 1, 1, 1], device='cuda:0')
tensor([3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 1, 1,
        1,

KeyboardInterrupt: 

In [None]:
# Stand-alone evaluation on the test loader; positions labelled class 0 are
# excluded before the metrics are computed.
model.eval()
results_acc, results_f1 = [], []
with torch.inference_mode():
    print("started testing ...")
    for i, (x, y) in enumerate(tqdm(test_dataloader)):
        y = y.to("cuda")
        results = model(x)
        for result, inner_y in zip(results, y):
            # Boolean mask of positions whose one-hot label is not class 0.
            index = (inner_y.argmax(dim=1) != 0).squeeze()
            kept_pred, kept_true = result[index], inner_y[index]
            sample_acc = acc(kept_pred, kept_true)
            sample_f1 = f1(kept_pred, kept_true)
            results_acc.append(sample_acc.cpu().item())
            results_f1.append(sample_f1.cpu().item())
    print("test acc -> ", np.mean(results_acc))
    print("test f1 -> ", np.mean(results_f1))

In [None]:
# Stand-alone evaluation on the test loader with no masking: every position
# of every sample contributes to the metrics.
model.eval()
results_acc, results_f1 = [], []
with torch.inference_mode():
    print("started testing ...")
    for i, (x, y) in enumerate(tqdm(test_dataloader)):
        y = y.to("cuda")
        results = model(x)
        for result, inner_y in zip(results, y):
            sample_acc = acc(result, inner_y)
            sample_f1 = f1(result, inner_y)
            results_acc.append(sample_acc.cpu().item())
            results_f1.append(sample_f1.cpu().item())
    print("test acc -> ", np.mean(results_acc))
    print("test f1 -> ", np.mean(results_f1))

In [None]:
# Load a previously saved checkpoint dictionary from disk.
# NOTE(review): the training loop above writes its checkpoint to "./model_2.pth",
# while this cell reads "./model.pth" — confirm which file is intended.
loadded = torch.load("./model.pth")

In [None]:
# Restore model, optimizer, scheduler and loss state from the loaded checkpoint,
# using the same dictionary keys the training loop saves under.
model.load_state_dict(loadded["model_state_dict"])
optimezer.load_state_dict(loadded['optimezer_state_dict'])
scheduler.load_state_dict(loadded["scheduler_state_dict"])
loss.load_state_dict(loadded["loss"])
# Epoch stored in the checkpoint.
# NOTE(review): the training loop above always starts at epoch 0 and does not read `start` — confirm how resuming is meant to work.
start = loadded["epoch"]