# Deep Learning Model

## Preparing Data

In [38]:
from model_2 import Model,DataSet
from transformers import AutoTokenizer
import torch

torch.manual_seed(42)
torch.cuda.manual_seed(42)

from torch.utils.tensorboard import SummaryWriter
from torch.utils.data import DataLoader
from tqdm.auto import tqdm
from torchmetrics import Accuracy,F1Score
from torchmetrics.classification import BinaryAccuracy
import numpy as np
import torch.nn.functional as F
from torchinfo import summary
import pandas as pd
from sklearn.utils import shuffle
from zemberek import TurkishTokenizer
from torchnlp.encoders.text import StaticTokenizerEncoder, stack_and_pad_tensors, pad_tensor
import sklearn


In [2]:
tokenizer = AutoTokenizer.from_pretrained("ai-forever/mGPT")
tokenizer.padding_side = "right"
tokenizer.pad_token = tokenizer.eos_token

In [41]:
import sklearn.utils


df = pd.read_csv("/home/musasina/projects/teknofest/msnet/datasets/train_final.csv")
df_notr = df[df["sentiment"]==1].sample(1664,random_state=42)
df_pos = df[df["sentiment"]==3].sample(1664,random_state=42)
df_neg = df[df["sentiment"]==2].sample(1664,random_state=42)
df = sklearn.utils.shuffle(pd.concat((df_neg,df_notr,df_pos),ignore_index=True),random_state=42)

In [3]:
split = len(df)*80//100
df_train = df.iloc[:split,:]
df_test = df.iloc[split:,:]

##### Train

In [4]:
df_train["text"] = df_train["text"].convert_dtypes(convert_string=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train["text"] = df_train["text"].convert_dtypes(convert_string=True)


In [5]:
def preprocess(examples):
    model_inputs = tokenizer(examples, max_length=128,padding="max_length",truncation=True,return_tensors="pt")
    return np.array(model_inputs["input_ids"])[0]

In [6]:
encoded_text = df_train["text"].map(preprocess).to_list()

In [7]:
import string
def convert_to_list(repr:str):
    results = []
    digits = set(string.digits)
    for letter in repr:
        if letter in digits:
            results.append(int(letter))
    return results

In [8]:
df_train["labels"] = df_train["labels"].map(convert_to_list)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train["labels"] = df_train["labels"].map(convert_to_list)


In [9]:
labels = np.array(df_train["labels"].to_list())

In [10]:
zero_count = (labels==0).sum()
notr_count = (labels==1).sum()
negative_count = (labels==2).sum()
positive_count = (labels==3).sum()

In [11]:
encoded_labels = []
labels = np.array(df_train["labels"].to_list())
for row in labels:
    encoded_labels.append(np.eye(4)[row])
encoded_labels = np.array(encoded_labels)

In [12]:
train_text = torch.Tensor(np.array(encoded_text))
#factorized = pd.factorize(df_train["label"])
train_label = torch.as_tensor(encoded_labels,dtype=torch.long,device="cuda")
train_text = train_text.type(dtype=torch.float32)

##### Test

In [13]:
df_test["text"] = df_test["text"].convert_dtypes(convert_string=True)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test["text"] = df_test["text"].convert_dtypes(convert_string=True)


In [14]:
def preprocess(examples):
    model_inputs = tokenizer(examples, max_length=128,padding="max_length",truncation=True,return_tensors="pt")
    return np.array(model_inputs["input_ids"])[0]

In [15]:
encoded_text = df_test["text"].map(preprocess).to_list()

In [16]:
import string
def convert_to_list(repr:str):
    results = []
    digits = set(string.digits)
    for letter in repr:
        if letter in digits:
            results.append(int(letter))
    return results

In [17]:
df_test["labels"] = df_test["labels"].map(convert_to_list)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test["labels"] = df_test["labels"].map(convert_to_list)


In [18]:
labels = np.array(df_test["labels"].to_list())

In [19]:
encoded_labels = []
labels = np.array(df_test["labels"].to_list())
for row in labels:
    encoded_labels.append(np.eye(4)[row])
encoded_labels = np.array(encoded_labels)

In [20]:
test_text = torch.Tensor(np.array(encoded_text))
#factorized = pd.factorize(df_train["label"])
test_label = torch.as_tensor(encoded_labels,dtype=torch.long,device="cuda")
test_text = test_text.type(dtype=torch.float32)

##### Prepare Training

In [21]:
model = Model().to("cuda")
summary(model)

Layer (type:depth-idx)                                  Param #
Model                                                   --
├─MFP: 1-1                                              --
│    └─LeakyReLU: 2-1                                   --
│    └─ReLU: 2-2                                        --
│    └─Sigmoid: 2-3                                     --
│    └─Tanh: 2-4                                        --
│    └─Mish: 2-5                                        --
│    └─SiLU: 2-6                                        --
│    └─SELU: 2-7                                        --
│    └─ELU: 2-8                                         --
│    └─GELU: 2-9                                        --
│    └─Softplus: 2-10                                   --
│    └─Linear: 2-11                                     2,622,720
│    └─Linear: 2-12                                     2,622,720
│    └─Linear: 2-13                                     2,622,720
│    └─Linear: 2-14           

In [22]:
train_dataset = DataSet(train_text,train_label)
test_dataset = DataSet(test_text,test_label)

In [23]:
train_dataloader = DataLoader(dataset=train_dataset,shuffle=True,batch_size=4)
test_dataloader = DataLoader(dataset=test_dataset,shuffle=False,batch_size=4)

In [24]:
all_count = notr_count + negative_count + positive_count + zero_count

In [25]:
zero_weight = 1.0-((zero_count)/all_count)
notr_weight = 1.0-((notr_count)/all_count)
negative_weight = 1.0-(negative_count/all_count)
positive_weight = 1.0-(positive_count/all_count)

In [26]:
weights = [zero_weight,notr_weight,negative_weight,positive_weight]

In [27]:
print(weights)

[0.8268611432232856, 0.8893780861293329, 0.8249643778900957, 0.4587963927572857]


In [28]:
loss_scale_factor = np.sum(weights)

In [29]:
optimezer = torch.optim.AdamW(model.parameters(),lr=1e-5)
scheduler = torch.optim.lr_scheduler.LinearLR(optimizer=optimezer,end_factor=1e-8,total_iters=10)
loss = torch.nn.CrossEntropyLoss(weight=torch.Tensor(weights)).to("cuda")

In [30]:
acc = Accuracy("multilabel",num_labels=4).to("cuda")
f1 = F1Score("multilabel",num_labels=4).to("cuda")

In [31]:
tensorboard = SummaryWriter("./log",comment="model")

### Spell funcs

In [32]:
import logging

from zemberek import (
    TurkishSentenceNormalizer,
    TurkishSentenceExtractor,
    TurkishMorphology,
    TurkishSpellChecker,
)

logger = logging.getLogger(__name__)

morphology = TurkishMorphology.create_with_defaults()
normalizer = TurkishSentenceNormalizer(morphology)
extractor = TurkishSentenceExtractor()
spell_checker = TurkishSpellChecker(morphology)

def correct_spells(text: str):

    words = text.split(" ")
    for i, word in enumerate(words):
        try:
            words[i] = spell_checker.suggest_for_word(word)[0]
        except BaseException:
            continue
    return " ".join(words)

2024-07-31 16:14:35,698 - zemberek.morphology.turkish_morphology - INFO
Msg: TurkishMorphology instance initialized in 2.578439235687256



In [33]:
def normalize_long_text(paragraph: str) -> str:
    result = []
    for sentence in paragraph:
        try:
            result.append(normalizer.normalize(sentence))
        except BaseException:
            continue
    return " ".join(result)

In [34]:
from nltk.corpus import stopwords
import re
import nltk

nltk.download("stopwords")
stops = set(stopwords.words("turkish"))

def clear_stop_words(sentence):
    return [word for word in sentence.split(" ") if word not in stops]

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/musasina/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [35]:
import zeyrek

analyzer = zeyrek.MorphAnalyzer()

def lemmatize_sent(text):
    result = []
    for word in text:
        try:
            result.append(analyzer.lemmatize(word)[0][1][0])
        except BaseException:
            result.append(word)
    return " ".join(result)

In [36]:
def clear_sentence(text:str):
    text = text.casefold()
    text = correct_spells(text)
    text = normalize_long_text(text)
    text = clear_stop_words(text)
    text = lemmatize_sent(text)
    return text

### Train

In [37]:
past_acc = 0
for epoch in tqdm(range(0,10)):
    
    for i,(x,y) in enumerate(tqdm(train_dataloader)):
        model.train()
        y = y.to("cuda")
        final = model(x)
        losses = loss(final[0],y[0].float())
        for j in range(1,len(y)):
            losses.add_(loss(final[j],y[j].float()))
        losses.div_(len(y))
        model.eval()
        with torch.inference_mode():
            tensorboard.add_scalar(tag="loss",scalar_value=losses.cpu().item(),global_step=(epoch*len(train_dataloader))+i)
        model.train()
        optimezer.zero_grad()
        losses.backward()
        #[loss_back.backward(retain_graph=True) for loss_back in losses]
        #torch.nn.utils.clip_grad_norm_(model.parameters(),1.0,error_if_nonfinite=True)
        optimezer.step()
        if (i+1)%200==0:
            model.eval()
            with torch.inference_mode():
                text = "turkcell müşteri hizmetlerinden istediğim verimi aldım fakat daha iyi olabilirdi memnun kalmadım ama kick ten memnun kaldim bana yardımcı oldular"
                tokens = tokenizer([text], max_length=128,padding="max_length",truncation=True,return_tensors="pt")["input_ids"]
                print(model(tokens).argmax(dim=2).view(-1))
    scheduler.step()

    model.eval()
    with torch.inference_mode():
        results_acc = []
        results_f1 = []
        print("started testing ...")
        for i,(x,y) in enumerate(tqdm(test_dataloader)):
            y = y.to("cuda")
            results = model(x)
            for result,inner_y in zip(results,y):
                results_acc.append(acc(result,inner_y).cpu().item())
                results_f1.append(f1(result,inner_y).cpu().item())
        print("test acc -> ", np.mean(results_acc))
        print("test f1 -> ", np.mean(results_f1))
    if past_acc < np.mean(results_acc):
        torch.save({"model_state_dict":model.state_dict(),
            "optimezer_state_dict": optimezer.state_dict(),
            "scheduler_state_dict": scheduler.state_dict(),
            "loss": loss.state_dict(),
            "epoch":epoch},"./model_2.pth")
        past_acc = np.mean(results_acc)

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/13949 [00:00<?, ?it/s]

tensor([3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
        3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
        3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
        3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
        3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
        3, 3, 3, 3, 3, 3, 3, 3], device='cuda:0')
tensor([3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
        3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
        3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
        3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
        3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
        3, 3, 3, 3, 3, 3, 3, 3], device='cuda:0')
tensor([0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
        3, 3, 3, 3, 

KeyboardInterrupt: 

In [None]:
model.eval()
with torch.inference_mode():
    text = "turkcell müşteri hizmetlerinden istediğim verimi aldım"
    tokens = tokenizer([text], max_length=128,padding="max_length",truncation=True,return_tensors="pt")["input_ids"]
    print(tokens)
    print(model(tokens).argmax(dim=2).view(-1))

tensor([[25300,    81, 18288,   297, 26898,  5330, 75830, 54183,  1105, 12422,
          1831,   361,   809,  3532, 14626,  6417,     5,     5,     5,     5,
             5,     5,     5,     5,     5,     5,     5,     5,     5,     5,
             5,     5,     5,     5,     5,     5,     5,     5,     5,     5,
             5,     5,     5,     5,     5,     5,     5,     5,     5,     5,
             5,     5,     5,     5,     5,     5,     5,     5,     5,     5,
             5,     5,     5,     5,     5,     5,     5,     5,     5,     5,
             5,     5,     5,     5,     5,     5,     5,     5,     5,     5,
             5,     5,     5,     5,     5,     5,     5,     5,     5,     5,
             5,     5,     5,     5,     5,     5,     5,     5,     5,     5,
             5,     5,     5,     5,     5,     5,     5,     5,     5,     5,
             5,     5,     5,     5,     5,     5,     5,     5,     5,     5,
             5,     5,     5,     5,     5,     5,  

In [None]:
model.eval()
with torch.inference_mode():
    results_acc = []
    results_f1 = []
    print("started testing ...")
    for i,(x,y) in enumerate(tqdm(test_dataloader)):
        y = y.to("cuda")
        results = model(x)
        for result,inner_y in zip(results,y):
            results_acc.append(acc(result,inner_y).cpu().item())
            results_f1.append(f1(result,inner_y).cpu().item())
    print("test acc -> ", np.mean(results_acc))
    print("test f1 -> ", np.mean(results_f1))

started testing ...


  0%|          | 0/96 [00:00<?, ?it/s]

test acc ->  0.6237848781433704
test f1 ->  0.23767835128253081


In [None]:
loadded = torch.load("./model.pth")

In [None]:
model.load_state_dict(loadded["model_state_dict"])
optimezer.load_state_dict(loadded['optimezer_state_dict'])
scheduler.load_state_dict(loadded["scheduler_state_dict"])
loss.load_state_dict(loadded["loss"])
start = loadded["epoch"]