# Deep Learning Model

## Preparing Data

In [1]:
from model import Model,DataSet
from transformers import AutoTokenizer
import torch

torch.manual_seed(42)
torch.cuda.manual_seed(42)

from torch.utils.tensorboard import SummaryWriter
from torch.utils.data import DataLoader
from tqdm.auto import tqdm
from torchmetrics import Accuracy
from torchmetrics.classification import BinaryAccuracy
import numpy as np
import torch.nn.functional as F
from torchinfo import summary
import pandas as pd
from sklearn.utils import shuffle
from zemberek import TurkishTokenizer
from torchnlp.encoders.text import StaticTokenizerEncoder, stack_and_pad_tensors, pad_tensor
import sklearn


In [2]:
# Load the pretrained Turkish GPT-2 tokenizer by its Hugging Face model id.
tokenizer = AutoTokenizer.from_pretrained("ytu-ce-cosmos/turkish-gpt2")
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token
# Pad on the left, so the real tokens end up at the tail of each fixed-length sequence.
tokenizer.padding_side = "left"

In [3]:
# Location of the training CSV.
# NOTE(review): this is an absolute path on one machine; the notebook will not run elsewhere unchanged.
TRAIN_CSV_PATH = "/home/musasina/projects/teknofest/msnet/datasets/train_final.csv"

df_train = pd.read_csv(TRAIN_CSV_PATH)
# Let pandas choose its best-fitting nullable dtype for the text column (string conversion enabled).
text_column = df_train["text"].convert_dtypes(convert_string=True)
df_train["text"] = text_column


In [4]:
def preprocess(examples, max_length=16):
    """Tokenize one text into a fixed-length array of token ids.

    Args:
        examples: Text handed straight to the module-level ``tokenizer``.
            Only the first row of the resulting encoding is returned, so this
            is meant to be called with one string at a time.
        max_length: Length every sequence is padded or truncated to.
            Defaults to 16, the value that was previously hard-coded.

    Returns:
        The first row of the encoding's ``input_ids`` as a NumPy array.
    """
    model_inputs = tokenizer(
        examples,
        max_length=max_length,
        padding="max_length",
        truncation=True,
        return_tensors="pt",
    )
    # Only input_ids are kept; any other field of the encoding
    # (e.g. an attention mask) is discarded here.
    return np.array(model_inputs["input_ids"])[0]

In [5]:
# Tokenize every row of the text column, then collect the per-row id arrays in a plain list.
encoded_series = df_train["text"].map(preprocess)
encoded_text = encoded_series.to_list()

In [6]:
import string
def convert_to_list(repr:str):
    """Extract every ASCII digit character of ``repr`` as its own integer.

    Each digit is converted individually, so a multi-digit number such as
    ``"10"`` yields ``[1, 0]``. All non-digit characters (brackets, commas,
    spaces, letters, ...) are ignored.

    Args:
        repr: Text to scan, e.g. the string form of a list of single-digit labels.

    Returns:
        The digits found, as ints, in their order of appearance.
    """
    # string.digits is ASCII-only, so every accepted character is valid for int().
    ascii_digits = frozenset(string.digits)
    return [int(char) for char in repr if char in ascii_digits]

In [7]:
# Parse each stringified label list into a list of ints.
# The column is overwritten in place, so rows that are already lists are passed through
# unchanged: feeding a parsed list back into convert_to_list would silently yield [],
# which made re-running this cell wipe every label.
df_train["labels"] = df_train["labels"].map(
    lambda raw: raw if isinstance(raw, list) else convert_to_list(raw)
)

In [8]:
# Stack the per-row label lists into a single NumPy array.
label_rows = df_train["labels"].to_list()
labels = np.array(label_rows)

In [18]:
# Print how many label entries belong to each of the classes 0, 1 and 2 (one count per line).
for class_id in range(3):
    print((labels == class_id).sum())

2287398
90096
65946


In [19]:
# Total number of label entries: the sum of the three per-class counts (classes 0, 1, 2)
# printed by the previous cell. The numbers are copied by hand from that output.
2287398 + 90096 + 65946

2443440

In [9]:
# One-hot encode all labels in one vectorised call instead of looping over rows in Python.
# F.one_hot appends a trailing axis of size num_classes, so the result has the shape of
# `labels` plus a final dimension of 3 — the same array the per-row loop used to build.
labels_encoded = F.one_hot(torch.from_numpy(labels).long(), num_classes=3).numpy()

In [10]:
# Token ids as a float32 tensor on the CPU.
train_text = torch.as_tensor(np.array(encoded_text)).to(torch.float32)
# One-hot labels are created on the GPU as integers and then cast to float32.
train_label = torch.as_tensor(labels_encoded, dtype=torch.long, device="cuda").to(torch.float32)

In [11]:
# Build the network, then move it to the GPU.
model = Model()
model = model.to("cuda")
# Last expression of the cell: display the layer / parameter-count table.
summary(model)

Layer (type:depth-idx)                                  Param #
Model                                                   --
├─MFP: 1-1                                              --
│    └─LeakyReLU: 2-1                                   --
│    └─ReLU: 2-2                                        --
│    └─Sigmoid: 2-3                                     --
│    └─Tanh: 2-4                                        --
│    └─Mish: 2-5                                        --
│    └─SiLU: 2-6                                        --
│    └─SELU: 2-7                                        --
│    └─ELU: 2-8                                         --
│    └─GELU: 2-9                                        --
│    └─Softplus: 2-10                                   --
│    └─Linear: 2-11                                     16,448
│    └─Linear: 2-12                                     16,448
│    └─Linear: 2-13                                     16,448
│    └─Linear: 2-14                    

In [12]:
# Wrap the token-id tensor and the one-hot label tensor in the project's DataSet class.
train_dataset = DataSet(train_text,train_label)
# No test split is built: the line below is disabled and test_text / test_label are not
# defined in the cells shown here.
#test_dataset = DataSet(test_text,test_label)

In [13]:
# Number of examples per mini-batch.
BATCH_SIZE = 512
# Training batches are reshuffled every time the loader is iterated.
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
# The test loader is disabled, so `test_dataloader` does not exist for later cells.
#test_dataloader = DataLoader(dataset=test_dataset,shuffle=False,batch_size=16)

In [14]:
# NOTE: the misspelled name `optimezer` is kept because the training and checkpoint
# cells refer to it under that name.
optimezer = torch.optim.AdamW(model.parameters(), lr=0.001)

# LinearLR multiplies the base lr by a factor that moves linearly towards end_factor
# over total_iters calls to scheduler.step().
# NOTE(review): start_factor is left at the library default — confirm that this is the
# intended starting learning rate.
scheduler = torch.optim.lr_scheduler.LinearLR(optimizer=optimezer, end_factor=1e-5, total_iters=25)

# Inverse-frequency class weights: total label count divided by the count of each class
# (0, 1, 2), using the counts printed earlier in the notebook.
# CrossEntropyLoss registers `weight` as a buffer, so it has to be a Tensor (a plain
# Python list raises TypeError) and must sit on the same device as the logits.
class_weights = torch.tensor(
    [2443440 / 2287398, 2443440 / 90096, 2443440 / 65946],
    dtype=torch.float32,
    device="cuda",
)
loss = torch.nn.CrossEntropyLoss(weight=class_weights)

In [15]:
# TensorBoard writer; the training loop logs its per-step loss through it into ./log.
tensorboard = SummaryWriter("./log")

In [16]:
# Number of full passes over the training data.
NUM_EPOCHS = 5

for epoch in tqdm(range(NUM_EPOCHS)):
    # Training mode is set once per epoch; nothing inside the loop switches it off.
    model.train()
    steps_per_epoch = len(train_dataloader)

    for i, (x, y) in enumerate(tqdm(train_dataloader)):
        y = y.to("cuda").float()
        final = model(x)
        losses = loss(final.squeeze(), y.squeeze())

        # .item() already returns a detached Python float, so logging needs neither
        # eval()/inference_mode() nor an explicit .cpu() copy.
        # Global step = batches seen in earlier epochs + index within this epoch.
        tensorboard.add_scalar("loss", losses.item(), epoch * steps_per_epoch + i)

        optimezer.zero_grad()
        losses.backward()
        #torch.nn.utils.clip_grad_norm_(model.parameters(),1.0,error_if_nonfinite=True)
        optimezer.step()

    # The learning-rate schedule advances once per epoch, not once per batch.
    scheduler.step()

    # Overwrite the single checkpoint file at the end of every epoch.
    torch.save(
        {
            "model_state_dict": model.state_dict(),
            "optimezer_state_dict": optimezer.state_dict(),
            "scheduler_state_dict": scheduler.state_dict(),
            "loss": loss.state_dict(),
            "epoch": epoch,
        },
        "./model.pth",
    )


  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/299 [00:00<?, ?it/s]

We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


  0%|          | 0/299 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
# Sample sentence used below for a quick manual check of the tokenizer and the model.
text = "turkcell cok iyi firma"

In [None]:
# Encode the sample sentence with the same settings as the training data:
# padded / truncated to 16 ids and returned as a PyTorch tensor.
encoding = tokenizer(
    [text],
    max_length=16,
    padding="max_length",
    truncation=True,
    return_tensors="pt",
)
tokens = encoding["input_ids"]

In [None]:
# Display the encoded ids: one row of 16 values, with the padding id on the left.
tokens

tensor([[    0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0, 13846,  8755,  6420,   810,  4610]])

In [None]:
# Run the model on the single encoded sentence; the returned tensor is the cell output.
# NOTE(review): no eval() / inference_mode() is applied here, so the displayed output
# still carries a grad_fn.
model(tokens)

tensor([[ 1.5642,  1.3707,  1.1211,  0.5545,  0.1200,  0.0109,  0.0038, -0.0043,
         -0.0109, -0.0064, -0.0129, -0.0088, -0.0072,  0.0070,  0.0022, -0.0077]],
       device='cuda:0', grad_fn=<ViewBackward0>)

In [None]:
# Evaluate the model batch by batch and report the mean of the per-batch accuracies.
# NOTE(review): this cell currently fails with NameError — `test_dataloader` is only
# present as a commented-out line earlier in the notebook, and `acc` is not defined in
# any cell shown here. Both must be created before this cell can run.
model.eval()
with torch.inference_mode():
    results_acc = []
    print("started testing ...")
    for i,(x,y) in enumerate(tqdm(test_dataloader)):
        y = y.to("cuda")
        results = model(x)
        # Predicted class = index of the largest value along dim 1; `acc` is expected to
        # return a tensor whose scalar value is this batch's accuracy.
        results_acc.append(acc(results.argmax(dim=1),y).cpu().item())
    # Unweighted mean over batches, so a smaller final batch counts as much as a full one.
    print("test -> ", np.mean(results_acc))

started testing ...


NameError: name 'test_dataloader' is not defined

In [None]:
# Read back the checkpoint dict that the training cell writes to ./model.pth.
# NOTE(review): torch.load unpickles the file — only load checkpoints from a trusted source.
loadded = torch.load("./model.pth")

In [None]:
# Restore every stateful training component from its entry in the checkpoint dict.
for component, checkpoint_key in (
    (model, "model_state_dict"),
    (optimezer, "optimezer_state_dict"),
    (scheduler, "scheduler_state_dict"),
    (loss, "loss"),
):
    component.load_state_dict(loadded[checkpoint_key])

# Index of the last epoch that had finished when the checkpoint was written.
start = loadded["epoch"]