In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os 
from collections import Counter

from tqdm import tqdm
import torch
from torch import nn
import torch.optim as optim
from torchtext import data
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader,Dataset
import torchvision.transforms as transforms
import torch.nn.functional as F
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

import spacy

spacy_eng = spacy.load("en_core_web_sm")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


  warn(f"Failed to load image Python extension: {e}")


In [34]:
df_train = pd.read_parquet("E:\\tamil statement analysis\\tamilmixsentiment-train.parquet")
df_train

Unnamed: 0,text,label
0,Trailer late ah parthavanga like podunga,0
1,Move pathutu vanthu trailer pakurvnga yaru,0
2,Puthupetai dhanush ah yarellam pathinga,0
3,"Dhanush oda character ,puthu sa erukay , mass ta",0
4,vera level ippa pesungada mokka nu thalaivaaaaaa,0
...,...,...
11330,Yuvan shankar Raja anna fan's like here...,0
11331,A masterpiece best revenge film I’ve ever scene,0
11332,Enna pa thala ya kamiya than katringa,0
11333,R A A S H I K H A N N A,3


In [3]:
df_test = pd.read_parquet("E:\\tamil statement analysis\\tamilmixsentiment-validation.parquet")
df_test.iloc[:10]

Unnamed: 0,text,label
0,Daily likes & views pakka vanthavaga ellarukum...,0
1,25 k dislikes ethuku da intha trailerku poi ap...,1
2,#Lyca unna nenacha pavama iruku ya,2
3,It looks like Hindi movie amitab bachan,0
4,Thalaivarukku nejamavey vayasaagiduchu... siva...,0
5,THALA nu Sollu THALA nemirinthu nillu,0
6,Pink original version pathavikaluku this trail...,0
7,Vera levellllllllllllll all the best shankar s...,1
8,H Raja reference kandupudichavangalam like pan...,0
9,Massssss thalivaa Vera level energy petta parak,0


In [100]:
tokenizer = get_tokenizer('basic_english')
def yield_tokens():
    for i in df_train['text']:
        token = tokenizer(i)
        yield token
text_generator = yield_tokens()

UNK_IDX, PAD_IDX,SOS_IDX, EOS_IDX = 0, 1, 2, 3     
vocab = build_vocab_from_iterator(text_generator, specials=['<unk>', '<pad>', '<sos>', '<eos>'],special_first=True)
vocab.set_default_index(UNK_IDX)
len(vocab.get_stoi())



19884

In [101]:
def data_pipline(df):
    data = []
    for i,text in enumerate(df['text']):
        tokens = tokenizer(text)
        data.append((torch.tensor([vocab[token] for token in tokens],dtype=torch.long),int(df['label'][i])))
        
    return data 
   
train_data = data_pipline(df_train)

def collate_batch(batch):
    text_list,label_list = zip(*batch)
    text_length = [len(text) for text in text_list ]
    padded_text_list = pad_sequence(text_list, padding_value=vocab['<pad>'], batch_first=True).transpose(0, 1)
    return torch.tensor(label_list,dtype=torch.float).to(device),padded_text_list.to(device), torch.tensor(text_length, dtype=torch.long).to(device)

batch_size = 32
train_loader = DataLoader(train_data,batch_size=batch_size,shuffle=True,collate_fn=collate_batch)

In [95]:
class Textclassifier(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim):
        super(Textclassifier, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.rnn = nn.LSTM(embedding_dim, hidden_dim, num_layers=2, 
                           bidirectional=True, dropout=0.5)
        self.fc = nn.Linear(hidden_dim*2, 1)
        self.dropout = nn.Dropout(0.5)
        
    def forward(self, x):
        embedding = self.dropout(self.embedding(x))
        output, (hidden, cell) = self.rnn(embedding)
        hidden = torch.cat([hidden[-2], hidden[-1]], dim=1)
        hidden = self.dropout(hidden)
        out = self.fc(hidden)
        return out

In [96]:
model = Textclassifier(len(vocab),100,259).to(device)
optimizer = optim.Adam(model.parameters(), lr=1e-3)
criteon = nn.BCEWithLogitsLoss().to(device)

def binary_acc(preds, y):
    preds = torch.round(torch.sigmoid(preds))
    correct = torch.eq(preds, y).float()
    acc = correct.sum() / len(correct)
    return acc

In [103]:
avg_acc=[]
model.train()
for epoch in range(10):
    for i,batch in tqdm(enumerate(train_loader),total=len(train_loader)):
        label,text,text_length=batch
        pred = model(text).squeeze(1)
        loss = criteon(pred,label)
        acc = binary_acc(pred,label).item()
        avg_acc.append(acc)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    print("loss",loss.item())
    print('acc',np.array(avg_acc).mean())
    

100%|██████████| 355/355 [00:56<00:00,  6.31it/s]


loss 0.8180947303771973
acc 0.34275653924740535


100%|██████████| 355/355 [00:55<00:00,  6.34it/s]


loss 0.8478361368179321
acc 0.3626571931889359


100%|██████████| 355/355 [01:06<00:00,  5.34it/s]


loss 4.021424770355225
acc 0.3759138162567022


100%|██████████| 355/355 [01:03<00:00,  5.55it/s]


loss 0.6812887191772461
acc 0.3830985915807771


 10%|█         | 37/355 [00:07<01:09,  4.60it/s]