In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os 
from collections import Counter

from tqdm import tqdm
import torch
from torch import nn
import torch.optim as optim
from torchtext import data
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader,Dataset
import torchvision.transforms as transforms
import torch.nn.functional as F
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

import spacy

spacy_eng = spacy.load("en_core_web_sm")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


  warn(f"Failed to load image Python extension: {e}")


In [2]:
df_train = pd.read_parquet("E:\\tamil statement analysis\\tamilmixsentiment-train.parquet")
df_train.drop(df_train[df_train['label']>1].index,inplace=True)
df_train

Unnamed: 0,text,label
0,Trailer late ah parthavanga like podunga,0
1,Move pathutu vanthu trailer pakurvnga yaru,0
2,Puthupetai dhanush ah yarellam pathinga,0
3,"Dhanush oda character ,puthu sa erukay , mass ta",0
4,vera level ippa pesungada mokka nu thalaivaaaaaa,0
...,...,...
11328,Shankar Anna and AR sir Sema mass,0
11329,intha movie la yuvan music therikum pola thonu...,0
11330,Yuvan shankar Raja anna fan's like here...,0
11331,A masterpiece best revenge film I’ve ever scene,0


In [87]:
df_test = pd.read_parquet("E:\\tamil statement analysis\\tamilmixsentiment-test.parquet")
df_test.drop(df_test[df_test['label']>1].index,inplace=True)
df_test =df_test.reset_index(drop=True)
df_test

Unnamed: 0,text,label
0,Yarayellam FDFS ppga ippove ready agitinga,0
1,Ennada viswasam mersal sarkar madhri time la l...,0
2,yuvan vera level ya .... valuable script. SK i...,0
3,all the best anna...Telugu makkal selvan fans,0
4,1:17 verithanama iruku nu solravanga like podunga,0
...,...,...
2494,2k likes for 700k Share max,0
2495,Tamil krish ah irukum oh...,0
2496,Thalaivaaaaaa... trailer ye pattaiya kelapudhe...,0
2497,1:05 to 1:30 Vere level masss,0


# Preprocessing

In [4]:
from sklearn.utils import resample
df_majority = df_train[df_train['label']==0]
df_remaining = df_train[df_train['label']!=0]

df_down = resample(df_majority,replace=False,n_samples=2000)
df_train = pd.concat([df_down,df_remaining])
df_train =df_train.reset_index(drop=True)

In [38]:
tokenizer = get_tokenizer('basic_english')
def yield_tokens():
    for i in df_train['text']:
        token = tokenizer(i)
        yield token
text_generator = yield_tokens()

UNK_IDX, PAD_IDX,SOS_IDX, EOS_IDX = 0, 1, 2, 3     
vocab = build_vocab_from_iterator(text_generator, specials=['<unk>', '<pad>', '<sos>', '<eos>'],special_first=True)
vocab.set_default_index(UNK_IDX)
len(vocab.get_stoi())



8879

In [23]:
def data_pipline(df):
    data = []
    for i,text in enumerate(df['text']):
        tokens = tokenizer(text)
        data.append((torch.tensor([vocab[token] for token in tokens],dtype=torch.long),int(df['label'][i])))
        
    return data 
   
train_data = data_pipline(df_train)
test_data = data_pipline(df_test)

def collate_batch(batch):
    text_list,label_list = zip(*batch)
    text_length = [len(text) for text in text_list ]
    padded_text_list = pad_sequence(text_list, padding_value=vocab['<pad>'], batch_first=True).transpose(0, 1)
    return torch.tensor(label_list,dtype=torch.float).to(device),padded_text_list.to(device)


batch_size = 4
train_loader = DataLoader(train_data,batch_size=batch_size,shuffle=True,collate_fn=collate_batch)
test_loader = DataLoader(test_data,shuffle=False,collate_fn=collate_batch)

# Model

In [7]:
class Textclassifier(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim):
        super(Textclassifier, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.rnn = nn.LSTM(embedding_dim, hidden_dim, num_layers=2, 
                           bidirectional=True, dropout=0.5)
        self.fc1 = nn.Linear(hidden_dim*2, 1)
        self.dropout = nn.Dropout(0.5)
        
        
    def forward(self, x):
        embedding = self.dropout(self.embedding(x))
        output, (hidden, cell) = self.rnn(embedding)
        hidden = torch.cat([hidden[-2], hidden[-1]], dim=1)
        hidden = self.dropout(hidden)
        out = self.fc1(hidden)
        
        return out

# Train and Evaluation 

In [8]:
model = Textclassifier(len(vocab),100,259).to(device)
optimizer = optim.Adam(model.parameters(), lr=1e-3)
criteon = nn.BCEWithLogitsLoss().to(device)

def binary_acc(preds, y):
    preds = torch.round(torch.sigmoid(preds))
    correct = torch.eq(preds, y).float()
    acc = correct.sum() / len(correct)
    return acc

def eval(model,loader,crition):
    avg_acc=[]
    model.eval()
    with torch.no_grad(): 
        for batch in loader:
            label,text= batch 
            pred=model(text)
            acc = binary_acc(pred,label).item()
            avg_acc.append(acc)
            
    print("Test_acc:",np.array(avg_acc).mean())

In [None]:
avg_acc=[]
model.train()
for epoch in range(10):
    train_loss = 0.0
    for i,batch in tqdm(enumerate(train_loader),total=len(train_loader)):
        label,text=batch
        pred = model(text).squeeze(1)
        loss = criteon(pred,label)
        acc = binary_acc(pred,label).item()
        avg_acc.append(acc)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        train_loss +=loss.item()*text.size(0)
    print("loss",train_loss/len(train_loader.sampler))
    print('train acc:',np.array(avg_acc).mean())
    eval(model,test_loader,criteon)
    

# Examples

In [84]:
texts = ['"Marana mass karthi broo ninga vera level"',"Haiyooo.. hero roll ku indhe munji sariye waradhu.",'Padu mokkai, ean thalayai kooni kondu nikkuthu thalai',
         "mokka ya tha iruku trailer antha level ku perusa onnum illa","Vadachennai ku aprom oru come back pa"]
for text in texts:
    d =[]
    for i in text.split(' '):
        d.append(vocab[i])
    pre = model(torch.tensor(d).view(-1,1))
    pre = torch.round(torch.sigmoid(pre))
    print(f'{text}, {pre.item()} ')

"Marana mass karthi broo ninga vera level", 0.0 
Haiyooo.. hero roll ku indhe munji sariye waradhu., 1.0 
Padu mokkai, ean thalayai kooni kondu nikkuthu thalai, 1.0 
mokka ya tha iruku trailer antha level ku perusa onnum illa, 1.0 
Vadachennai ku aprom oru come back pa, 0.0 
