In [1]:
import sys
import os
from libnlp import preprocessing
import pandas as pd
import torchtext.data as ttd
import torch

if '../' not in sys.path:
    sys.path.append('../')

class Env:
    def __init__(self,tokenizer_model='bert-base-multilingual-uncased',max_len=120,seed=42):
        
        #Defining the paths
        self.data_path='../Data'
        self.train_path='Train.csv'
        self.test_path='Test.csv'
        self.ppd_path='../ppd'
        
        #Definining the tokenizer
        self.tokenizer=torch.hub.load('huggingface/pytorch-transformers', 'tokenizer',tokenizer_model )
        self.tokenizer_max_len=max_len
        self.cls_token=self.tokenizer.cls_token_id
        self.sep_token=self.tokenizer.sep_token_id
        self.pad_token=self.tokenizer.pad_token_id
        self.unk_token=self.tokenizer.unk_token_id
        
        #Setting the Seed Value
        if torch.cuda.is_available():
            self.device='cuda'
        else:
            self.device='cpu'
        torch.manual_seed(seed)
        os.environ['PYTHONHASHSEED']=str(seed)
        torch.cuda.manual_seed_all(seed)
        torch.backends.cudnn.deterministic=True
        torch.backends.cudnn.benchmark=False
E=Env()

Using cache found in C:\Users\Jamiu Afolabi/.cache\torch\hub\huggingface_pytorch-transformers_master


# Preprocessing the Text Files

In [2]:
def check_dir(path):
    if not os.path.exists(path):
        os.mkdir(path)
        
def preprocessed_df(data_path,csv_p,ppd_p,train=True):
    p_csv=os.path.join(data_path,csv_p)
    df=pd.read_csv(p_csv,encoding='utf-8')
    df['text']=df.text.apply(lambda x: preprocessing.preprocess(x))
    if train:
        ls1=set(df.label)
        # {0: 0, 1: 1, -1: 2}
        df['label']=df.label.map(dict([(key,val) for val,key in enumerate(ls1)]))
    df.drop(['ID'],axis=1,inplace=True)
    f_name=csv_p.split('.')[0]
    check_dir(ppd_p)
    df.to_csv(f'{ppd_p}/{f_name}.csv',index=False)
    
preprocessed_df(E.data_path,E.train_path,E.ppd_path)
preprocessed_df(E.data_path,E.test_path,E.ppd_path,train=False)

# Preparing the Dataset for Torch

In [3]:
E.tokenizer_max_len

120

In [4]:
def tokenize_max(tweet):
    
    if len(tweet)>E.tokenizer_max_len-2:
        #print('greater')
        tweet=tweet[:E.tokenizer_max_len-2]
        #print('in token')
    tokens=E.tokenizer.tokenize(tweet,add_special_tokens=False,padding=True)
    #print(tokens)
    return tokens

class DataPrep:
    def __init__(self):
    #creating the field object
        self.TEXT=ttd.Field(batch_first=True,
                            use_vocab=False,
                            pad_first=True,
                            tokenize=tokenize_max,
                            preprocessing=E.tokenizer.convert_tokens_to_ids,
                            init_token=E.cls_token,
                            eos_token=E.sep_token,
                            pad_token=E.pad_token,
                            unk_token=E.unk_token,
                            )
        
        self.LABEL=ttd.Field(sequential=False,
                             is_target=True,
                             use_vocab=False) 
        self.dataset=ttd.TabularDataset(path='../ppd/Train.csv',
                                        format='csv',
                                        skip_header=True,
                                        fields=[('text',self.TEXT),('label',self.LABEL)])
        
        self.train_data,self.test_data=self.dataset.split()
        self.train_iter,self.test_iter=ttd.Iterator.splits((self.train_data,self.test_data),
                                                           sort_key=lambda x: len(x.text),
                                                           batch_sizes=(32,32),
                                                           device=E.device
                                                          )


In [5]:
D=DataPrep()

In [6]:
i=0
for inputs,target in D.test_iter:
    i+=1
    print(inputs.shape)
    print(target.shape)
    if i==20:
        break

torch.Size([32, 4])
torch.Size([32])
torch.Size([32, 4])
torch.Size([32])
torch.Size([32, 4])
torch.Size([32])
torch.Size([32, 4])
torch.Size([32])
torch.Size([32, 4])
torch.Size([32])
torch.Size([32, 4])
torch.Size([32])
torch.Size([32, 5])
torch.Size([32])
torch.Size([32, 5])
torch.Size([32])
torch.Size([32, 5])
torch.Size([32])
torch.Size([32, 5])
torch.Size([32])
torch.Size([32, 5])
torch.Size([32])
torch.Size([32, 5])
torch.Size([32])
torch.Size([32, 5])
torch.Size([32])
torch.Size([32, 5])
torch.Size([32])
torch.Size([32, 5])
torch.Size([32])
torch.Size([32, 5])
torch.Size([32])
torch.Size([32, 5])
torch.Size([32])
torch.Size([32, 5])
torch.Size([32])
torch.Size([32, 5])
torch.Size([32])
torch.Size([32, 5])
torch.Size([32])


# Building the Model

In [7]:
import torch.nn as nn
import torch.nn.functional as F
class SimpleRNN(nn.Module):
    def __init__(self,n_vocabs,n_embedding,n_hidden,n_layer,n_output):
        super(SimpleRNN,self).__init__()
        self.V=n_vocabs
        self.D=n_embedding
        self.M=n_hidden
        self.L=n_layer
        self.K=n_output
        
        self.embed=nn.Embedding(self.V,self.D)
        self.rnn=nn.LSTM(input_size=self.D,
                         hidden_size=self.M,
                         num_layers=self.L,
                         batch_first=True,
                         bidirectional=False
                        )
        self.fc=nn.Linear(self.M,1024)
        self.fc2=nn.Linear(1024,128)
        self.fc3=nn.Linear(128,self.K)
        
        
    def forward(self,X):
        bs=X.size(0)
        #Defining parameters for rnn instantiation
        h0=torch.zeros(self.L,X.size(0),self.M).to(E.device)
        c0=torch.zeros(self.L,X.size(0),self.M).to(E.device)
        
        out=self.embed(X)
        out,_=self.rnn(out,(h0,c0))
        out,_=torch.max(out,1)
        out=out.view(bs,-1)
        out=self.fc(out)
        out=F.relu(out)
        out=self.fc2(out)
        out=F.relu(out)
        out=self.fc3
        out
        return out
        

In [8]:
params={'n_vocabs':E.tokenizer.vocab_size,
        'n_embedding':64,
        'n_hidden':50,
        'n_layer':2,
        'n_output':3
       }
model=SimpleRNN(**params)
model.to(E.device)

SimpleRNN(
  (embed): Embedding(105879, 64)
  (rnn): LSTM(64, 50, num_layers=2, batch_first=True)
  (fc): Linear(in_features=50, out_features=1024, bias=True)
  (fc2): Linear(in_features=1024, out_features=128, bias=True)
  (fc3): Linear(in_features=128, out_features=3, bias=True)
)

In [9]:
from datetime import datetime
import numpy as np


# Defining Loss and optimizer
criterion=nn.CrossEntropyLoss()
optimizer=torch.optim.Adam(model.parameters())

n_epochs=10
train_losses=[]
test_losses=[]

for epoch in range(n_epochs):
    t0=datetime.now()
    train_loss=[]
    for inputs,targets in D.train_iter:
        #targets=targets.view(-1,1)
        
        #zero grad
        optimizer.zero_grad()
        #forward
        out=model(inputs)
        loss=criterion(out,targets)
        #back
        loss.backward()
        optimizer.step()
        train_loss.append(loss.item())
        
    train_loss=np.mean(train_loss)
    
    test_loss=[]
    
    for inputs,targets in D.test_iter:
        
        #targets=targets.view(-1,1).float()
        out=model(inputs)
        loss=criterion(out,targets)
        test_loss.append(loss.item())
    test_loss=np.mean(test_loss)
    
    train_losses.append(train_loss)
    test_losses.append(test_loss)
    t1=datetime.now()
    duration=t1-t0
    print(f'{epoch+1}/{n_epochs}, train_loss: {train_loss}, test_loss: {test_loss}, duration: {duration}')

ModuleAttributeError: 'Linear' object has no attribute 'log_softmax'

In [None]:
model