In [50]:
import os
import random
import sys
from datetime import datetime

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from ignite.utils import convert_tensor
from pytorch_pretrained_bert.modeling import BertModel
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader,Dataset
from torch.utils.data.sampler import SubsetRandomSampler

from libnlp import preprocessing

In [2]:

# Register the parent directory on the import path exactly once, so re-running
# this cell does not keep appending duplicate entries.
if sys.path.count('../')==0:
    sys.path.append('../')

class Env:
    """Shared notebook configuration: paths, tokenizer/model handles, device and seeding.

    Parameters
    ----------
    tokenizer_model : str
        Name handed to ``torch.hub.load`` for both the ``'tokenizer'`` and the
        ``'model'`` entry points of ``huggingface/pytorch-transformers``.
    max_len : int
        Stored as ``tokenizer_max_len``.
    seed : int
        Seed applied to the Python, NumPy and torch random number generators.

    Attributes
    ----------
    data_path, train_path, test_path, ppd_path : str
        Relative locations of the data directory and CSV files.
    tokenizer, model
        Objects returned by ``torch.hub.load``.
    cls_token, sep_token, pad_token, unk_token
        Special-token ids read from the tokenizer.
    device : str
        ``'cuda'`` when CUDA is available, otherwise ``'cpu'``.
    seed : int
        The seed that was applied.
    """
    def __init__(self,tokenizer_model='bert-base-multilingual-uncased',max_len=120,seed=42):

        # Defining the paths
        self.data_path='../Data'
        self.train_path='Train.csv'
        self.test_path='Test.csv'
        self.ppd_path='../ppd'

        # Defining the tokenizer and the pretrained model
        self.tokenizer=torch.hub.load('huggingface/pytorch-transformers', 'tokenizer',tokenizer_model )
        self.model=torch.hub.load('huggingface/pytorch-transformers', 'model',tokenizer_model )
        self.tokenizer_max_len=max_len
        self.cls_token=self.tokenizer.cls_token_id
        self.sep_token=self.tokenizer.sep_token_id
        self.pad_token=self.tokenizer.pad_token_id
        self.unk_token=self.tokenizer.unk_token_id

        # Selecting the device
        self.device='cuda' if torch.cuda.is_available() else 'cpu'

        # Setting the seed value for every RNG the notebook relies on.
        # NumPy's global RNG is what scikit-learn falls back to when no
        # random_state is given, so it has to be seeded as well as torch.
        self.seed=seed
        random.seed(seed)
        np.random.seed(seed)
        torch.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
        # NOTE: PYTHONHASHSEED is read at interpreter start-up, so this only
        # affects processes launched after this point, not the running kernel.
        os.environ['PYTHONHASHSEED']=str(seed)
        torch.backends.cudnn.deterministic=True
        torch.backends.cudnn.benchmark=False
# Instantiate the shared configuration once; later cells read the paths,
# tokenizer, model and device from this object.
E=Env()

Using cache found in C:\Users\Jamiu Afolabi/.cache\torch\hub\huggingface_pytorch-transformers_master
Using cache found in C:\Users\Jamiu Afolabi/.cache\torch\hub\huggingface_pytorch-transformers_master


In [None]:
# NOTE(review): scratch cell that was never executed (its execution count is None).
# It calls the model without any input tensors, which presumably raises —
# TODO remove this cell or pass input_ids/attention_mask.
E.model(return_dict=False)

# Create a custom Dataset

In [3]:
class ArabiziDataset(Dataset):
    """Dataset that tokenises the ``text`` column of a DataFrame on access.

    Parameters
    ----------
    df : pandas.DataFrame
        Must expose a ``text`` column; a ``label`` column is required only
        when ``train`` is True.
    tokenizer
        Object with an ``encode(text)`` method returning a list of token ids.
    max_len : int
        Maximum number of *characters* kept from each text before encoding.
    train : bool
        When True each item also carries its label.

    Items
    -----
    ``(token, mask, len_token, label)`` in training mode and
    ``(token, mask, len_token)`` otherwise, where ``token`` is the list of ids,
    ``mask`` is a list of ones of the same length and ``len_token`` is that length.
    """
    def __init__(self,df,tokenizer,max_len,train=True):
        self.train=train
        self.tokenizer=tokenizer
        self.max_len=max_len
        self.text=df.text.values
        # An unlabelled (test) frame has no `label` column; only insist on it
        # when labels are actually going to be returned.
        if train or 'label' in df.columns:
            self.labels=df.label.values
        else:
            self.labels=None

    def __len__(self):
        return len(self.text)

    def __getitem__(self,idx):
        token,mask,len_token=self.token_mask(self.text[idx],self.max_len)
        if self.train:
            return token,mask,len_token,self.labels[idx]
        # Keep the length in the unlabelled case too (it used to be replaced by
        # None), so the collate function can pad these items like the others.
        return token,mask,len_token

    def token_mask(self,text,max_len):
        """Truncate ``text`` by characters, encode it and build an all-ones mask.

        Returns ``(token, mask, len_token)``.
        """
        # For max_len 511 or 512 two positions are held back — presumably room
        # for the two special tokens within a 512-position limit (TODO confirm).
        reserved=2 if max_len in range(511,513) else 0
        len_text=min(max_len-reserved,len(text))
        # NOTE(review): the cut is made on characters, not tokens, so the
        # encoded sequence is not guaranteed to be at most max_len ids long.
        token=self.custTokenizer(self.tokenizer,text[:len_text])
        len_token=len(token)
        mask=[1]*len_token
        return token,mask,len_token

    def custTokenizer(self,tokenizer,text):
        """Encode ``text`` into a list of ids with the given tokenizer."""
        return tokenizer.encode(text)

# Create a Custom Padding Function

In [33]:
def customPadding(batch,tokenizer=None):
    """Collate function that pads every sequence to the longest one in the batch.

    Parameters
    ----------
    batch : list of tuples
        Items shaped ``(token, mask, len_token, label)`` or, for unlabelled
        data, ``(token, mask, <anything>)``. Only positions 0, 1 and (when
        present) 3 are read.
    tokenizer : optional
        Object exposing ``pad_token_id``. Defaults to ``E.tokenizer``, looked
        up at call time so a re-created ``E`` is picked up.

    Returns
    -------
    ``([tokens, masks], labels)`` when the items carry labels, otherwise
    ``(tokens, masks)``; all entries are ``torch.tensor`` objects.
    """
    if tokenizer is None:
        tokenizer=E.tokenizer
    comp=list(zip(*batch))
    tokens,masks=comp[0],comp[1]
    # Labels are optional: indexing comp[3] unconditionally raised IndexError
    # for unlabelled batches and made the final return unreachable.
    labels=comp[3] if len(comp)==4 else None

    # Measure the sequences directly instead of trusting the length column,
    # which is not guaranteed to be populated for unlabelled items.
    max_len=max(len(token) for token in tokens)
    pad_id=tokenizer.pad_token_id
    tokens_ret=[list(token)+[pad_id]*(max_len-len(token)) for token in tokens]
    masks_ret=[list(mask)+[0]*(max_len-len(mask)) for mask in masks]

    if labels is not None:
        return [torch.tensor(tokens_ret),torch.tensor(masks_ret)],torch.tensor(labels)
    return torch.tensor(tokens_ret),torch.tensor(masks_ret)

    


In [34]:
# Inspect the id the tokenizer uses for padding; customPadding fills short
# sequences with this value.
E.tokenizer.pad_token_id

0

In [35]:
# Read the training CSV from the directory configured in Env.
train_df=pd.read_csv(
    os.path.join(E.ppd_path,E.train_path),
    encoding='utf-8',
)

In [46]:
# Seed for the train/validation split so the same rows are held out on every run.
SPLIT_SEED=42

# One dataset over the whole frame; the two loaders draw disjoint index subsets
# from it through their samplers.
ds=ArabiziDataset(train_df,E.tokenizer,E.tokenizer_max_len)
train_idx,test_idx=train_test_split(list(range(len(ds))),test_size=0.25,random_state=SPLIT_SEED)
train_s,test_s=SubsetRandomSampler(train_idx),SubsetRandomSampler(test_idx)
# Both loaders must wrap `ds`; the previously referenced `train_ds` is never
# defined in this notebook and raised NameError on a fresh kernel.
train_loader=DataLoader(ds,batch_size=10,collate_fn=customPadding,sampler=train_s)
test_loader=DataLoader(ds,batch_size=10,collate_fn=customPadding,sampler=test_s)

In [38]:
# Pull two batches to sanity-check the collate output. The built-in next() is
# used because `.next()` is not part of the iterator protocol and is not
# available on current DataLoader iterators.
dataiter=iter(train_loader)
a,b=next(dataiter)
e,f=next(dataiter)

In [39]:
# Display the inputs of the second batch: [padded token ids, attention masks].
e


[tensor([[  101,   124, 39359, 36671, 18320, 10167, 10354, 60581,   102,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0],
         [  101, 10193, 13685,   162, 11444, 20651,   162, 10959, 14461, 10209,
          10650, 84147, 10243, 15041, 10546, 10959, 61581, 27320, 53777,   152,
          10351, 10163, 28056, 10159, 11518, 10112, 10117, 15167, 10115, 71224,
          10959, 10745, 37547, 10112, 17509, 12981, 12438, 17368, 31588, 11378,
            102,     0,     0,     0],
         [  101, 25410, 34238, 10380, 10593, 23550, 19601, 10116, 10593, 27102,
            102,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     

# Create a Model

In [103]:
class BertModel(nn.Module):
    """BERT encoder followed by a 1-D convolutional classification head.

    NOTE: this class reuses the name ``BertModel`` that is also imported from
    ``pytorch_pretrained_bert`` at the top of the notebook and shadows that
    import from this cell onwards.

    Parameters
    ----------
    n_outputs : int
        Number of output logits.
    bert_model : optional
        Encoder module exposing ``config.hidden_size`` and callable with
        ``input_ids``, ``attention_mask`` and ``return_dict``. Defaults to
        ``E.model``, looked up when the instance is created.
    """
    # Each of the three MaxPool1d(2) stages halves the sequence axis, so fewer
    # than 2**3 positions would leave nothing to pool at the last stage.
    _MIN_SEQ_LEN=8

    def __init__(self,n_outputs,bert_model=None):
        super().__init__()
        if bert_model is None:
            bert_model=E.model
        self.K=n_outputs
        # Honour the encoder that was passed in; the argument used to be
        # ignored in favour of the global E.model.
        self.bert_model=bert_model
        self.bert_hidden_size=bert_model.config.hidden_size

        self.conv1=nn.Conv1d(self.bert_hidden_size,32,3,padding=1)
        self.pool1=nn.MaxPool1d(2)
        self.conv2=nn.Conv1d(32,64,3,padding=1)
        self.pool2=nn.MaxPool1d(2)
        self.conv3=nn.Conv1d(64,128,3,padding=1)
        self.pool3=nn.MaxPool1d(2)
        self.conv4=nn.Conv1d(128,256,3,padding=1)

        self.fc1=nn.Linear(256,128)
        self.fc2=nn.Linear(128,self.K)

    def forward(self,X):
        """Return logits of shape (batch, n_outputs).

        ``X`` is indexable: ``X[0]`` holds the token ids and ``X[1]`` the
        attention mask, both passed straight to the encoder.
        """
        # First element of the encoder output is the per-token hidden states.
        out=self.bert_model(input_ids=X[0],attention_mask=X[1],return_dict=False)[0]
        # Conv1d wants channels before the sequence axis.
        out=out.permute(0,2,1)

        # Zero-pad very short batches so the pooling stack never receives an
        # empty sequence (this used to raise for batches under 8 tokens).
        missing=self._MIN_SEQ_LEN-out.size(-1)
        if missing>0:
            out=F.pad(out,(0,missing))

        out=self.pool1(F.relu(self.conv1(out)))
        out=self.pool2(F.relu(self.conv2(out)))
        out=self.pool3(F.relu(self.conv3(out)))
        out=F.relu(self.conv4(out))

        # Max over the remaining sequence positions, leaving (batch, 256).
        out,_=torch.max(out,dim=2)
        out=F.relu(self.fc1(out))
        return self.fc2(out)
        

In [104]:
def children(m):
    """Return ``m`` unchanged if it is a list or tuple, else ``list(m.children())``."""
    if isinstance(m, (list, tuple)):
        return m
    return list(m.children())


def set_trainable_attr(m, b):
    """Set ``m.trainable`` and the ``requires_grad`` flag of every parameter of ``m`` to ``b``."""
    m.trainable = b
    for param in m.parameters():
        param.requires_grad = b


def apply_leaf(m, f):
    """Call ``f`` on ``m`` when it is an ``nn.Module``, then recurse into ``children(m)``.

    Despite the name, ``f`` is applied to every module encountered, not only
    to modules without children.
    """
    # Collect the children before calling f, as the traversal always has.
    sub_items = children(m)
    if isinstance(m, nn.Module):
        f(m)
    for item in sub_items:
        apply_leaf(item, f)

            
def set_trainable(l, b):
    """Apply ``set_trainable_attr(module, b)`` to every module reached from ``l`` by ``apply_leaf``."""
    def _toggle(m):
        return set_trainable_attr(m, b)

    apply_leaf(l, _toggle)

In [105]:
# Build the classifier with three output logits.
model=BertModel(3)

In [106]:
# Place the model on the configured device, then freeze the encoder's
# parameters so that only the layers added on top of it receive gradients.
model=model.to(E.device)
set_trainable(model.bert_model,False)


In [None]:
# Defining Loss and optimizer
criterion=nn.CrossEntropyLoss()
# Hand the optimizer only the parameters that are still trainable; the frozen
# encoder weights never receive gradients.
optimizer=torch.optim.Adam([p for p in model.parameters() if p.requires_grad])

n_epochs=10
train_losses=[]
test_losses=[]

for epoch in range(n_epochs):
    t0=datetime.now()

    # --- training pass ---
    model.train()
    # model.train() switches every sub-module to training mode; put the frozen
    # encoder back in eval mode so any dropout inside it stays disabled.
    model.bert_model.eval()
    train_loss=[]
    for inputs,targets in train_loader:
        inputs,targets=convert_tensor(inputs,device=E.device),convert_tensor(targets,device=E.device)

        optimizer.zero_grad()
        out=model(inputs)
        loss=criterion(out,targets)
        loss.backward()
        optimizer.step()
        train_loss.append(loss.item())
    train_loss=np.mean(train_loss)

    # --- evaluation pass ---
    # No gradients are needed here, so skip building the autograd graph.
    model.eval()
    test_loss=[]
    with torch.no_grad():
        for inputs,targets in test_loader:
            inputs,targets=convert_tensor(inputs,device=E.device),convert_tensor(targets,device=E.device)
            out=model(inputs)
            loss=criterion(out,targets)
            test_loss.append(loss.item())
    test_loss=np.mean(test_loss)

    train_losses.append(train_loss)
    test_losses.append(test_loss)
    duration=datetime.now()-t0
    print(f'{epoch+1}/{n_epochs}, train_loss: {train_loss}, test_loss: {test_loss}, duration: {duration}')

1/10, train_loss: 0.6363844664934135, test_loss: 0.5904938032541956, duration: 0:05:40.430662
2/10, train_loss: 0.5654653862814109, test_loss: 0.5654670892357826, duration: 0:05:40.626023
