In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import sys
import os
import numpy as np
from libnlp import preprocessing
import pandas as pd
from torch.utils.data import DataLoader,Dataset


In [2]:

if '../' not in sys.path:
    sys.path.append('../')

class Env:
    def __init__(self,tokenizer_model='bert-base-multilingual-uncased',max_len=120,seed=42):
        
        #Defining the paths
        self.data_path='../Data'
        self.train_path='Train.csv'
        self.test_path='Test.csv'
        self.ppd_path='../ppd'
        
        #Definining the tokenizer
        self.tokenizer=torch.hub.load('huggingface/pytorch-transformers', 'tokenizer',tokenizer_model )
        self.model=torch.hub.load('huggingface/pytorch-transformers', 'model',tokenizer_model )
        self.tokenizer_max_len=max_len
        self.cls_token=self.tokenizer.cls_token_id
        self.sep_token=self.tokenizer.sep_token_id
        self.pad_token=self.tokenizer.pad_token_id
        self.unk_token=self.tokenizer.unk_token_id
        
        #Setting the Seed Value
        if torch.cuda.is_available():
            self.device='cuda'
        else:
            self.device='cpu'
        torch.manual_seed(seed)
        os.environ['PYTHONHASHSEED']=str(seed)
        torch.cuda.manual_seed_all(seed)
        torch.backends.cudnn.deterministic=True
        torch.backends.cudnn.benchmark=False
E=Env()

Using cache found in C:\Users\Jamiu Afolabi/.cache\torch\hub\huggingface_pytorch-transformers_master
Using cache found in C:\Users\Jamiu Afolabi/.cache\torch\hub\huggingface_pytorch-transformers_master


# Create a custom Dataset

In [3]:
class ArabiziDataset(Dataset):
    def __init__(self,df,tokenizer,max_len,train=True):
        self.train=train
        self.tokenizer=tokenizer
        self.max_len=max_len
        self.text=df.text.values
        self.labels=df.label.values
        
    def __len__(self):
        return len(self.text)  
    
    def __getitem__(self,idx):
        token,mask,len_token=self.token_mask(self.text[idx],self.max_len)
        if self.train:
            label=self.labels[idx]
            return token,mask,len_token,label
        return token,mask,None
    def token_mask(self,text,max_len):
        if max_len in range(511,513):
            len_text=min(max_len-2,len(text))
        else:
            len_text=min(max_len,len(text))
        text=text[:len_text]
        token=self.custTokenizer(self.tokenizer,text)
        len_token=len(token)
        mask= [1] * len_token
        return token,mask,len_token
    
    def custTokenizer(self,tokenizer,text):
        return tokenizer.encode(text)

# Create a Custom Padding Function

In [4]:
def customPadding(batch,tokenizer=E.tokenizer):
    comp=list(zip(*batch))
    tokens=comp[0]
    masks=comp[1]
    len_tokens=comp[2]
    labels=comp[3]
    max_len=max(len_tokens)
    tokens_ret=[]
    masks_ret=[]
    
    for idx in range(len(tokens)):
        pad_len=max_len-min(len_tokens[idx],max_len)
        padding=[tokenizer.pad_token_id] * pad_len
        token=tokens[idx] + padding
        mask=masks[idx] + [0] * pad_len
        tokens_ret.append(token)
        masks_ret.append(mask)
        
    if len(comp)==4:
        return torch.tensor(tokens_ret),torch.tensor(masks_ret),torch.tensor(labels)
    return torch.tensor(tokens_ret),torch.tensor(masks_ret)

    


In [5]:
E.tokenizer.pad_token_id

0

In [6]:
train_df=pd.read_csv(os.path.join(E.ppd_path,E.train_path),encoding='utf-8')

In [7]:
train_ds=ArabiziDataset(train_df,E.tokenizer,E.tokenizer_max_len)
train_loader=DataLoader(train_ds,batch_size=10,collate_fn=customPadding)

In [8]:
dataiter=iter(train_loader)
a,b,c=dataiter.next()
e,f,g=dataiter.next()

In [9]:
e


tensor([[  101, 32332, 10112, 17675, 11301, 13934,   165, 32332, 10112, 10238,
         13871, 64265, 89882, 11055, 10243, 10243, 11435, 13228,   102,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0],
        [  101, 10816, 15390, 10900, 13871, 10277, 37521, 10537, 50945, 10408,
         15643, 13533, 12364, 15235, 10102, 12854, 14382, 10421, 10481, 12620,
         28230,   102,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0],
        [  101, 59878, 25403, 10167, 11162, 49715, 10165, 39407, 10911, 10863,
         42912, 85904, 10159, 28204, 83863, 11638, 33114, 26658, 40914, 13216,
           102,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     

# Create a Model

In [10]:
class BertModel(nn.Module):
    def __init__(self,n_outputs,bert_model=E.model):
        super(BertModel,self).__init__()
        self.K=n_outputs
        self.bert_model=E.model
        self.bert_hidden_size=E.model.config.hidden_size
        
        self.conv1=nn.Conv1d(self.bert_hidden_size,32,3,padding=1)
        self.pool1=nn.MaxPool1d(2)
        self.conv2=nn.Conv1D(32,64,4,padding=1)
        self.pool2=nn.MaxPool1d(2)
        self.conv3=nn.Conv1D(63,128,5,padding=1)
        self.pool3=nn.MaxPool1d(2)
        self.conv3=nn.Conv1d(126,256,3,padding=1)
        
        self.fc1=nn.Linear(256,128)
        self.fc2=nn.Linear(128,self.K)
        
    def forward(self,X):
        out=self.bert_model(X)
        out=out.permute(0,2,1)
        out=
        

SyntaxError: invalid syntax (<ipython-input-10-cd1925d5b88b>, line 4)