### **Aspect Based Sentiment Analysis**

In [118]:
''' Load all import Library and Framework '''
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset
import os
import re
import pytorch_lightning
from torch.utils.data import DataLoader

### **Load the Dataset with Pandas**

In [119]:
# Dataset location. The directory can be overridden with the ABSA_DATA_DIR
# environment variable so the notebook is not tied to one machine; when the
# variable is unset, the original absolute path is used unchanged.
Root_dir = os.environ.get(
    'ABSA_DATA_DIR',
    '/Users/mahadiur/Desktop/Bongodev MLops Projects/Aspect Based Sentiment Analysis/Data',
)
test_dir = os.path.join(Root_dir, 'test.csv')
train_dir = os.path.join(Root_dir, 'train.csv')

# Load both CSV splits into DataFrames
Test_Dataset = pd.read_csv(test_dir)
Train_Dataset = pd.read_csv(train_dir)

# Preview the first rows of each split (test first, then train)
print(Test_Dataset.head())
print(Train_Dataset.head())

                                              review          aspect sentiment
0                   The bread is top notch as well .           bread  positive
1  I have to say they have one of the fastest del...  delivery times  positive
2      Food is always fresh and hot - ready to eat !            Food  positive
3     Did I mention that the coffee is OUTSTANDING ?          coffee  positive
4  Certainly not the best sushi in New York , how...           place  positive
                                              review   aspect sentiment
0              But the staff was so horrible to us .    staff  negative
1  To be completely fair , the only redeeming fac...     food  positive
2  The food is uniformly exceptional , with a ver...     food  positive
3  The food is uniformly exceptional , with a ver...  kitchen  positive
4  The food is uniformly exceptional , with a ver...     menu   neutral


In [120]:
# Show the column names of the training split
Train_Dataset.columns

Index(['review', 'aspect', 'sentiment'], dtype='object')

### **Data (ABSA Part 1)**

In [121]:
''' Ready single example '''
# Take one training row and apply a light clean-up to its review:
# lower-case it and collapse every whitespace run to a single space.
index = 0
text = Train_Dataset.iloc[index]
review = ' '.join(text['review'].lower().split())
print(review)


but the staff was so horrible to us .


In [122]:
''' Normalize Function '''
def Normalize(text):
    """Return a cleaned, lower-cased copy of ``text``.

    Every character that is not a lower-case ASCII letter, a digit or
    whitespace (after lower-casing) is replaced by a space, and runs of
    whitespace are collapsed to single spaces with no leading or trailing
    space.

    Args:
        text: The string to normalize.

    Returns:
        The normalized string.
    """
    cleaned = re.sub(r"[^a-z0-9\s]", " ", text.lower())
    return ' '.join(cleaned.split())

# Try the normalizer on the sample review prepared above
Normalize(review)

'but the staff was so horrible to us'

In [123]:
''' Word-level Tokenization '''
def Tokenization(text):
    """Split ``text`` on whitespace and return the resulting list of word tokens.

    Args:
        text: The string to tokenize.

    Returns:
        A list of tokens; empty when ``text`` holds no non-whitespace characters.
    """
    return text.split()

# Normalize the sample review, then split it into word tokens.
# Note: this rebinds `text` from the DataFrame row to a plain string.
text = Normalize(review)
Tokenization(text)

['but', 'the', 'staff', 'was', 'so', 'horrible', 'to', 'us']

In [124]:
''' Vocabulary '''
def Vocabulary(texts):
    """Build a token -> integer id mapping from an iterable of raw texts.

    Ids 0 and 1 are reserved for '<padding>' and '<unknown>'. Each text is
    passed through ``Normalize`` and ``Tokenization``; every token not seen
    before receives the next free id, in order of first appearance.

    Args:
        texts: Iterable of strings (e.g. a DataFrame column of reviews).

    Returns:
        A dict mapping each token to its integer id.
    """
    token_id = {'<padding>': 0, '<unknown>': 1}
    for raw in texts:
        for token in Tokenization(Normalize(raw)):
            # Ids are dense starting at 0, so the current size is the next free id;
            # setdefault leaves already-known tokens untouched.
            token_id.setdefault(token, len(token_id))
    return token_id

# Build the vocabulary from the training reviews and report its size
Token_2_id= Vocabulary(Train_Dataset['review'])
print(len(Token_2_id))


3736


In [125]:
''' Convert token to id '''
def convert_token_2_id(tokens, vocab=None):
    """Map a sequence of tokens to their integer ids.

    Args:
        tokens: Iterable of token strings.
        vocab: Optional token -> id mapping containing an '<unknown>' entry.
            When omitted, the notebook-level ``Token_2_id`` is used, which
            keeps existing one-argument calls working unchanged.

    Returns:
        A list of ids, one per token; tokens missing from the vocabulary are
        mapped to the id of '<unknown>'.
    """
    if vocab is None:
        # Resolved at call time so a rebuilt Token_2_id is picked up.
        vocab = Token_2_id
    unknown_id = vocab['<unknown>']
    return [vocab.get(token, unknown_id) for token in tokens]

# Append an extra word ('hello') to the first training review and encode it,
# to exercise the '<unknown>' fallback of convert_token_2_id.
idx = 0
text = Train_Dataset.iloc[idx]
review = text['review'] + 'hello'
normalize = Normalize(review)
Tokenize = Tokenization(normalize)
input_id = convert_token_2_id(Tokenize)

# Vocabulary size, the tokens, and their ids — one per line
print(len(Token_2_id), Tokenize, input_id, sep='\n')

3736
['but', 'the', 'staff', 'was', 'so', 'horrible', 'to', 'us', 'hello']
[2, 3, 4, 5, 6, 7, 8, 9, 1]


In [126]:
''' Convert All sentiment text to numeric '''
# Sentiment string -> integer class id used as the training target
Label = {
    'positive': 0,
    'neutral': 1,
    'negative': 2,
}

In [127]:
# Build a "review + extra field" string for the first training row and tokenize it.
# NOTE(review): this demo appends the *sentiment* label to the review, whereas
# ABSA_Dataset.__getitem__ pairs the review with the *aspect* — confirm which
# one is intended here.
idx = 0
text = Train_Dataset.iloc[idx]
pair_of_text = ' '.join((text['review'], text['sentiment']))
normalize = Normalize(pair_of_text)
Tokenize = Tokenization(normalize)
print(Tokenize)

['but', 'the', 'staff', 'was', 'so', 'horrible', 'to', 'us', 'negative']


In [128]:
''' Dataset class '''
class ABSA_Dataset(Dataset):
    """Map-style dataset yielding token ids and a sentiment label per row.

    Each item is built from one row of the wrapped DataFrame: the review and
    its aspect are joined with a space, then normalized, tokenized and
    converted to ids with the notebook-level helpers (``Normalize``,
    ``Tokenization``, ``convert_token_2_id``). The sentiment string is mapped
    to an integer through the notebook-level ``Label`` dict.
    """

    def __init__(self, dataset):
        """Keep a reference to the DataFrame to serve.

        Args:
            dataset: DataFrame whose rows provide the 'review', 'aspect' and
                'sentiment' columns read by ``__getitem__``.
        """
        self.dataset = dataset

    def __len__(self):
        """Return the number of rows in the wrapped DataFrame."""
        return len(self.dataset)

    def __getitem__(self, texts):
        """Encode the row at integer position ``texts``.

        Args:
            texts: Row position, passed to ``DataFrame.iloc``.

        Returns:
            A dict with 'input_ids' (list of int, unpadded) and 'label' (int).

        Raises:
            KeyError: If the row's sentiment is not a key of ``Label``.
        """
        # Read from the DataFrame this instance wraps. The previous code indexed
        # the global Train_Dataset, so a dataset built on the test split
        # silently served training rows (or raised IndexError).
        row = self.dataset.iloc[texts]
        pair_of_text = row['review'] + ' ' + row['aspect']
        tokens = Tokenization(Normalize(pair_of_text))

        return {
            'input_ids': convert_token_2_id(tokens),
            'label': Label[row['sentiment']],
        }

    @staticmethod
    def collate_func(batch):
        """Pad a list of items to a common length and stack them into tensors.

        Args:
            batch: List of dicts as returned by ``__getitem__``.

        Returns:
            A dict with 'batch_input_ids' (long tensor, one row per item,
            right-padded with the '<padding>' id to the longest item) and
            'batch_label' (long tensor with one label per item).
        """
        batch_input_ids = [item['input_ids'] for item in batch]
        batch_label = [item['label'] for item in batch]

        # Longest sequence in this batch decides the padded width
        max_len = max(len(input_ids) for input_ids in batch_input_ids)
        pad_token_id = Token_2_id['<padding>']

        # Right-pad every sequence so all rows have the same length
        batch_padding_input_ids = [
            input_ids + [pad_token_id] * (max_len - len(input_ids))
            for input_ids in batch_input_ids
        ]

        return {
            'batch_input_ids': torch.tensor(batch_padding_input_ids, dtype=torch.long),
            'batch_label': torch.tensor(batch_label, dtype=torch.long)
        }


In [129]:
# Wrap the training split and inspect the first encoded example
Training = ABSA_Dataset(Train_Dataset)
Training[0]

{'input_ids': [2, 3, 4, 5, 6, 7, 8, 9, 4], 'label': 2}

In [130]:
''' ABSA DataModule '''

class ABSA_Datamodule(pytorch_lightning.LightningDataModule):
    """LightningDataModule that loads the train/test CSV files and serves DataLoaders."""

    def __init__(self, test_path, train_path, batch_size):
        """Store the CSV locations and the batch size.

        Args:
            test_path: Path of the test CSV file.
            train_path: Path of the train CSV file.
            batch_size: Number of examples per batch for both loaders.
        """
        super().__init__()
        self.test_path = test_path
        self.train_path = train_path
        self.batch_size = batch_size

    def setup(self, stage=None):
        """Read both CSV files, build the vocabulary and wrap the splits.

        Args:
            stage: Accepted for the Lightning hook signature; not used here.
        """
        train_dataset = pd.read_csv(self.train_path)
        test_dataset = pd.read_csv(self.test_path)

        # Build the token -> id mapping from the training reviews. The previous
        # code called convert_token_2_id here, which treats each whole review
        # string as one token and therefore produced a list of '<unknown>' ids
        # instead of a vocabulary.
        # NOTE(review): ABSA_Dataset still encodes through the notebook-level
        # Token_2_id, not through this attribute.
        self.Vocabulary = Vocabulary(train_dataset['review'])

        self.train_set = ABSA_Dataset(train_dataset)
        self.test_set = ABSA_Dataset(test_dataset)

    def train_dataloader(self):
        """Return a shuffled DataLoader over the training set."""
        return DataLoader(
            self.train_set,
            batch_size=self.batch_size,
            shuffle=True,
            collate_fn=ABSA_Dataset.collate_func
        )

    def test_dataloader(self):
        """Return an unshuffled DataLoader over the test set."""
        return DataLoader(
            self.test_set,
            batch_size=self.batch_size,
            shuffle=False,
            collate_fn=ABSA_Dataset.collate_func
        )



In [131]:
# Instantiate the data module on the two CSV paths and run setup eagerly
# so the datasets are available for inspection.
Module = ABSA_Datamodule(test_path=test_dir, train_path=train_dir, batch_size=32)
Module.setup()