# **Aspect Based Sentiment Analysis**

In [50]:
''' Import all important Libraries '''
import numpy as np
import pandas as pd
import torch
import torch.nn as nn

# **Load Data**

In [51]:
# Dataset path
Train_path ='/Users/mahadiur/Desktop/Bongodev MLops Projects/Aspect Based Sentiment Analysis/Data/train.csv'
Test_path ='/Users/mahadiur/Desktop/Bongodev MLops Projects/Aspect Based Sentiment Analysis/Data/test.csv'

# Load by use pandas
Train_dataset = pd.read_csv(Train_path)
Test_dataset = pd.read_csv(Test_path)

# check data
print(Train_dataset.head())


                                              review   aspect sentiment
0              But the staff was so horrible to us .    staff  negative
1  To be completely fair , the only redeeming fac...     food  positive
2  The food is uniformly exceptional , with a ver...     food  positive
3  The food is uniformly exceptional , with a ver...  kitchen  positive
4  The food is uniformly exceptional , with a ver...     menu   neutral


In [52]:
''' Check aspect values in training dataset '''
Train_dataset.value_counts(['aspect'])

aspect                     
food                           340
service                        189
place                           59
prices                          56
menu                            56
                              ... 
chicken tikka marsala            1
chicken tikka                    1
chicken on rice with ginger      1
chicken in curry sauce           1
zucchini blossoms                1
Name: count, Length: 1322, dtype: int64

In [53]:
''' Check sentiment values in training dataset '''
Train_dataset.value_counts(['sentiment'])

sentiment
positive     2164
negative      807
neutral       637
Name: count, dtype: int64

In [54]:
''' Dataset information '''
Train_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3608 entries, 0 to 3607
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     3608 non-null   object
 1   aspect     3608 non-null   object
 2   sentiment  3608 non-null   object
dtypes: object(3)
memory usage: 84.7+ KB


In [55]:
''' Normalize single column '''
import re
index = 0
example = Train_dataset.iloc[index]
review = example['review']
sentiment = example['sentiment']
aspect = example['aspect']
review = review.lower()
review = re.sub(r'[^a-z0-9\s]', '', review)
review = ' '.join(review.split())
review

'but the staff was so horrible to us'

In [56]:
''' Define Normalization functions '''
def Normalizetion(text):
    # Convert Lowercase
    text = text.lower()
    # Remove all Punctuation and extra white space
    text = re.sub(r'[^a-z0-0\s]','', text)
    # split and join
    text = " ".join(text.split())
    return text

text = example['review']
review = Normalizetion(text)
review

'but the staff was so horrible to us'

In [57]:
''' Define Tokenization Function '''
def Tokenizer(text):
    # Word level Tokenization
    text = text.split()
    return text

text = Tokenizer(review)
text

['but', 'the', 'staff', 'was', 'so', 'horrible', 'to', 'us']

In [58]:
''' Define Vocabulary Function '''
def Vocabulary_func(texts):
    text_2_id = {
        'padding':0,
        'unknown':1
    }
    idx = 2
    for text in texts:
        normalize = Normalizetion(text)
        Tokens = Tokenizer(normalize)

        for token in Tokens:
            if text_2_id.get(token) is None:
                text_2_id[token] = idx
                idx += 1

    return text_2_id

vocabulary = Vocabulary_func(Train_dataset['review'])
print(f'Vocabulary Length: {len(vocabulary)}')

Vocabulary Length: 3732


In [60]:
''' Convert token to id '''
def token_2_id(tokens):
    input_id = [
        vocabulary.get(token, vocabulary['unknown']) for token in tokens
    ]
    return input_id

text = example['review'] + 'hello'
text = Normalizetion(text)
tokens = Tokenizer(text)
text = token_2_id(tokens)
print(tokens)
print(text)

['but', 'the', 'staff', 'was', 'so', 'horrible', 'to', 'us', 'hello']
[2, 3, 4, 5, 6, 7, 8, 9, 1]


In [64]:
''' all conditions '''
labels = {
    'positive':0,
    'neutral':1,
    'negative':2,
}

In [65]:
''' Process dataset  '''
def __getitem__(idx):
    text = Train_dataset.iloc[idx]
    review = text['review']
    sentiment = text['sentiment']
    aspect = text['aspect']
    concat = review + ' ' + aspect
    normalize = Normalizetion(concat)
    token = Tokenizer(normalize)
    token_id = token_2_id(token)
    label_id = labels[sentiment]
    return{
        "Token_id":token_id,
        "label_id":label_id,
    }

process_example = __getitem__(idx=0)
process_example

{'Token_id': [2, 3, 4, 5, 6, 7, 8, 9, 4], 'label_id': 2}