# Libraries

In [26]:
import pandas as pd

import torch
from transformers import AutoTokenizer
from datasets import Dataset, DatasetDict

import re
import html
# import emoji
import os

# Configurations

In [84]:
MODEL_NAME = "bert-base-uncased"
MAX_LEN = 128  # good choice as max length was 67 in data overview
DATA_DIR = "../Data"
OUTPUT_FILE = os.path.abspath(os.path.join(DATA_DIR, "mams_processed_bert"))

# Load Data

In [67]:
df = pd.read_csv(os.path.join(DATA_DIR, 'mams_train_parsed.csv'))
df_val = pd.read_csv(os.path.join(DATA_DIR, 'mams_val_parsed.csv'))

# Preprocess

## remove nulls

In [68]:
df = df.dropna()
df_val = df_val.dropna()

In [69]:
print(f'Shape of df train: {df.shape}')
print(f'Shape of df val: {df_val.shape}')

Shape of df train: (18275, 3)
Shape of df val: (2220, 3)


## remove duplicates

In [70]:
df.drop_duplicates(inplace=True)
df.drop_duplicates(inplace=True)

In [71]:
print(f'Shape of df train: {df.shape}')
print(f'Shape of df val: {df_val.shape}')

Shape of df train: (17045, 3)
Shape of df val: (2220, 3)


# Label Mapping

In [72]:
label_map = {'positive': 2, 'neutral': 1, 'negative': 0}

In [73]:
def map_labels(df):
    if df['label'].dtype == 'object':
        df['label'] = df['label'].str.lower().map(label_map)
    else: raise print('data type is not object')
    return df

In [74]:
df = map_labels(df)
df_val = map_labels(df_val)

In [75]:
df.head()

Unnamed: 0,text,aspect,label
0,It might be the best sit down food I've had in...,food,2
1,It might be the best sit down food I've had in...,place,1
2,Hostess was extremely accommodating when we ar...,staff,2
3,Hostess was extremely accommodating when we ar...,miscellaneous,1
4,We were a couple of minutes late for our reser...,miscellaneous,1


# Convert to Hugging Face Dataset format

In [76]:
dataset = DatasetDict({
    "train": Dataset.from_pandas(df),
    "validation": Dataset.from_pandas(df_val)
})

# Cleaning

In [77]:
def clean_text(text):
    # decode html
    text = html.unescape(str(text))
    # remove urls and user handles
    text = re.sub(r"http\S+|www\S+|https\S+", "[URL]", text)
    text = re.sub(r'@\w+', '[USER]', text)
    # Demojize
    #text = emoji.demojize(text, delimiters=(" ", " "))
    # remove excessive whitespace
    text = re.sub(r'\s', ' ', text).strip()
    
    return text

# Tokenization

In [78]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

In [79]:
def preprocess_function(examples):
    cleaned_batch_texts = [clean_text(t) for t in examples["text"]]
    
    return tokenizer(
        cleaned_batch_texts,    # Batch of 1000 cleaned sentences
        examples["aspect"],     # Batch of 1000 aspects
        truncation=True, 
        padding="max_length", 
        max_length=MAX_LEN
    )

In [80]:
tokenized_datasets = dataset.map(preprocess_function, batched=True)

Map: 100%|██████████| 17045/17045 [00:02<00:00, 8409.24 examples/s] 
Map: 100%|██████████| 2220/2220 [00:00<00:00, 6460.35 examples/s]


# Cleanup

In [81]:
tokenized_datasets.column_names

{'train': ['text',
  'aspect',
  'label',
  '__index_level_0__',
  'input_ids',
  'token_type_ids',
  'attention_mask'],
 'validation': ['text',
  'aspect',
  'label',
  'input_ids',
  'token_type_ids',
  'attention_mask']}

In [82]:
columns_to_keep = ['input_ids', 'attention_mask', 'token_type_ids', 'label']
tokenized_datasets.set_format(type='torch', columns=columns_to_keep)

# Save

In [86]:
print(f"Saving processed dataset to {OUTPUT_FILE}...")
tokenized_datasets.save_to_disk(OUTPUT_FILE)
print("✅ Done! Ready for training.")

Saving processed dataset to d:\My Projects\Aspect-Based Sentiment Anslaysis\Data\mams_processed_bert...


Saving the dataset (1/1 shards): 100%|██████████| 17045/17045 [00:00<00:00, 146261.54 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 2220/2220 [00:00<00:00, 177207.25 examples/s]

✅ Done! Ready for training.



