In [1]:
# Mount Google Drive into the Colab filesystem; the data paths used later in
# this notebook all live under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Install torch, then upgrade accelerate and transformers (-U) quietly (-q).
# No versions are pinned, so a later run may pick up different library releases.
!pip install -q torch
!pip install -U -q accelerate transformers

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m270.9/270.9 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.2/8.2 MB[0m [31m22.8 MB/s[0m eta [36m0:00:00[0m
[?25h

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, classification_report
import torch
from transformers import TrainingArguments, Trainer
from transformers import BertTokenizer, BertForSequenceClassification
import pandas as pd
import numpy as np

In [4]:
# File paths for training, validation and testing.
# Every spreadsheet lives in the same Drive folder, so the folder is named once;
# change DATA_DIR alone if the data is moved.
DATA_DIR = '/content/drive/MyDrive/ASPECT ANALYSIS ALL/ASPECT_ANALYSIS_DATA_FILES'

# Synthetic review spreadsheets.
SYNTHETIC_FILE_PATH_1 = f'{DATA_DIR}/Training_Data_Reviews.xlsx'
SYNTHETIC_FILE_PATH_2 = f'{DATA_DIR}/ASPECT_REVIEW.xlsx'
SYNTHETIC_FILE_PATH_3 = f'{DATA_DIR}/Chat_Training_Data.xlsx'

# Manual review spreadsheets.
MANUAL_FILE_PATH_1 = f'{DATA_DIR}/AMZ_VALIDATION.xlsx'
MANUAL_FILE_PATH_2 = f'{DATA_DIR}/Amazon_Reviews_Test_Data_1.xlsx'

In [5]:
# Load the three synthetic review spreadsheets. Each sheet is de-duplicated on
# the 'Review' column first and then stripped of rows containing missing values.
synthetic_data_1, synthetic_data_2, synthetic_data_3 = (
    pd.read_excel(sheet_path).drop_duplicates(subset=['Review']).dropna()
    for sheet_path in (SYNTHETIC_FILE_PATH_1, SYNTHETIC_FILE_PATH_2, SYNTHETIC_FILE_PATH_3)
)

# Stack the cleaned sheets into a single frame with a fresh index.
synthetic_data = pd.concat(
    [synthetic_data_1, synthetic_data_2, synthetic_data_3],
    ignore_index=True,
)

In [6]:
# Load the two manual review spreadsheets. Each sheet is de-duplicated on the
# 'Review' column first and then stripped of rows containing missing values.
manual_data_1, manual_data_2 = (
    pd.read_excel(sheet_path).drop_duplicates(subset=['Review']).dropna()
    for sheet_path in (MANUAL_FILE_PATH_1, MANUAL_FILE_PATH_2)
)

# Stack the cleaned sheets into a single frame with a fresh index.
manual_data = pd.concat(
    [manual_data_1, manual_data_2],
    ignore_index=True,
)

In [7]:
# Renumber both frames from zero, discarding the previous index.
synthetic_data, manual_data = (
    frame.reset_index(drop=True) for frame in (synthetic_data, manual_data)
)

In [8]:
# The same review text can appear in more than one source sheet; keep only the
# first occurrence of each 'Review' in the merged frames.
synthetic_data = synthetic_data[~synthetic_data.duplicated(subset=['Review'])]
manual_data = manual_data[~manual_data.duplicated(subset=['Review'])]

In [9]:
# Row count of the cleaned synthetic frame.
synthetic_data.shape[0]

7217

In [10]:
# Row count of the cleaned manual frame.
manual_data.shape[0]

1443

In [11]:
# Split each source 80/20 on its own, stratified on its Aspect column, with a
# fixed random_state so the partition is repeatable.
synth_train, synth_test = train_test_split(
    synthetic_data,
    test_size=0.2,
    random_state=42,
    stratify=synthetic_data.Aspect,
)
manu_train, manu_test = train_test_split(
    manual_data,
    test_size=0.2,
    random_state=42,
    stratify=manual_data.Aspect,
)

# Pool the two training parts and the two testing parts into the final splits.
training_data = pd.concat([synth_train, manu_train], ignore_index=True)
testing_data = pd.concat([synth_test, manu_test], ignore_index=True)

In [12]:
# Pool the per-source 80% parts into the training frame and the per-source 20%
# parts into the testing frame, each with a fresh index.
training_data, testing_data = (
    pd.concat(parts, ignore_index=True)
    for parts in ([synth_train, manu_train], [synth_test, manu_test])
)

In [13]:
# Row count of the combined training frame.
training_data.shape[0]

6927

In [14]:
# Row count of the combined testing frame.
testing_data.shape[0]

1733

In [15]:
# The twelve aspect categories; a label's position in this list is its class id.
original_labels = [
    'Adaptability',
    'Durability',
    'Ease of Use',
    'Ergonomics',
    'Interference',
    'Performance',
    'Use Efficiency',
    'Aesthetics',
    'Ease of Reprocessing',
    'Ease of Storage',
    'Price',
    'Safety',
]

# Forward (label -> id) and reverse (id -> label) lookups; ids run from 0 to 11.
labeling_dict = dict(zip(original_labels, range(len(original_labels))))
labeling_dict_reverse = dict(enumerate(original_labels))

In [16]:
# Encode the textual aspect labels as integer class ids.
# Series.map leaves NaN for any value that is not a key of labeling_dict (for
# example a misspelled label, or the integer ids left behind when this cell is
# run a second time), so stop with a clear error instead of handing NaN labels
# to the training step.
for split_name, split_frame in (('training_data', training_data), ('testing_data', testing_data)):
    encoded = split_frame['Aspect'].map(labeling_dict)
    unmapped = split_frame.loc[encoded.isna(), 'Aspect'].unique().tolist()
    if unmapped:
        raise ValueError(
            f"{split_name}: 'Aspect' values not found in labeling_dict: {unmapped}"
        )
    # Every value mapped, so the column can be stored as integers.
    split_frame['Aspect'] = encoded.astype('int64')

In [17]:
# Load the pretrained cased BERT tokenizer and a BERT encoder with a newly
# initialised sequence-classification head.
# The head size is derived from the label list instead of being hard-coded, and
# the id <-> label tables are stored in the model config so a saved checkpoint
# carries the aspect names along with the class ids.
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
model = BertForSequenceClassification.from_pretrained(
    'bert-base-cased',
    num_labels=len(original_labels),
    id2label=labeling_dict_reverse,
    label2id=labeling_dict,
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [18]:
# Pull the review texts (inputs) and the encoded aspect ids (targets) out of
# each split as plain Python lists.
X_train, y_train = (training_data[column].to_list() for column in ("Review", "Aspect"))
X_test, y_test = (testing_data[column].to_list() for column in ("Review", "Aspect"))

In [19]:
# Both splits are tokenized with the same settings: padding and truncation
# enabled, with at most 512 tokens per sequence.
tokenize_options = dict(padding=True, truncation=True, max_length=512)
X_train_tokenized = tokenizer(X_train, **tokenize_options)
X_test_tokenized = tokenizer(X_test, **tokenize_options)

In [20]:
# Create torch dataset
class Dataset(torch.utils.data.Dataset):
    """Map-style dataset over tokenizer encodings with optional labels.

    Args:
        encodings: Mapping from feature name to a per-example sequence. It must
            contain the key "input_ids", which determines the dataset length.
        labels: Optional per-example labels, indexable by position (list,
            array, ...). ``None`` or an empty sequence yields items without a
            "labels" entry.
    """

    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors, one per encoding key.

        A "labels" tensor is added when labels were supplied.
        """
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        # Check for None and length explicitly instead of relying on truthiness:
        # a multi-element array or tensor of labels has no unambiguous truth
        # value and would raise. An empty sequence still means "no labels".
        if self.labels is not None and len(self.labels) > 0:
            item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        """Return the number of examples, taken from the "input_ids" entry."""
        return len(self.encodings["input_ids"])

def compute_metrics(p):
    """Turn an evaluation result into accuracy and weighted precision/recall/F1.

    Args:
        p: A pair that unpacks into (raw prediction scores, true labels); the
            predicted class is the argmax of the scores along axis 1.

    Returns:
        dict with the keys "accuracy", "precision", "recall" and "f1_score".
    """
    scores, labels = p
    predicted = np.argmax(scores, axis=1)
    # The three class-averaged metrics share the same arguments.
    weighted = dict(y_true=labels, y_pred=predicted, average='weighted')
    return {
        "accuracy": accuracy_score(y_true=labels, y_pred=predicted),
        "precision": precision_score(**weighted),
        "recall": recall_score(**weighted),
        "f1_score": f1_score(**weighted),
    }

In [21]:
# Wrap the tokenized inputs and their aspect ids for each split.
train_dataset = Dataset(encodings=X_train_tokenized, labels=y_train)
test_dataset = Dataset(encodings=X_test_tokenized, labels=y_test)

In [22]:
# Define the training configuration and the Trainer.
# Evaluation, logging and checkpointing all happen every 100 training steps.
# NOTE(review): transformers is installed unpinned with -U earlier in this
# notebook; newer releases are understood to rename `evaluation_strategy` to
# `eval_strategy` — confirm against the installed version.
args = TrainingArguments(
    output_dir="output",
    num_train_epochs=5,
    per_device_train_batch_size=16,
    evaluation_strategy="steps",  # Evaluate on a step schedule rather than per epoch
    eval_steps=100,  # Evaluate every 100 training steps (you can adjust this value)
    logging_dir="logs",  # Directory for TensorBoard logs
    logging_steps=100,  # Log metrics every 100 steps (you can adjust this value)
    save_steps=100  # Checkpoint interval, in training steps
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,  # The test split doubles as the evaluation set during training
    compute_metrics=compute_metrics,
)


In [23]:
# Start fine-tuning; evaluation, logging and checkpointing follow the step
# schedule configured in `args`.
trainer.train()

Step,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1 Score
100,1.9107,1.055418,0.726486,0.762222,0.726486,0.722772
200,0.8696,0.625888,0.814772,0.818106,0.814772,0.81442
300,0.5974,0.541666,0.850548,0.857326,0.850548,0.849711
400,0.5063,0.451567,0.874207,0.880823,0.874207,0.874961
500,0.3418,0.444042,0.882285,0.887333,0.882285,0.882736
600,0.2846,0.442912,0.882285,0.884106,0.882285,0.882537
700,0.2425,0.468612,0.883439,0.88754,0.883439,0.88359
800,0.2567,0.495154,0.878246,0.882737,0.878246,0.878021
900,0.1859,0.422631,0.898442,0.900476,0.898442,0.898586
1000,0.0987,0.439708,0.899596,0.901431,0.899596,0.900041


TrainOutput(global_step=2165, training_loss=0.2669050394387498, metrics={'train_runtime': 3702.8312, 'train_samples_per_second': 9.354, 'train_steps_per_second': 0.585, 'total_flos': 8277063218703000.0, 'train_loss': 0.2669050394387498, 'epoch': 5.0})

In [27]:
# Run a final evaluation pass on the Trainer's eval_dataset and keep the metrics.
model_validation_metrics = trainer.evaluate()

In [28]:
# Display the metrics returned by trainer.evaluate().
model_validation_metrics

{'eval_loss': 0.5554799437522888,
 'eval_accuracy': 0.9036353144835545,
 'eval_precision': 0.9052186683814125,
 'eval_recall': 0.9036353144835545,
 'eval_f1_score': 0.903978423030446,
 'eval_runtime': 38.2756,
 'eval_samples_per_second': 45.277,
 'eval_steps_per_second': 5.669,
 'epoch': 5.0}

In [32]:
# To save the fine-tuned model and tokenizer, uncomment the two lines below
# and change the paths to the desired output folders.
# trainer.save_model('/content/drive/MyDrive/ASPECT ANALYSIS ALL/BERT_MODEL_FILES/CustomModel91acc')
# tokenizer.save_pretrained("/content/drive/MyDrive/ASPECT ANALYSIS ALL/BERT_MODEL_FILES/Tokenizer91acc")