In [1]:
# Mount Google Drive at /content/drive; the data file paths used by this
# notebook all live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!pip install -q torch
!pip install -U -q accelerate transformers
!pip install -q sentencepiece
!pip install --upgrade -q simplet5
!pip install -q sentence-transformers

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m270.9/270.9 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.2/8.2 MB[0m [31m35.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.5/3.5 MB[0m [31m38.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m527.7/527.7 kB[0m [31m39.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m840.2/840.2 kB[0m [31m56.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m952.4/952.4 kB[0m [31m59.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m897.5/897.5 kB[0m [31m63.7 M

In [3]:
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, classification_report
from sklearn.model_selection import train_test_split
import torch
import pandas as pd
import numpy as np
from simplet5 import SimpleT5

INFO:pytorch_lightning.utilities.seed:Global seed set to 42


In [4]:
# File paths for training, validation and testing.
# Every workbook lives in the same Drive folder, so the folder is declared
# once; change DATA_DIR to point the whole notebook at another location.
DATA_DIR = '/content/drive/MyDrive/ASPECT ANALYSIS ALL/ASPECT_ANALYSIS_DATA_FILES'

# Workbooks that are concatenated into `synthetic_data`.
SYNTHETIC_FILE_PATH_1 = f'{DATA_DIR}/Training_Data_Reviews.xlsx'
SYNTHETIC_FILE_PATH_2 = f'{DATA_DIR}/ASPECT_REVIEW.xlsx'
SYNTHETIC_FILE_PATH_3 = f'{DATA_DIR}/Chat_Training_Data.xlsx'
# Workbooks that are concatenated into `manual_data`.
MANUAL_FILE_PATH_1 = f'{DATA_DIR}/AMZ_VALIDATION.xlsx'
MANUAL_FILE_PATH_2 = f'{DATA_DIR}/Amazon_Reviews_Test_Data_1.xlsx'

In [5]:
# Reading training data: three synthetic workbooks, each cleaned the same way.
# Rows with missing values are removed BEFORE de-duplicating on 'Review'.
# In the opposite order, a complete row can be discarded as a duplicate of an
# earlier incomplete row that is then dropped itself, losing the review.
synthetic_data_1, synthetic_data_2, synthetic_data_3 = (
    pd.read_excel(path).dropna().drop_duplicates(subset=['Review'])
    for path in (SYNTHETIC_FILE_PATH_1, SYNTHETIC_FILE_PATH_2, SYNTHETIC_FILE_PATH_3)
)

# Stack the cleaned workbooks into one frame with a fresh 0..n-1 index.
synthetic_data = pd.concat(
    [synthetic_data_1, synthetic_data_2, synthetic_data_3],
    ignore_index=True,
)

In [6]:
# Reading testing data: the two "manual" workbooks, each cleaned the same way.
# Rows with missing values are removed BEFORE de-duplicating on 'Review'.
# In the opposite order, a complete row can be discarded as a duplicate of an
# earlier incomplete row that is then dropped itself, losing the review.
manual_data_1, manual_data_2 = (
    pd.read_excel(path).dropna().drop_duplicates(subset=['Review'])
    for path in (MANUAL_FILE_PATH_1, MANUAL_FILE_PATH_2)
)

# Stack the cleaned workbooks into one frame with a fresh 0..n-1 index.
manual_data = pd.concat([manual_data_1, manual_data_2], ignore_index=True)

In [7]:
# Renumber both source frames from 0, discarding whatever index they had.
synthetic_data, manual_data = (
    frame.reset_index(drop=True) for frame in (synthetic_data, manual_data)
)

In [8]:
# Remove reviews that are repeated across the concatenated workbooks.
# The comparison key is the stripped, lower-cased text because the
# preprocessing step later in this notebook normalises 'Review' the same way
# AFTER the train/test split. De-duplicating on the raw text would let two
# rows that differ only in case or surrounding whitespace end up on both
# sides of the split as the same review. The first occurrence is kept and
# the stored 'Review' text itself is left unchanged here.
# NOTE(review): duplicates shared between `synthetic_data` and `manual_data`
# are still not removed by this step.
synthetic_data = synthetic_data.loc[
    ~synthetic_data['Review'].astype(str).str.strip().str.lower().duplicated()
]
manual_data = manual_data.loc[
    ~manual_data['Review'].astype(str).str.strip().str.lower().duplicated()
]

In [9]:
# Stratified 80/20 split of the synthetic data; stratifying on 'Aspect' keeps
# the label proportions the same in both parts.
synth_train, synth_test = train_test_split(
    synthetic_data,
    test_size=0.2,
    random_state=42,
    stratify=synthetic_data['Aspect'],
)
# Same stratified 80/20 split for the manual data.
manu_train, manu_test = train_test_split(
    manual_data,
    test_size=0.2,
    random_state=42,
    stratify=manual_data['Aspect'],
)

In [10]:
# Build the final sets: the 80% parts of both sources form the training data,
# the 20% parts form the testing data. Each result gets a fresh index.
training_data, testing_data = (
    pd.concat(parts, ignore_index=True)
    for parts in ([synth_train, manu_train], [synth_test, manu_test])
)

In [11]:
# Preprocessing: trim surrounding whitespace and lower-case every review,
# applied identically to the training and testing frames.
for frame in (training_data, testing_data):
    frame['Review'] = frame['Review'].str.strip().str.lower()

In [12]:
# Renumber the training and testing frames from 0, discarding the old index.
training_data, testing_data = (
    frame.reset_index(drop=True) for frame in (training_data, testing_data)
)

In [13]:
# Rename the review/label columns to the `source_text` / `target_text` names
# that the frames passed to `model.train` are documented to carry.
column_names = {'Review': 'source_text', 'Aspect': 'target_text'}
training_data = training_data.rename(columns=column_names)
testing_data = testing_data.rename(columns=column_names)

In [14]:
# Renaming the labels: maps each multi-word "Ease of ..." label to the
# single-word name used as the model's target text.
label_replacements = {
    'Ease of Use': 'Usability',
    'Ease of Reprocessing': 'Reprocessability',
    'Ease of Storage': 'Storability',
}
# Reverse of the dictionary above (single-word name -> original label).
# It is derived instead of typed out so the two mappings cannot drift apart
# when a label is added or renamed.
label_replacements_reverse = {
    renamed: original for original, renamed in label_replacements.items()
}

In [15]:
# The twelve aspect labels as they appear in the data files.
original_labels = [
    'Adaptability',
    'Durability',
    'Ease of Use',
    'Ergonomics',
    'Interference',
    'Performance',
    'Use Efficiency',
    'Aesthetics',
    'Ease of Reprocessing',
    'Ease of Storage',
    'Price',
    'Safety',
]

# The same labels, in the same order, with the three "Ease of ..." entries
# replaced by their single-word names.
modified_labels = [
    'Adaptability',
    'Durability',
    'Usability',
    'Ergonomics',
    'Interference',
    'Performance',
    'Use Efficiency',
    'Aesthetics',
    'Reprocessability',
    'Storability',
    'Price',
    'Safety',
]


In [16]:
# Apply the label renaming to the targets of both the training and the
# testing frames, so that both use the single-word label names.
for labelled_frame in (training_data, testing_data):
    labelled_frame['target_text'] = labelled_frame['target_text'].replace(label_replacements)

In [17]:
# SimpleT5 requires that we specify the use case before each review, so the
# same task prefix is prepended to every input in both frames.
task_prefix = "predict Aspect: "
for prefixed_frame in (training_data, testing_data):
    prefixed_frame['source_text'] = task_prefix + prefixed_frame['source_text']

In [18]:
# Create the SimpleT5 wrapper; the pretrained weights are loaded in the next cell.
model = SimpleT5()

In [19]:
# Checkpoint to fine-tune. "t5-small" trains the small model; set this to
# "t5-base" to train the base model instead.
pretrained_checkpoint = "t5-small"
model.from_pretrained("t5", pretrained_checkpoint)

Downloading:   0%|          | 0.00/2.27k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/773k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.32M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/231M [00:00<?, ?B/s]

In [20]:
# Fine-tune the loaded T5 model on the prepared frames.
# NOTE(review): `testing_data` is passed as the evaluation set, so any
# checkpoint chosen by validation loss has been selected on the test data.
# Consider holding out a separate validation split.
model.train(train_df=training_data, # pandas dataframe with 2 columns: source_text & target_text
            eval_df=testing_data, # pandas dataframe with 2 columns: source_text & target_text
            source_max_token_len = 512,
            target_max_token_len = 8,
            batch_size = 8,
            max_epochs = 6,
            # Use the GPU when the runtime has one instead of hard-coding True,
            # so the cell also runs on a CPU-only runtime.
            use_gpu = torch.cuda.is_available(),
            outputdir = "output",
            early_stopping_patience_epochs = 0, # presumably 0 disables early stopping -- confirm in the SimpleT5 docs
            precision = 32
            )

INFO:pytorch_lightning.utilities.distributed:GPU available: True, used: True
INFO:pytorch_lightning.utilities.distributed:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.distributed:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.accelerators.gpu:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name  | Type                       | Params
-----------------------------------------------------
0 | model | T5ForConditionalGeneration | 60.5 M
-----------------------------------------------------
60.5 M    Trainable params
0         Non-trainable params
60.5 M    Total params
242.026   Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]

INFO:pytorch_lightning.utilities.seed:Global seed set to 42


Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

In [21]:
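# # Save the best T5 checkpoint folder to Google Drive.
# # Uncomment this cell to copy it; the folder names below are taken from one
# # particular run, so update them to match the checkpoint you want to keep.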
# import shutil

# # Source folder path (replace 'source_folder' with your folder name)
# source_folder = 'output/simplet5-epoch-5-train-loss-0.0158-val-loss-0.138'

# # Destination folder path in Google Drive (replace 'destination_folder' with your desired folder name)
# destination_folder_drive = '/content/drive/MyDrive/ASPECT ANALYSIS ALL/T5_MODEL_FILES/simplet5-epoch-5-train-loss-0.0158-val-loss-0.138-best80-20-split-basemodel'

# # Copy the folder from the current working directory to Google Drive
# shutil.copytree(source_folder, destination_folder_drive)

# print(f'Folder "{source_folder}" copied to Google Drive at "{destination_folder_drive}"')