# install the transformers library
!pip uninstall -y transformers accelerate
!pip install transformers accelerate
!pip install --upgrade transformers

# import file stored on Google Drive
from google.colab import drive
drive.mount('/content/drive')

! export PYTORCH_CUDA_ALLOC_CONF=0:9500

!pip install datasets transformers accelerate
!pip install torch
!pip install evaluate

In [None]:
import os

import numpy as np
import pandas as pd
import torch
from datasets import Dataset

print(torch.__version__)

# --- Input data -----------------------------------------------------------
# The CSV is expected in an "input" folder under the current working directory.
input_folder_path = os.path.join(os.getcwd(), "input")
merged_csv_filename = "merged_data.csv"
merged_csv_filepath = os.path.join(input_folder_path, merged_csv_filename)

# Names of the CSV columns: the text to be classified and its class.
text_column1 = "Comment"
text_column2 = "Class"

# Load the CSV as a Hugging Face Dataset.
dataset1 = Dataset.from_csv(merged_csv_filepath)

# --- Run settings ---------------------------------------------------------
eval_metrics_file = "Retrain_eval_metrics.csv"  # metrics rows are appended here
num_epochs = 1  # alternative value noted by the author: 20
out_dir = "Models/"

In [None]:
# Print the loaded dataset object to check that the CSV was read.
print(dataset1)


In [None]:
import collections

# Number of distinct values found in the "Class" column.
num_labels_dataset = len(dataset1.unique("Class"))
print("Number of labels in the dataset:", num_labels_dataset)

# Tally the rows per class straight from the "Class" column.
class_counts = collections.Counter(dataset1["Class"])

print("Unique classes and their counts:")
for class_label, count in class_counts.items():
    print(class_label, count)

In [None]:
import pandas as pd
from datasets import Dataset, DatasetDict

# Number of rows taken from every class for each split.
TRAIN_ROWS_PER_CLASS = 250
TEST_ROWS_PER_CLASS = 41

# Work on a pandas copy, where per-class slicing is straightforward.
df = dataset1.to_pandas()

# Build the split class by class: the first TRAIN_ROWS_PER_CLASS rows of a
# class go to train, the last TEST_ROWS_PER_CLASS rows go to test.
unique_labels = df['Class'].unique()
train_dataframes = []
test_dataframes = []

for label in unique_labels:
    class_subset = df[df['Class'] == label]
    train_dataframes.append(class_subset.head(TRAIN_ROWS_PER_CLASS))

    # Draw the test rows only from what is left after the training rows.
    # Taking head() and tail() of the same frame put identical comments in
    # both splits whenever a class had fewer than
    # TRAIN_ROWS_PER_CLASS + TEST_ROWS_PER_CLASS rows. Larger classes get
    # exactly the same rows as before.
    remaining_rows = class_subset.iloc[TRAIN_ROWS_PER_CLASS:]
    test_dataframes.append(remaining_rows.tail(TEST_ROWS_PER_CLASS))

train_df = pd.concat(train_dataframes, ignore_index=True)
test_df = pd.concat(test_dataframes, ignore_index=True)

# Use the column names the rest of the notebook works with.
train_df = train_df.rename(columns={"Comment": "text", "Class": "label"})
test_df = test_df.rename(columns={"Comment": "text", "Class": "label"})

# Split that still carries the original string class names.
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

dataset1_split = DatasetDict({
    "train": train_dataset,
    "test": test_dataset
})

# Fixed class-name -> integer id mapping for the emotion labels.
label_mapping = {
    "Anger": 0,
    "Love": 1,
    "Fear": 2,
    "Joy": 3,
    "Sadness": 4,
    "Surprise": 5,
    "Neutral": 6
}

# Replace the class names by their integer ids. A name that is missing from
# label_mapping becomes NaN (pandas Series.map behaviour).
train_df["label"] = train_df["label"].map(label_mapping)
test_df["label"] = test_df["label"].map(label_mapping)

# Same split again, this time with integer labels.
sorted_train_dataset = Dataset.from_pandas(train_df)
sorted_test_dataset = Dataset.from_pandas(test_df)

sorted_dataset1 = DatasetDict({
    "train": sorted_train_dataset,
    "test": sorted_test_dataset
})


In [None]:
# Display the train/test split that carries the integer labels.
sorted_dataset1

In [None]:
import os

import pandas as pd

# --- Additional held-out comments -------------------------------------------
test_filepath = os.path.join(input_folder_path, "CommentsFromLang_Test.csv")
add_testDS = Dataset.from_csv(test_filepath)
df_add_testDS = add_testDS.to_pandas()
addtest_df = df_add_testDS.rename(columns={"Comment": "text", "Class": "label"})
# Snapshot taken before the mapping below, so it keeps the original label values.
addtest_dataset = Dataset.from_pandas(addtest_df)
# Class names -> integer ids; names missing from label_mapping become NaN.
addtest_df["label"] = addtest_df["label"].map(label_mapping)
sorted_addtest_dataset = Dataset.from_pandas(addtest_df)

# --- Save the per-class test split ------------------------------------------
# dataset1_split["test"] (not sorted_dataset1["test"]) is written, i.e. the
# version whose labels have not been mapped to integers yet.
sorted_test_dataset_path = os.path.join(input_folder_path, "sorted_test_dataset.csv")
dataset1_split["test"].to_csv(sorted_test_dataset_path)

# --- Merge both test sources into one CSV -----------------------------------
file1_path = os.path.join(input_folder_path, "CommentsFromLang_Test.csv")
file2_path = os.path.join(input_folder_path, "sorted_test_dataset.csv")

df1 = pd.read_csv(file1_path)
df1 = df1.rename(columns={"Comment": "text", "Class": "label"})
df2 = pd.read_csv(file2_path)

merged_df = pd.concat([df1, df2], ignore_index=True)
merged_file_path = os.path.join(input_folder_path, "test_merged_dataset.csv")
merged_df.to_csv(merged_file_path, index=False)

# --- Reload the merged file with integer labels -----------------------------
# Read back the file that was just written. The previous code opened
# "merged_dataset.csv", a name nothing in this notebook creates.
# NOTE(review): confirm no separate merged_dataset.csv was intended.
test_filepath = merged_file_path
add_testDS = Dataset.from_csv(test_filepath)
addtest_df = add_testDS.to_pandas()
addtest_df["label"] = addtest_df["label"].map(label_mapping)
merged_dataset = Dataset.from_pandas(addtest_df)


In [None]:
# Inspect the raw dataset and the merged test dataset. In a notebook cell only
# the value of the last expression is displayed.
dataset1

merged_dataset

In [None]:
from datasets import concatenate_datasets

# Evaluation pool: the per-class test split followed by the additional test
# comments, both with integer labels.
evaluation_parts = [sorted_dataset1["test"], sorted_addtest_dataset]
concatenated_dataset = concatenate_datasets(evaluation_parts)


In [None]:
from transformers import AutoTokenizer

# Checkpoint name; the tokenizer is loaded from it here and the classifier is
# loaded from the same name further down.
model_name = 'j-hartmann/emotion-english-distilroberta-base'
tokenizer = AutoTokenizer.from_pretrained(model_name)


def tokenize_function(examples):
    """Tokenize the "text" entries of a batch.

    Every sequence is padded to the tokenizer's maximum length and truncated
    when it is longer.
    """
    texts = examples["text"]
    return tokenizer(texts, padding="max_length", truncation=True)


# Tokenize the train/test split and the combined evaluation pool in batches.
# (merged_dataset is intentionally not tokenized here.)
tokenized_datasets1 = sorted_dataset1.map(tokenize_function, batched=True)
tokenized_concatenated_dataset = concatenated_dataset.map(tokenize_function, batched=True)

In [None]:
# One fixed seed so every shuffle is reproducible.
shuffle_seed = 42

# Despite the "small_" prefix no rows are dropped: each dataset is only shuffled.
small_train_dataset = tokenized_datasets1["train"].shuffle(seed=shuffle_seed)
small_eval_dataset = tokenized_datasets1["test"].shuffle(seed=shuffle_seed)
small_tokenized_concatenated_dataset = tokenized_concatenated_dataset.shuffle(seed=shuffle_seed)

In [None]:
# Display the shuffled, tokenized test split.
small_eval_dataset

In [None]:
# Display the shuffled, tokenized evaluation pool (test split + additional test comments).
small_tokenized_concatenated_dataset

In [None]:
import os

import pandas as pd
from datasets import Value, concatenate_datasets


def _drop_unlabelled(dataset):
    """Return *dataset* without the rows whose 'label' is None."""
    return dataset.filter(lambda example: example['label'] is not None)


def _with_int_labels(dataset):
    """Return *dataset* with a float64 'label' column cast to int64.

    A dataset whose label column has any other dtype is returned unchanged.
    The dtype is read from the feature itself rather than compared against
    the feature's repr string, which differs between `datasets` versions.
    """
    if dataset.features['label'].dtype != 'float64':
        return dataset
    # An explicit column cast is used because returning ints from `map` on a
    # float64 column may leave the column typed as float64.
    return dataset.cast_column('label', Value('int64'))


# Rows whose class name was not in label_mapping have no label; drop them.
filtered_dataset = _drop_unlabelled(small_tokenized_concatenated_dataset)
# Integer labels are cast here too, because this is the dataset that is handed
# to trainer.predict later on.
small_tokenized_concatenated_dataset = _with_int_labels(filtered_dataset)

small_eval_dataset = _with_int_labels(small_eval_dataset)

# NOTE(review): `small_addtest_dataset` was used here without ever being
# defined, so this cell raised NameError. It is rebuilt from
# sorted_addtest_dataset with the same tokenize / shuffle(seed=42) recipe the
# other "small_" datasets use. Confirm this is the intended source.
small_addtest_dataset = sorted_addtest_dataset.map(tokenize_function, batched=True).shuffle(seed=42)
small_addtest_dataset = _with_int_labels(_drop_unlabelled(small_addtest_dataset))

# With matching label types the two test sets can be concatenated.
combined_test_dataset = concatenate_datasets([small_eval_dataset, small_addtest_dataset])


In [None]:
from transformers import AutoModelForSequenceClassification

# Load the pretrained classifier with one output per entry of label_mapping
# (7 today) instead of a hard-coded count, and store this notebook's class
# names in the model config so the checkpoint saved after fine-tuning reports
# them for each output index.
# NOTE(review): the base checkpoint presumably ships its own label names and
# order; confirm how they line up with label_mapping when reading the
# "before training" metrics.
id2label = {index: name for name, index in label_mapping.items()}
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(label_mapping),
    id2label=id2label,
    label2id=dict(label_mapping),
)

Training hyperparameters


In [None]:
from transformers import TrainingArguments

# Directory handed to TrainingArguments as output_dir.
out_dir = "Models/"

# Training arguments with only output_dir set; a later cell builds
# `training_args` again with explicit epoch, batch-size and save settings.
training_args = TrainingArguments(output_dir=out_dir)

Evaluate

In [None]:
import numpy as np
import evaluate

metric = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Return the accuracy metric for one evaluation pass.

    Args:
        eval_pred: a (logits, labels) pair; it is unpacked into exactly
            those two values.

    Returns:
        Whatever ``metric.compute`` returns for the predicted class ids and
        the reference labels.
    """
    logits, labels = eval_pred
    # The metric expects class ids, not raw scores: convert the logits to
    # predictions by taking the highest-scoring class along the last axis.
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

# NOTE: `! export PYTORCH_CUDA_ALLOC_CONF="max_split_size_mb=20"` used to sit
# here. It was removed because `!` runs the command in its own subshell, so
# the variable never reached this Python kernel. To really configure the CUDA
# allocator, set the variable with `%env` or `os.environ` before the first
# CUDA allocation, using the documented `option:value` form (note the colon).
# TODO: confirm a suitable value.

In [None]:
from transformers import TrainerCallback, TrainerControl


class LoggingCallback(TrainerCallback):
    """Append one row of evaluation metrics to a CSV file after every epoch.

    Args:
        trainer: the Trainer used to run predictions; its ``train_dataset``
            length is also recorded in every row.
        eval_dataset: dataset passed to ``trainer.predict`` at each epoch end.
        csv_file_path: CSV file the metric rows are appended to.
        model_name: value written to the "model" column.
    """

    def __init__(self, trainer, eval_dataset, csv_file_path, model_name):
        self.trainer = trainer
        self.eval_dataset = eval_dataset
        self.csv_file_path = csv_file_path
        self.model_name = model_name

    def on_epoch_end(self, args, state, control: TrainerControl, **kwargs):
        """Predict on ``eval_dataset`` and append the resulting metrics row.

        Accuracy plus weighted precision, recall and F1 are written together
        with the model name, the train/test row counts and ``state.epoch``.
        The header line is written only when the CSV file does not exist yet.
        """
        # Imported here so the callback does not depend on a later notebook
        # cell having imported these names before training starts.
        from sklearn.metrics import accuracy_score, precision_recall_fscore_support

        eval_results = self.trainer.predict(self.eval_dataset)
        labels = eval_results.label_ids
        # Highest-scoring class per example.
        predicted_labels = np.argmax(eval_results.predictions, axis=-1)

        accuracy = accuracy_score(labels, predicted_labels)
        precision, recall, f1, _ = precision_recall_fscore_support(
            labels, predicted_labels, average='weighted'
        )

        data = {
            "model": [self.model_name],
            # Use the trainer this callback was given, not whatever global
            # happens to be called `trainer` at call time.
            "#TrainRows": [len(self.trainer.train_dataset)],
            "#TestRows": [len(self.eval_dataset)],
            "#epochs": [state.epoch],
            "accuracy": [accuracy],
            "precision": [precision],
            "recall": [recall],
            "f1": [f1]
        }

        write_header = not os.path.isfile(self.csv_file_path)
        pd.DataFrame(data).to_csv(self.csv_file_path, mode='a', header=write_header, index=False)


In [None]:
from transformers import Trainer, TrainingArguments

# Save a checkpoint at the end of every epoch; batches of 6 examples per
# device for both training and evaluation.
training_args = TrainingArguments(
    output_dir=out_dir,
    num_train_epochs=num_epochs,
    per_device_train_batch_size=6,
    per_device_eval_batch_size=6,
    save_strategy="epoch",
)

# The Trainer is built first, without the logging callback ...
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train_dataset,
    eval_dataset=small_eval_dataset,
    compute_metrics=compute_metrics,
)

# ... because the callback takes the trainer instance itself as an argument.
logging_callback = LoggingCallback(
    trainer,
    small_tokenized_concatenated_dataset,
    eval_metrics_file,
    model_name,
)
trainer.add_callback(logging_callback)

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score, precision_recall_fscore_support, roc_auc_score

# Baseline: score the model on the evaluation pool before any fine-tuning.
eval_results = trainer.predict(small_tokenized_concatenated_dataset)
logits, labels = eval_results.predictions, eval_results.label_ids
# Highest-scoring class per example.
predicted_labels = np.argmax(logits, axis=-1)

cm = confusion_matrix(labels, predicted_labels)

# Accuracy plus support-weighted precision / recall / F1.
# (No AUC is computed: roc_auc_score is imported but not called.)
accuracy = accuracy_score(labels, predicted_labels)
precision, recall, f1, _ = precision_recall_fscore_support(labels, predicted_labels, average='weighted')

for caption, value in (
    ("Accuracy:", accuracy),
    ("Precision:", precision),
    ("Recall:", recall),
    ("F1 Score:", f1),
):
    print(caption, value)


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Draw the baseline confusion matrix as an annotated heatmap.
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(cm, annot=True, fmt='g', cmap='Blues', ax=ax)
ax.set_xlabel('Predicted labels')
ax.set_ylabel('True labels')
ax.set_title('Confusion Matrix (Before Training)')
plt.show()


In [None]:
import os

import pandas as pd

# Column order of the metrics CSV.
headers = ["model", "#TrainRows", "#TestRows", "#epochs", "accuracy", "precision", "recall", "f1"]

# One row describing the baseline (pre-training) evaluation; the train-row
# and epoch columns carry placeholder strings.
data = {
    "model": [model_name],
    "#TrainRows": ["before_train"],
    "#TestRows": [len(small_tokenized_concatenated_dataset)],
    "#epochs": ["NA"],
    "accuracy": [accuracy],
    "precision": [precision],
    "recall": [recall],
    "f1": [f1]
}

df = pd.DataFrame(data, columns=headers)

# Append the row; the header line is written only when the file is created.
is_new_file = not os.path.isfile(eval_metrics_file)
df.to_csv(eval_metrics_file, mode='a', header=is_new_file, index=False)


In [None]:
import datetime

# Format shared by the two wall-clock stamps printed around the training run.
timestamp_format = '%Y-%m-%d %H:%M:%S.%f'

before_time = datetime.datetime.now()
before_time_str = before_time.strftime(timestamp_format)
print("before time_str", before_time_str)

# Fine-tune the model.
trainer.train()

after_time = datetime.datetime.now()
after_time_str = after_time.strftime(timestamp_format)
print("after time_str", after_time_str)

# Save the model to out_dir.
model.save_pretrained(out_dir)

In [None]:
# Run the Trainer's evaluation (no dataset is passed, so the Trainer uses the
# eval_dataset it was constructed with).
eval_results = trainer.evaluate()

# Read the accuracy from the returned metrics dict and display it as the
# cell's output.
validation_accuracy = eval_results["eval_accuracy"]
validation_accuracy

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix, accuracy_score, precision_recall_fscore_support, roc_auc_score

# Score the model on the evaluation pool again, this time after training.
eval_results = trainer.predict(small_tokenized_concatenated_dataset)
logits, labels = eval_results.predictions, eval_results.label_ids
# Highest-scoring class per example.
predicted_labels = np.argmax(logits, axis=-1)

cm = confusion_matrix(labels, predicted_labels)

# Accuracy plus support-weighted precision / recall / F1.
# (No AUC is computed: roc_auc_score is imported but not called.)
accuracy = accuracy_score(labels, predicted_labels)
precision, recall, f1, _ = precision_recall_fscore_support(labels, predicted_labels, average='weighted')

for caption, value in (
    ("Accuracy:", accuracy),
    ("Precision:", precision),
    ("Recall:", recall),
    ("F1 Score:", f1),
):
    print(caption, value)

# Draw the confusion matrix as an annotated heatmap.
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(cm, annot=True, fmt='g', cmap='Blues', ax=ax)
ax.set_xlabel('Predicted labels')
ax.set_ylabel('True labels')
ax.set_title('Confusion Matrix')
plt.show()

In [None]:
import os

import pandas as pd

# Column order of the metrics CSV.
headers = ["model", "#TrainRows", "#TestRows", "#epochs", "accuracy", "precision", "recall", "f1"]

# One row describing the post-training evaluation.
data = {
    "model": [model_name],
    "#TrainRows": [len(small_train_dataset)],
    "#TestRows": [len(small_tokenized_concatenated_dataset)],
    "#epochs": [num_epochs],
    "accuracy": [accuracy],
    "precision": [precision],
    "recall": [recall],
    "f1": [f1]
}

df = pd.DataFrame(data, columns=headers)

# Append the row; the header line is written only when the file is created.
is_new_file = not os.path.isfile(eval_metrics_file)
df.to_csv(eval_metrics_file, mode='a', header=is_new_file, index=False)
