In [None]:
! pip install datasets fast-fit transformers accelerate

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from datasets import load_dataset
from transformers import RobertaTokenizer
from fastfit import FastFitTrainer, sample_dataset

In [None]:
# Mount Google Drive when the notebook runs inside Google Colab. Outside Colab
# the `google.colab` package cannot be imported, so the mount is skipped
# instead of aborting the notebook.
try:
    from google.colab import drive
except ImportError:
    print("google.colab is not available - skipping the Google Drive mount.")
else:
    drive.mount('/content/drive')

In [None]:
# Environment switch: when True, `base_dir` (set in the next cell) points at
# the project folder on Google Drive; when False, the fallback value is used.
USE_COLAB = True

In [None]:
# Directory prefix for the train/val/test CSV files written and read below.
if USE_COLAB:
    base_dir = 'drive/MyDrive/Github/NLPSharedTask'
else:
    # Use '.' rather than '': the paths are built as f'{base_dir}/<file>', and
    # an empty prefix would turn them into '/<file>' at the filesystem root.
    base_dir = '.'

In [None]:
def rule_based_train_test_split(
    data: pd.DataFrame,
    label_col: str = 'label',
    test_size: float = 0.3,
    random_state: int | None = None,
    abstracts_per_label: int = 1,
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Split ``data`` into train and test sets, forcing abstracts of each label into the test set.

    For every value of ``label_col`` that has rows with ``is_abstract == 1``,
    ``abstracts_per_label`` of those rows are drawn at random and placed in the
    test set. All remaining rows are then divided by a ``train_test_split``
    stratified on ``label_col``, and the two test parts are concatenated and
    shuffled.

    Args:
        data: Frame containing at least ``label_col`` and an ``is_abstract`` column.
        label_col: Name of the column holding the class label.
        test_size: Passed to ``train_test_split`` for the rows that were not
            forced into the test set.
        random_state: Seed used for the abstract sampling, the stratified split
            and the final shuffle, so that a fixed seed gives a reproducible result.
        abstracts_per_label: Number of abstract rows per label forced into the
            test set. Defaults to 1, the number the original implementation
            sampled. NOTE(review): earlier documentation spoke of two abstracts
            per label - confirm which count is intended. The value should not
            exceed the number of abstract rows available for any label,
            otherwise the sample cannot be drawn.

    Returns:
        ``(train, test)``. ``train`` keeps the index of ``data``; ``test`` is
        shuffled and re-indexed from 0.
    """
    abstract_rows = data[data.is_abstract == 1]

    # Reserve the guaranteed abstracts of every label for the test set.
    forced_test = abstract_rows.groupby(label_col).sample(
        n=abstracts_per_label, random_state=random_state
    )

    # Rows already reserved must not take part in the regular split.
    remaining = data[~data.index.isin(forced_test.index)].copy()

    train, random_test = train_test_split(
        remaining,
        test_size=test_size,
        random_state=random_state,
        stratify=remaining[label_col],
    )

    # Shuffle with the same seed; an unseeded shuffle would make the row order
    # (and any later split of the test set) differ between runs.
    test = (
        pd.concat([forced_test, random_test])
        .sample(frac=1, random_state=random_state)
        .reset_index(drop=True)
    )

    return train, test

In [None]:
# Load the cleaned data set and make sure the labels are strings.
df = pd.read_csv('cleaned_data_with_null_with_synth.csv')
df['label'] = df['label'].astype(str)

# 70% train / 30% held out, with abstracts forced into the held-out part.
train_df, temp_df = rule_based_train_test_split(df, test_size=0.3, random_state=42)

# Halve the held-out part into validation and test sets. This is a plain
# random split, so the forced abstracts are distributed between both at random.
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

# Keep only the 'text_clean' and 'label' columns in the final datasets.
_kept_columns = ['text_clean', 'label']
train_df, val_df, test_df = (
    _split[_kept_columns] for _split in (train_df, val_df, test_df)
)

# Write each split to its CSV file (without the index column).
for _split, _csv_path in (
    (train_df, f'{base_dir}/train_data.csv'),
    (val_df, f'{base_dir}/val_data.csv'),
    (test_df, f'{base_dir}/test_data.csv'),
):
    _split.to_csv(_csv_path, index=False)

In [None]:
# FastFit tokenizes the labels and does not work with integer labels,
# so the numeric labels are converted to their English word equivalents.
import csv

# Maps each integer label 0-17 to its English word; the position of a word in
# the list is the integer it stands for.
NUMBER_MAPPINGS = dict(enumerate([
    "zero",
    "one",
    "two",
    "three",
    "four",
    "five",
    "six",
    "seven",
    "eight",
    "nine",
    "ten",
    "eleven",
    "twelve",
    "thirteen",
    "fourteen",
    "fifteen",
    "sixteen",
    "seventeen",
]))

def convert_integers_to_strings(file_path, output_path, mapping=None):
    """Rewrite a CSV file, replacing integers in its second column with words.

    Every row that has at least two fields and whose second field parses as an
    integer gets that field replaced by ``mapping[int(field)]``. Rows whose
    second field is not an integer (for example the header row) and rows with
    fewer than two fields are written back unchanged.

    Args:
        file_path: Path of the CSV file to read.
        output_path: Path of the CSV file to write. May be the same as
            ``file_path``: the input is read completely before the output
            file is opened.
        mapping: Optional dict from integer to replacement string. When
            omitted, the module-level ``NUMBER_MAPPINGS`` is used.

    Raises:
        KeyError: If an integer in the second column has no entry in the mapping.
    """
    if mapping is None:
        mapping = NUMBER_MAPPINGS

    # Read and write as UTF-8 explicitly so the text column does not depend on
    # the platform's default encoding.
    with open(file_path, mode='r', newline='', encoding='utf-8') as file:
        rows = list(csv.reader(file))

    for row in rows:
        if len(row) < 2:
            continue
        try:
            number = int(row[1])
        except ValueError:
            # Not an integer (header or non-numeric label): leave the row as is.
            continue
        row[1] = mapping[number]

    with open(output_path, mode='w', newline='', encoding='utf-8') as file:
        csv.writer(file).writerows(rows)


# Convert the label column of every split file in place. The paths carry the
# `base_dir` prefix so that the converted files are the ones that were written
# by the split cell and that are loaded for training.
for path in ['train_data.csv', 'val_data.csv', 'test_data.csv']:
    split_path = f'{base_dir}/{path}'
    convert_integers_to_strings(split_path, split_path)

In [None]:
# Load the three CSV files as the named splits 'train', 'validation' and 'test'.
split_files = {
    'train': f'{base_dir}/train_data.csv',
    'validation': f'{base_dir}/val_data.csv',
    'test': f'{base_dir}/test_data.csv',
}
dataset = load_dataset('csv', data_files=split_files)

# Settings for the FastFit trainer, gathered in one place. The column names
# match the columns kept when the split files were written.
trainer_settings = dict(
    model_name_or_path="allenai/scibert_scivocab_cased",
    label_column_name="label",
    text_column_name="text_clean",
    num_train_epochs=40,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64,
    max_text_length=128,
    dataloader_drop_last=False,
    num_repeats=4,
    optim="adafactor",
    clf_loss_factor=0.1,
    fp16=True,
)
trainer = FastFitTrainer(dataset=dataset, **trainer_settings)

In [None]:
# Run training with the trainer configured above. The returned object is kept
# as `model` so that it can be saved in a later cell.
model = trainer.train()

In [None]:
# Evaluate the trained model and report the accuracy as a percentage with one
# decimal place.
results = trainer.evaluate()
accuracy_percent = 100 * results["eval_accuracy"]
print("Accuracy: {:.1f}%".format(accuracy_percent))

In [None]:
# Persist the trained model via `save_pretrained`.
# NOTE(review): this target path is hard-coded and, unlike the data paths, does
# not depend on `base_dir` / `USE_COLAB` - confirm that this is intended.
model.save_pretrained("drive/MyDrive/scibert_all")