#Installations & Imports

Installations

In [None]:
!pip install setfit
!pip install optuna

Imports

In [2]:
from sentence_transformers.losses import CosineSimilarityLoss
from setfit import SetFitTrainer, SetFitModel
from datasets import Dataset

import huggingface_hub as hf_hub
import pandas as pd
import numpy as np
import torch
import csv
import re

#Set Consts

In [None]:
# Input/output locations and the Hugging Face Hub repository name.
# Training parquet file, read with pd.read_parquet when exploring the dataset.
TRAIN_FILE_PATH = '/content/sample_data/train-00000-of-00001-b943ea66e0040b18.parquet'
# Test parquet file that the final predictions are generated for.
TEST_FILE_PATH = '/content/sample_data/test-00000-of-00001-35e9a9274361daed.parquet'
# Destination of the generated submission CSV.
OUTPUT_SUBMISSION_PATH = '/content/sample_data/submission.csv'
# Name passed to push_to_hub when the trained model is uploaded.
OUTPUT_MODEL_NAME = 'GalSarid/setfit-movie-genre-sentence-t5-xl'

#Explore the Dataset

In [None]:
# Load the training parquet file and preview its first 20 rows.
base_df = pd.read_parquet(TRAIN_FILE_PATH, engine='pyarrow')
base_df.head(20)

Check label distribution

In [5]:
def get_unique_labels(df: pd.DataFrame, group_name: str, print_labels_dist: bool) -> list:
  """Return the distinct values of column ``group_name``, in group-by order.

  When ``print_labels_dist`` is true, the number of rows, the number of
  distinct values and the size of every group are printed as well.
  """
  verbose = print_labels_dist
  if verbose:
    print(f"num of rows: {len(df)}")

  grouped = df.groupby(group_name)
  if verbose:
    print(f"num of labels: {len(grouped)}")
    print(grouped.size())

  # Iterating a group-by yields (key, sub-frame) pairs; only the key is needed.
  return [group_key for group_key, _ in grouped]

In [None]:
# Collect the distinct genre labels of the training data and print their distribution.
labels  = get_unique_labels(base_df, 'genre', print_labels_dist=True)

#Train model and create submission file

Clean synopsis

In [7]:
def remove_see_full_synopsis(raw_synopsis: str) -> str:
  """Strip non-ASCII characters and the "See full synopsis" marker.

  Args:
    raw_synopsis: Synopsis text to clean.

  Returns:
    The text with every run of non-ASCII characters removed and every
    literal "...<spaces>See full synopsis" marker deleted.
  """
  # Drop everything outside the 7-bit ASCII range first.
  clean_synopsis = re.sub(r"[^\x00-\x7F]+", "", raw_synopsis)
  # The dots are escaped so that only a literal ellipsis matches: an
  # unescaped "..." is a three-character wildcard and would also delete the
  # last three characters of the synopsis when no ellipsis precedes the
  # marker. " +" accepts any amount of padding between the two parts.
  clean_synopsis = re.sub(r"\.\.\. +See full synopsis", "", clean_synopsis)
  return clean_synopsis

In [None]:
# Sanity check: show one synopsis (looked up by its id) before and after cleaning.
test_id = 22430
test_row = base_df[base_df.id == test_id].iloc[0]
test_str = test_row['synopsis']
print(f'Orig:\n{test_str}')
print(f'Without see full synopsis:\n{remove_see_full_synopsis(test_str)}')

Create genre-id and id-genre dicts

In [None]:
# Lookup tables between genre names and integer ids; the id of a genre is
# its position in `labels`.
genre_id_mappings = {genre: idx for idx, genre in enumerate(labels)}
id_genre_mappings = dict(enumerate(labels))

print(genre_id_mappings)
print(id_genre_mappings)

Train test split

In [None]:
# Add the integer label column by mapping every genre name to its id.
base_df["genre_id"] = base_df["genre"].map(genre_id_mappings)
print(base_df.head(10))

In [None]:
# Small subsets for the hyperparameter search: a 2% sample of all rows for
# training and, from the remaining rows, a 1% sample for evaluation.
hp_train_df = base_df.sample(frac=0.02, random_state=42)
hp_test_df = base_df.drop(hp_train_df.index)
hp_test_df = hp_test_df.sample(frac=0.01, random_state=42)

# Print the label distribution (and the resulting label list) of both subsets.
print(get_unique_labels(hp_train_df, 'genre', print_labels_dist=True))
print(get_unique_labels(hp_test_df, 'genre', print_labels_dist=True))

In [None]:
# Train on a 90% sample of the rows and hold out the remaining 10%.
train_synopsis_df = base_df.sample(frac=0.9, random_state=42)
test_synopsis_df = base_df.drop(train_synopsis_df.index)
print(f'Orig trian length: {len(train_synopsis_df)}')

# Keep only the first row of every distinct synopsis in the training split.
# drop_duplicates does this in a single hashed pass; scanning rows one by one
# and testing membership in a growing list is quadratic in the row count.
train_synopsis_df = train_synopsis_df.drop_duplicates(subset='synopsis', keep='first')

print(f'After duplication cleaning: {len(train_synopsis_df)}')
print(f'Test length without cleaning: {len(test_synopsis_df)}')

Check train-test label distribution

In [None]:
# Print the label distribution of the train and held-out splits, separated by a blank line.
train_labels = get_unique_labels(train_synopsis_df, 'genre', True)
print()
test_labels = get_unique_labels(test_synopsis_df, 'genre', True)

Create text-labels dicts for training

In [None]:
def get_setfit_data_dict(df: pd.DataFrame) -> dict:
    """Convert a dataframe into a ``{'text': [...], 'label': [...]}`` dict.

    Each text entry is "<synopsis>: <synopsis>" passed through
    ``remove_see_full_synopsis``; the labels are the ``genre_id`` values.
    """
    # NOTE(review): the synopsis appears on both sides of the colon --
    # possibly another column was intended for the first part; confirm.
    texts = [
        remove_see_full_synopsis(f"{synopsis}: {synopsis}")
        for synopsis in df['synopsis']
    ]
    label_ids = list(df['genre_id'])
    return {'text': texts, 'label': label_ids}

In [15]:
# Text/label dicts for the hyperparameter-search subsets.
hp_search_data_dict_train = get_setfit_data_dict(hp_train_df)
hp_search_data_dict_test = get_setfit_data_dict(hp_test_df)

# Text/label dicts for the full train split and the held-out split.
train_data_dict = get_setfit_data_dict(train_synopsis_df)
test_data_dict = get_setfit_data_dict(test_synopsis_df)

Set model and model params

In [16]:
# Number of distinct genres.
# NOTE(review): num_classes is not referenced in the visible part of this notebook.
num_classes = len(labels)
# Pretrained checkpoint passed to SetFitModel.from_pretrained.
model_name = 'sentence-transformers/sentence-t5-xl'

Load pretrained t5-xl model

In [None]:
# Load the pretrained checkpoint as a SetFit model.
# NOTE(review): synopsis_data_model is not referenced in the visible part of
# this notebook (the search uses make_model, the final run uses t5_pretrained)
# -- confirm whether this load is still needed.
synopsis_data_model = SetFitModel.from_pretrained(model_name)

Init trainer with HP search

In [None]:
def make_model(params=None):
    """Return a newly loaded pretrained SetFit model; ``params`` is ignored."""
    return SetFitModel.from_pretrained(model_name)

# Trainer for the hyperparameter search, built on the small HP subsets. It is
# given the model factory (model_init) rather than a model instance.
hp_search_trainer = SetFitTrainer(
    model_init=make_model,
    train_dataset=Dataset.from_dict(hp_search_data_dict_train),
    eval_dataset=Dataset.from_dict(hp_search_data_dict_test),
    loss_class=CosineSimilarityLoss,
    column_mapping={"text": "text", "label": "label"},
)

In [None]:
def hyperparameter_search_function(trial):
    """Define the search space sampled for one hyperparameter-search trial.

    Args:
        trial: Trial object providing ``suggest_float`` and
            ``suggest_categorical``.

    Returns:
        Dict with the sampled ``learning_rate``, ``batch_size``,
        ``num_iterations`` and ``num_epochs``.
    """
    return {
        # Sampled on a log scale: the range spans three orders of magnitude,
        # and uniform sampling would put fewer than 1% of the draws below
        # 1e-4, leaving the small learning rates practically unexplored.
        "learning_rate": trial.suggest_float("learning_rate", 1e-5, 1e-2, log=True),
        "batch_size": trial.suggest_categorical("batch_size", [4, 8, 16]),
        "num_iterations": trial.suggest_categorical("num_iterations", [1, 2]),
        "num_epochs": trial.suggest_categorical("num_epochs", [1, 2, 4])
    }

# Run 64 search trials over the space defined above and keep the result.
best = hp_search_trainer.hyperparameter_search(hyperparameter_search_function, n_trials=64)

In [None]:
# Display the result of the hyperparameter search.
best

Load pretrained t5-xl model

In [None]:
# Load the pretrained checkpoint that the final trainer fine-tunes.
t5_pretrained = SetFitModel.from_pretrained(model_name)

Init setfit trainer

In [None]:
# Trainer for the final run: full (de-duplicated) train split for training,
# held-out split for evaluation.
t5_data_trainer = SetFitTrainer(
    model = t5_pretrained,
    train_dataset = Dataset.from_dict(train_data_dict),
    eval_dataset = Dataset.from_dict(test_data_dict),
    loss_class = CosineSimilarityLoss,
    column_mapping = {"text": "text", "label": "label"},
)

Train model with HP found

In [None]:
# Apply the hyperparameters found by the search to the final trainer.
t5_data_trainer.apply_hyperparameters(best.hyperparameters, final_model=True)
# Release cached, unused GPU memory before the full training run.
torch.cuda.empty_cache()
t5_data_trainer.train()

Evaluate model

In [None]:
# Evaluate the trained model on the trainer's eval dataset (the held-out split).
t5_data_trainer.evaluate()

Connect to HF hub and save the model

In [None]:
# Log in to the Hugging Face Hub from the notebook, then upload the trained
# model under OUTPUT_MODEL_NAME.
hf_hub.notebook_login()
t5_data_trainer.push_to_hub(OUTPUT_MODEL_NAME)

Read the test file and make predictions

In [None]:
# Load the test parquet file and predict a genre id for every synopsis.
test_df = pd.read_parquet(TEST_FILE_PATH, engine='pyarrow')
test_synopsis = test_df['synopsis']
# Format the inputs exactly like the training texts built by
# get_setfit_data_dict ("<synopsis>: <synopsis>", then cleaned). Passing only
# the bare cleaned synopsis would give the model inputs shaped differently
# from what it was trained on.
test_synopsis_clean = [
    remove_see_full_synopsis(f"{synopsis}: {synopsis}")
    for synopsis in test_synopsis.values
]

preds = t5_data_trainer.model.predict(test_synopsis_clean)
preds

Create submission csv

In [None]:
def create_submission_dataset(df: pd.DataFrame,
                              preds: list,
                              mappings: dict,
                              path: str) -> bool:
    """Write an ``id,genre`` submission CSV from model predictions.

    Args:
        df: Frame with an ``id`` column; its row order must match ``preds``.
        preds: One predicted label id per row of ``df``.
        mappings: Lookup from label id to genre name.
        path: Destination CSV path; an existing file is overwritten.

    Returns:
        True when the file was written; False (and nothing is written) when
        ``df`` and ``preds`` differ in length.
    """
    if len(df) != len(preds):
        return False

    with open(path, 'w', encoding='UTF8', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(['id', 'genre'])
        # Pair the id column with the predictions directly: a per-row
        # df.iloc[i] lookup is slow and upcasts the id in all-numeric frames.
        # Label ids missing from `mappings` yield None, written as an empty field.
        writer.writerows(
            [row_id, mappings.get(int(pred))]
            for row_id, pred in zip(df['id'], preds)
        )

    # Report success explicitly, as promised by the ``-> bool`` annotation.
    return True

In [None]:
# Write the submission file for the test-set predictions.
create_submission_dataset(test_df, preds, id_genre_mappings, OUTPUT_SUBMISSION_PATH)