In [4]:
import pandas as pd
from transformers import pipeline, AutoModelForSequenceClassification, AutoTokenizer

# Classify every row of the processed test set with the fine-tuned SciBERT
# checkpoint and store one integer class id per row in the 'GPT-4' column.

# Load the trained model
model_name = 'gpt-4'
model = AutoModelForSequenceClassification.from_pretrained(f'models/scibert_{model_name}')

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained('allenai/scibert_scivocab_uncased')

nlp = pipeline("text-classification", model=model, tokenizer=tokenizer, framework='pt')  # device=0

# Load data
test_df = pd.read_csv('PROCESSED_ZORA_TEST.csv')

# Process the data in batches: hand the pipeline the whole list so it can batch
# the forward passes, instead of invoking it once per row through Series.apply.
batch_size = 32
# Inputs longer than the encoder's position limit would fail inside the model,
# so truncate explicitly; the limit is read from the model config rather than
# relying on the tokenizer to know it.
max_length = getattr(model.config, 'max_position_embeddings', 512)

texts = test_df['PROCESSED'].tolist()
if texts:
    # For list input the pipeline returns one {'label', 'score'} dict per text.
    predictions = nlp(texts, batch_size=batch_size, truncation=True, max_length=max_length)
else:
    predictions = []

# Invert the model's own id2label table (the source of the label strings the
# pipeline emits) instead of hard-coding the number of classes.
label_to_id = {label: int(idx) for idx, label in model.config.id2label.items()}
test_df['GPT-4'] = [label_to_id[pred['label']] for pred in predictions]

# Optional: Save the predictions
test_df.to_csv('GPT_4_FINAL_PREDICTIONS.csv', index=False)

In [13]:
from sklearn.metrics import accuracy_score, f1_score

In [16]:
# Score the 'GPT-4' predictions against the 'SDG' reference column.
# Prints accuracy first, then macro-averaged F1, one value per line.
reference_labels = test_df['SDG'].to_list()
predicted_labels = test_df['GPT-4'].to_list()

for metric, options in ((accuracy_score, {}), (f1_score, {'average': 'macro'})):
    print(metric(reference_labels, predicted_labels, **options))

0.46153846153846156
0.4422980485642703
