In [7]:
import torch, json, os
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import pandas as pd

In [8]:
models_base = '/Volumes/SSK2tb/nlp'
# all the models are ~10gbs

In [40]:
base_models = ['muppet','scibert']
finetuned_variants = [
    'no_synth'
    ,'llama'
    ,'gpt-4'
    ,'ensemble'
    ,'mixtral'
]
models = [f"{b}_{v}" for b in base_models for v in finetuned_variants]
# base = base_models[1]
# fine = [finetuned_variants[2], finetuned_variants[3], finetuned_variants[4]]
# models = [f"{base}_{v}" for v in fine]



In [41]:
data_file_path = 'testdata_results.jsonl'
data = []
with open(data_file_path, 'r') as file:
    for line in file:
        data.append(json.loads(line))

In [42]:
# Extract abstracts and labels
abstracts = [entry['ABSTRACT'] for entry in data]
titles = [entry['TITLE'] for entry in data]
true_labels = [entry['SDG'] for entry in data]

In [43]:
actual_data = pd.read_csv('/Users/andrew_yos/Downloads/PROCESSED_ZORA_TEST.csv')
abstracts = actual_data['PROCESSED']
# titles = [entry['TITLE'] for entry in data]
true_labels = actual_data['SDG']

In [44]:
from main_text_preprocessing import process_text
def predict_using_model(model_path, conc=False):
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model = AutoModelForSequenceClassification.from_pretrained(model_path)
    def classify_text(text):
        # preprocess the text
        # text = process_text(text)
        
        # Tokenize the input text
        inputs = tokenizer(text, return_tensors="pt", max_length=512, truncation=True, padding='max_length')
    
        # Get the model predictions
        with torch.no_grad():
            try:
                outputs = model(**inputs)
            except Exception as e:
                print(f'Error for text {text}: {e}')
                raise e
                
    
        # Get the predicted class
        predictions = torch.argmax(outputs.logits, dim=1)
        return predictions.item()
    
    predicted_labels = [classify_text(a) for a in abstracts]
    # if conc:
    #     concatenated_texts = [title + " " + abstract for title, abstract in zip(titles, abstracts)]
    #     predicted_labels = [classify_text(a) for a in concatenated_texts]
    # else:
    #     predicted_labels = [classify_text(a) for a in abstracts]
     
    return predicted_labels
    

In [45]:
predictions_file_path = 'all_predictions_preprocessed_data.csv'
def predict_all_and_save():
    # Initialize a dictionary to hold predictions
    all_predictions = {'truth': true_labels}

    for model in models:
        print(f"Predicting using model {model}")
        path = os.path.join(models_base, model)
        predictions = predict_using_model(path, abstracts)
        all_predictions[model] = predictions

    # Create a DataFrame from the predictions dictionary
    df = pd.DataFrame(all_predictions)
    # path = os.path.join(models_base, predictions_file_path)
    # Save the DataFrame to a CSV file
    df.to_csv(predictions_file_path, index=False)
predict_all_and_save()

Predicting using model muppet_no_synth
Predicting using model muppet_llama
Predicting using model muppet_gpt-4
Predicting using model muppet_ensemble
Predicting using model muppet_mixtral
Predicting using model scibert_no_synth
Predicting using model scibert_llama
Predicting using model scibert_gpt-4
Predicting using model scibert_ensemble
Predicting using model scibert_mixtral


In [38]:
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def calculate_metrics(df, variant):
    # Extract true and predicted labels
    y_true = df['truth']
    y_pred = df[variant]
    
    # Calculate accuracy
    accuracy = accuracy_score(y_true, y_pred)
    
    # Calculate macro metrics
    macro_precision = precision_score(y_true, y_pred, average='macro')
    macro_recall = recall_score(y_true, y_pred, average='macro')
    macro_f1 = f1_score(y_true, y_pred, average='macro')
    
    # Calculate weighted metrics
    weighted_precision = precision_score(y_true, y_pred, average='weighted')
    weighted_recall = recall_score(y_true, y_pred, average='weighted')
    weighted_f1 = f1_score(y_true, y_pred, average='weighted')
    
    # Create a dictionary to store the results
    metrics = {
        'accuracy': accuracy,
        'macro_precision': macro_precision,
        'macro_recall': macro_recall,
        'macro_f1-score': macro_f1,
        'weighted_precision': weighted_precision,
        'weighted_recall': weighted_recall,
        'weighted_f1-score': weighted_f1
    }
    
    return metrics

# Example usage:
# df = pd.DataFrame({'truth': [...], 'muppet_gpt-4': [...]})
# metrics = calculate_metrics(df)
# print(metrics)
p = os.path.join(models_base, predictions_file_path)
print(f"reading {p}")
df = pd.read_csv(p)
all_metrics = []
for f in fine:
    metrics = calculate_metrics(df, f'{base}_{f}')
    all_metrics.append(metrics)
    
sep = '\t\t'

headers = ['model_name'] + [str(m) for m in all_metrics[0].keys()]  

print(sep.join(headers))
for i,m in enumerate(all_metrics):
    print(sep.join([fine[i]]+ [str(m[h]) for h in headers[1:]]))

reading /Volumes/SSK2tb/nlp/predictions_preprocessedtext_submitted_models.csv
model_name		accuracy		macro_precision		macro_recall		macro_f1-score		weighted_precision		weighted_recall		weighted_f1-score
gpt-4		0.47435897435897434		0.4649462165236193		0.5205488621151272		0.4438127458960792		0.6459375954607063		0.47435897435897434		0.5149256637237406
ensemble		0.44871794871794873		0.41030121838945366		0.5215751896474788		0.41486932820266154		0.6226075552150213		0.44871794871794873		0.48155492655492654
mixtral		0.5		0.43493533215755437		0.4919009370816599		0.41268102712353216		0.6413495455162123		0.5		0.5350638289387892


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [29]:
all_metrics[1:]

[{'accuracy': 0.44871794871794873,
  'macro_precision': 0.41030121838945366,
  'macro_recall': 0.5215751896474788,
  'macro_f1-score': 0.41486932820266154,
  'weighted_precision': 0.6226075552150213,
  'weighted_recall': 0.44871794871794873,
  'weighted_f1-score': 0.48155492655492654},
 {'accuracy': 0.5,
  'macro_precision': 0.43493533215755437,
  'macro_recall': 0.4919009370816599,
  'macro_f1-score': 0.41268102712353216,
  'weighted_precision': 0.6413495455162123,
  'weighted_recall': 0.5,
  'weighted_f1-score': 0.5350638289387892}]