In [14]:
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from rouge import Rouge
import torch
from lexical_diversity import lex_div as ld

In [39]:
model_name = 'declare-lab/flan-alpaca-large'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

df = pd.read_excel('/Users/inigoparra/Desktop/scripting.xlsx')
rouge = Rouge()
results_df = pd.DataFrame()

In [40]:
for index, row in df.iterrows():
    prompt = row['History']
    reference = "History, as a study of past events and societies, provides a comprehensive understanding of human civilization, cultural evolution, political changes, technological advancements, and social transformations."

    if pd.isna(prompt):  
        continue  

    # Encode & Decode
    inputs = tokenizer.encode(prompt + tokenizer.eos_token, return_tensors="pt")
    attention_mask = torch.ones(inputs.shape, dtype=torch.long)  # Create attention mask
    outputs = model.generate(inputs, attention_mask=attention_mask, max_length=50, num_return_sequences=1, temperature=0.7)
    response = tokenizer.decode(outputs[0], skip_special_tokens=False)

    input_ids = tokenizer.encode(response, return_tensors='pt')
    attention_mask = torch.ones(input_ids.shape, dtype=torch.long)

    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask, labels=input_ids)
        
    loss = outputs.loss
    perplexity = torch.exp(loss)
    
    scores = rouge.get_scores(response, reference)

    words = response.split()
    mtld_score = ld.mtld(words)

    response_length = len(response)

    result = {
        'Prompt': prompt,
        'Response': response,
        'Perplexity': perplexity.item(),
        'Rouge-L Recall': scores[0]['rouge-l']['r'],
        'Rouge-L Precision': scores[0]['rouge-l']['p'],
        'Rouge-L F1': scores[0]['rouge-l']['f'],
        'MTLD': mtld_score,
        'Response Length': response_length
    }
    
    results_df = results_df.append(result, ignore_index=True)


results_df.to_excel('results.xlsx', index=False)

  results_df = results_df.append(result, ignore_index=True)
  results_df = results_df.append(result, ignore_index=True)
  results_df = results_df.append(result, ignore_index=True)
  results_df = results_df.append(result, ignore_index=True)
  results_df = results_df.append(result, ignore_index=True)
  results_df = results_df.append(result, ignore_index=True)
  results_df = results_df.append(result, ignore_index=True)
  results_df = results_df.append(result, ignore_index=True)
  results_df = results_df.append(result, ignore_index=True)
  results_df = results_df.append(result, ignore_index=True)
