In [None]:
from transformers import AutoTokenizer, BitsAndBytesConfig, pipeline, AutoModelForCausalLM
import torch
import json
from pathlib import Path
import numpy as np
import pandas as pd

In [None]:
# Authenticate with the Hugging Face Hub so gated checkpoints can be downloaded.
import os

from huggingface_hub import login

# Never paste the token into the notebook: read it from the HF_TOKEN
# environment variable instead. An unset or empty variable becomes None,
# in which case login() is expected to fall back to its interactive prompt
# rather than being handed an invalid empty token.
hf_api_key = os.environ.get("HF_TOKEN") or None
login(token=hf_api_key)

In [None]:
# Colab-only step: attach the user's Google Drive to the runtime's filesystem
# so that files stored there can be opened with ordinary paths.
from google.colab import drive

drive.mount(mountpoint='/content/gdrive')

In [None]:
# Switch the working directory to the project folder so that the relative
# file names used by the later cells resolve there.
import os

# Fill in the folder that holds the input workbook (for example a directory
# under the mounted Drive). Left empty, the current directory is kept:
# os.chdir("") would raise FileNotFoundError.
WORK_DIR = ""

if WORK_DIR:
    os.chdir(WORK_DIR)

In [None]:
# Read the full terminology workbook into memory. The path is relative, so it
# is resolved against the current working directory.
file_path = "XBRL Terminology.xlsx"
df_all = pd.read_excel(io=file_path)

In [None]:
# Work on the first 500 rows only (fewer if the workbook is shorter).
df = df_all.iloc[:500]

In [None]:
# Load the Llama 3 8B Instruct checkpoint and its tokenizer onto the best
# available device.
MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"

device = "cuda" if torch.cuda.is_available() else "cpu"
print(device)

# Without an explicit dtype the weights are materialised in float32, which
# for an 8B-parameter model is roughly 32 GB and does not fit on most
# single GPUs. Use half precision on GPU (bfloat16 where the hardware
# supports it, float16 otherwise) and keep float32 on CPU.
if device == "cuda":
    model_dtype = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16
else:
    model_dtype = torch.float32

model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype=model_dtype).to(device)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
print("load done!")

In [None]:
# Create the results workbook up front, containing only the header row, so
# that the generation loop can append one row per term to an existing file.
# NOTE(review): the file name says "7b" while the checkpoint loaded in this
# notebook is the 8B model — confirm whether the name should be updated.
excel_path = 'xbrl_terms_explanations_llama3_7b_500.xlsx'

df_title = pd.DataFrame(columns=['Term', 'Generated Explanation'])
df_title.to_excel(excel_writer=excel_path, index=False)

In [None]:
# Generate an explanation for every term with the chat model and append each
# result to the Excel workbook as soon as it is produced, so that partial
# progress survives an interrupted run.

# Skip empty cells and coerce the rest to text: a missing or numeric cell
# would otherwise make the string concatenation below raise TypeError.
terms = df['Term'].dropna().astype(str).tolist()

prompt = "You are an expert in the financial field with deep expertise in the eXtensible Business Reporting Language (XBRL) standard. Please provide detailed explanations of the following XBRL terms: "

generated_explanations = []
for i, term in enumerate(terms):
    input_text = prompt + term
    messages = [{"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": input_text}]
    text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

    # The chat template has already inserted every special token (including
    # the beginning-of-text marker); letting the tokenizer add them again
    # would duplicate them at the start of the prompt.
    model_inputs = tokenizer([text], return_tensors="pt", add_special_tokens=False).to(device)

    # Pass the whole encoding so generate() also receives the attention mask,
    # and name the padding token explicitly to avoid a per-call fallback.
    generated_ids = model.generate(
        **model_inputs,
        max_new_tokens=256,
        pad_token_id=tokenizer.eos_token_id,
    )

    # generate() returns prompt + continuation; keep only the new tokens.
    prompt_length = model_inputs.input_ids.shape[1]
    explanation = tokenizer.decode(generated_ids[0, prompt_length:], skip_special_tokens=True)
    generated_explanations.append(explanation)
    print(f"{i} + {explanation}")

    # Column names mirror the header row of the workbook; they are not
    # written again because header=False.
    new_row = pd.DataFrame({'Term': [term], 'Generated Explanation': [explanation]})

    # Re-open the workbook in append mode and write the row directly below
    # the last populated row of the sheet.
    with pd.ExcelWriter(excel_path, engine='openpyxl', mode='a', if_sheet_exists='overlay') as writer:
        startrow = writer.sheets['Sheet1'].max_row
        new_row.to_excel(writer, index=False, header=False, startrow=startrow)
    print(f"{i}" + " saved")

print("finished")