# Fine-tuning
1. Preparation of the training & validation dataset to fine-tune the model
2. Using the fine-tuned model with the test dataset

In [8]:
import json #for the format
import tiktoken # for token counting
import numpy as np
from collections import defaultdict
import PyPDF2
import nltk
import pdfplumber
import nltk
nltk.download('punkt')
import os

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/mariajahnke/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Extracting and tokenising ESG reports

The following code block converts PDF files to text. For each PDF file, `pdf_path` (input) and `text_file_path` (output) must be set manually.

Converting process:
- loading each ESG pdf individually from pdf_path 
- extracting text with `PyPDF2.PdfReader` (file opened in binary read mode, 'rb') and tokenising it with `nltk.word_tokenize`
- output one txt file per ESG report to text_file_path

In [None]:
# Transform PDF to txt
def extract_text(pdf_path):
    """Extract the text of a PDF and return it as one space-joined token string.

    Each page is read with PyPDF2 and split into tokens with
    ``nltk.word_tokenize``; the tokens of all pages are then joined with
    single spaces. The original line breaks and spacing of the PDF are
    therefore not preserved.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The tokenised text of the whole document as a single string. Pages
        without extractable text contribute nothing.
    """
    all_tokens = []

    # The pages are read while the file handle is still open.
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)

        for page in reader.pages:
            # Extract only once per page and fall back to '' when the page
            # yields no text.
            page_text = page.extract_text() or ''
            all_tokens.extend(nltk.word_tokenize(page_text))

    # No token limit is applied here; the full document is returned.
    return ' '.join(all_tokens)

# Path to your PDF file, needs to be changed for each pdf file manually
pdf_path = 'esg_reportings/Adidas.pdf'

# Extracted text from the PDF
extracted_text = extract_text(pdf_path)

# Path for the output text file, needs to be changed together with pdf_path
text_file_path = 'finetuning-input/adidas.txt'

# Create the output directory if it is missing so that open() below does not
# fail with FileNotFoundError on a fresh checkout.
output_dir = os.path.dirname(text_file_path)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

# Write the extracted text to a new text file
with open(text_file_path, 'w', encoding='utf-8') as file:
    file.write(extracted_text)

# Confirmation once the file has been written
print(f"Text has been extracted and saved to {text_file_path}.")


# Validating text length

Validating the token length of the ESG txt files:
- setting the maximum number of tokens per ESG report to 15,000
- counting tokens with tokenizer for GPT-3.5 using `tiktoken.get_encoding("cl100k_base")`
- iterating over each txt file in the directory using `os.listdir(directory)`
- if the file is too long, the file name will be printed to the console and can then be shortened manually




In [9]:
# Check which reports need to be manually shortened


def verify_text_file_length(input_file_path, max_tokens=15000):
    """Print a warning when a text file is longer than ``max_tokens`` tokens.

    The file content is tokenised with tiktoken's ``cl100k_base`` encoding and
    the number of tokens is compared against ``max_tokens``. Nothing is
    printed when the file is within the limit.

    Args:
        input_file_path: Path of the text file to check.
        max_tokens: Maximum number of tokens the file may contain.

    Raises:
        FileNotFoundError: If ``input_file_path`` does not exist.
        UnicodeDecodeError: If none of the candidate encodings can decode the
            file.
    """
    # Initialize the GPT-3.5 tokenizer
    enc = tiktoken.get_encoding("cl100k_base")  # Adjust this if a different encoding is needed for GPT-3.5

    def read_file_with_fallback(path):
        """Return the file content, trying several encodings in order."""
        # NOTE: 'latin1' maps every possible byte to a character, so it never
        # fails; the encodings listed after it are only reached if the order
        # is changed.
        encodings = ['utf-8', 'latin1', 'utf-16', 'cp1252']
        last_error = None
        for encoding in encodings:
            try:
                with open(path, 'r', encoding=encoding) as file:
                    return file.read()
            except UnicodeDecodeError as error:
                # Remember the failure and try the next encoding. A missing
                # file is not caught here: retrying cannot help, and the
                # FileNotFoundError tells the caller what is wrong.
                last_error = error
        # UnicodeDecodeError cannot be built from a message alone (its
        # constructor takes five arguments), so re-raise the last real
        # decoding error instead.
        raise last_error

    # Read the content of the input file
    text = read_file_with_fallback(input_file_path)

    # Tokenize the text
    tokens = enc.encode(text)

    # Report files that exceed the limit so they can be shortened manually
    if len(tokens) > max_tokens:
        print(f"The file {input_file_path} exceeded {max_tokens} tokens and needs to be shortened.")


directory = './finetuning-input'

# Run the length check on every regular file in the input directory;
# anything that is not a file (e.g. a sub-directory) is skipped.
for filename in os.listdir(directory):
    file_path = os.path.join(directory, filename)
    if not os.path.isfile(file_path):
        continue
    verify_text_file_length(input_file_path=file_path)


# Construction of training & validation dataset 
Constructing the training and validation JSONL using the inputs:
- shortened company txt file embedded into OpenAI instructions
- G1-3 regulation txt file embedded into OpenAI instructions
- `user_prompt` and expected `assistant_answer` from [finetuning_training.csv](finetuning/finetuning_training.csv) and [finetuning_validation.csv](finetuning/finetuning_validation.csv)

In [14]:
import csv
import json

def generate_jsonl(input_csv_file_path, output_json_file_path):
    """Build a chat-format JSONL file from a semicolon-separated CSV.

    Every CSV row becomes one line of the output file: a JSON object with a
    ``messages`` list holding a system message (instructions, regulations text
    and the row's ESG report text), the user prompt and the expected assistant
    answer.

    Args:
        input_csv_file_path: Path of the CSV file. It must contain the columns
            ``report_txt_path``, ``user_prompt`` and ``assistant_answer``.
        output_json_file_path: Path of the JSONL file to create or overwrite.

    Raises:
        FileNotFoundError: If the CSV, the regulations file or a report file
            referenced by a row does not exist.
        KeyError: If a required column is missing from the CSV.
    """
    # 'utf-8-sig' reads UTF-8 with or without a byte-order mark, so a BOM does
    # not end up inside the first column name. newline='' is what the csv
    # module expects for files passed to a reader.
    with open(input_csv_file_path, 'r', encoding='utf-8-sig', newline='') as csv_file:
        rows = list(csv.DictReader(csv_file, delimiter=";"))

    with open("./finetuning-input/G1-3.txt", 'r', encoding='utf-8') as regulations_file:
        regulations_text = regulations_file.read()

    # Several rows may reference the same report; read each file only once.
    report_texts = {}

    with open(output_json_file_path, 'w', encoding='utf-8') as jsonl_file:
        for row in rows:
            report_file_path = row['report_txt_path']
            if report_file_path not in report_texts:
                with open(report_file_path, 'r', encoding='utf-8') as report_file:
                    report_texts[report_file_path] = report_file.read()
            report_text = report_texts[report_file_path]

            prompt_instruction = "You are an ESG analyst answering questions about the provided ESG report in regards to compliance with regulations.\n\n # REGULATIONS:\n " + regulations_text + " \n\n-----------------\n\n # ESG-REPORTING:\n" + report_text

            # Structure of one training example
            example = {
                "messages": [
                    {"role": "system", "content": prompt_instruction},
                    {"role": "user", "content": row['user_prompt']},
                    {"role": "assistant", "content": row['assistant_answer']}
                ]
            }
            # One JSON object per line to match the JSONL format
            jsonl_file.write(json.dumps(example) + '\n')


# Convert the manually created CSV datasets (training and validation) into
# their JSONL counterparts.
for csv_path, jsonl_path in (
    ("finetuning/finetuning_training.csv", "finetuning/training.jsonl"),
    ("finetuning/finetuning_validation.csv", "finetuning/validation.jsonl"),
):
    generate_jsonl(
        input_csv_file_path=csv_path,
        output_json_file_path=jsonl_path
    )

# Prompting

- creating one single thread per company -> reusing the same thread for prompt_2 and prompt_3 (chain-of-thought prompting technique)

In [21]:
#same as few_shot with using the excel file to upload it in OpenAI
from openai import OpenAI
import pandas as pd

# The API key is read from the OPENAI_API_KEY environment variable so that no
# credential is stored in the notebook. Set the variable before starting the
# kernel.
client = OpenAI(
  api_key=os.environ.get("OPENAI_API_KEY")
)

def runThread(assistant_id, thread_id, prompt, prompt_instructions):
  """Send one user prompt to an existing thread and return the assistant's reply.

  The assistant's instructions are overwritten with ``prompt_instructions``,
  the prompt is appended to the thread as a user message, a run is started
  and polled until it finishes, and the text of the assistant message created
  by that run is returned.

  Args:
    assistant_id: Id of the assistant that executes the run.
    thread_id: Id of the thread the prompt is added to.
    prompt: Text of the user message.
    prompt_instructions: Instruction text for the assistant.

  Returns:
    The text value of the first content part of the assistant message that
    belongs to the run.

  Raises:
    RuntimeError: If the run produced no assistant message.
  """
  # The ids passed in are used directly; retrieving the assistant and thread
  # objects first would only return the same ids.
  # NOTE(review): the same text is stored as the assistant's instructions here
  # and passed again as additional_instructions below, so the run presumably
  # receives it twice - confirm that this is intended.
  client.beta.assistants.update(
    assistant_id=assistant_id,
    instructions=prompt_instructions
  )

  # add message to thread
  client.beta.threads.messages.create(
    thread_id=thread_id,
    role="user",
    content=prompt
  )

  run = client.beta.threads.runs.create_and_poll(
      thread_id=thread_id,
      assistant_id=assistant_id,
      additional_instructions=prompt_instructions
  )

  threadMessages = client.beta.threads.messages.list(thread_id)

  # Pick the assistant message that was produced by this run
  assistantMessage = next(
    (message for message in threadMessages
     if message.role == "assistant" and message.run_id == run.id),
    None
  )
  if assistantMessage is None:
    # Without this check the attribute access below fails on None and the
    # run's status and error details are lost.
    raise RuntimeError(
      f"Run {run.id} on thread {thread_id} produced no assistant message "
      f"(status: {run.status}, last_error: {run.last_error})"
    )
  return assistantMessage.content[0].text.value

In [22]:
import pandas as pd

def read_file_as_text(path):
    """Return the complete content of the text file at ``path``.

    The file is decoded as UTF-8, the encoding used when the report text
    files are written in this notebook, instead of the platform default.

    Args:
        path: Path of the text file to read.

    Returns:
        The file content as a single string.
    """
    with open(path, 'r', encoding='utf-8') as file:
        return file.read()

excel_file_path = "analysis/finetuning.xlsx"

# Load the Excel file; keep_default_na=False keeps empty cells as '' so that
# unanswered prompts can be detected below.
df = pd.read_excel(excel_file_path, sheet_name='Sheet1', keep_default_na=False)

regulations_text = read_file_as_text("./finetuning-input/G1-3.txt")

# Iterate over DataFrame rows
for i in df.index:

    # Rows that already have a first answer are left untouched, so their
    # report does not need to be read at all.
    if df.loc[i, 'answer_1'] != '':
        continue

    report_text = read_file_as_text(df.loc[i, 'instructions'])

    prompt_instructions = "You are an ESG analyst answering questions about the provided ESG report in regards to compliance with regulations.\n\n # REGULATIONS:\n " + regulations_text + " \n\n-----------------\n\n # ESG-REPORTING:\n" + report_text

    # One thread per row; prompt_2 and prompt_3 continue the same thread.
    thread = client.beta.threads.create()

    for step in (1, 2, 3):
        answer_column = f'answer_{step}'
        # A pre-filled answer ends the chain for this row: the remaining
        # prompts are not sent.
        if df.loc[i, answer_column] != '':
            break
        df.loc[i, answer_column] = runThread(
            assistant_id=df.loc[i, 'assistant_id'],
            thread_id=thread.id,
            prompt=df.loc[i, f'prompt_{step}'],
            prompt_instructions=prompt_instructions
        )

    # Save the DataFrame back to the Excel file after every completed row
    df.to_excel(excel_file_path, index=False, sheet_name='Sheet1')

df


Unnamed: 0,instructions,assistant_id,prompt_1,answer_1,prompt_2,answer_2,prompt_3,answer_3
0,finetuning-input/Symrise.txt,asst_VyMI4bAcjEea5eQ20D9QGubl,"Provide a short,precise and structured compreh...","Symrise: The Code of Conduct, which applies gl...",x,x,x,x
1,finetuning-input/Vonovia.txt,asst_VyMI4bAcjEea5eQ20D9QGubl,"Provide a short,precise and structured compreh...",The following analysis is about Vonovia SE.\n-...,x,x,x,x
2,finetuning-input/VW.txt,asst_VyMI4bAcjEea5eQ20D9QGubl,"Provide a short,precise and structured compreh...",The following analysis is about the ESG report...,x,x,x,x
3,finetuning-input/Zalando.txt,asst_VyMI4bAcjEea5eQ20D9QGubl,"Provide a short,precise and structured compreh...",The following analysis is about Zalando:\n- Za...,x,x,x,x
4,finetuning-input/DeutscheTelekom.txt,asst_VyMI4bAcjEea5eQ20D9QGubl,"Provide a short,precise and structured compreh...",The following analysis is about Deutsche Telek...,x,x,x,x
5,finetuning-input/Symrise.txt,asst_VyMI4bAcjEea5eQ20D9QGubl,"Provide a short,precise and structured compreh...",The following analysis is about Symrise and it...,"Provide a short,precise and structured compreh...",The following analysis is about Symrise and it...,"Provide a short,precise and structured compreh...",The following analysis is about Symrise and it...
6,finetuning-input/Vonovia.txt,asst_VyMI4bAcjEea5eQ20D9QGubl,"Provide a short,precise and structured compreh...",The following is a compliance check for Vonovi...,"Provide a short,precise and structured compreh...",The following is a compliance check for Vonovi...,"Provide a short,precise and structured compreh...",The following is an analysis of Vonovia SE's E...
7,finetuning-input/VW.txt,asst_VyMI4bAcjEea5eQ20D9QGubl,"Provide a short,precise and structured compreh...",The following analysis is about the Volkswagen...,"Provide a short,precise and structured compreh...",The following analysis is about the Volkswagen...,"Provide a short,precise and structured compreh...",The following analysis is about the Volkswagen...
8,finetuning-input/Zalando.txt,asst_VyMI4bAcjEea5eQ20D9QGubl,"Provide a short,precise and structured compreh...",The following analysis is about Zalando. The E...,"Provide a short,precise and structured compreh...",The following analysis is about Zalando. The E...,"Provide a short,precise and structured compreh...",The following analysis is about Zalando. The E...
9,finetuning-input/DeutscheTelekom.txt,asst_VyMI4bAcjEea5eQ20D9QGubl,"Provide a short,precise and structured compreh...",The following analysis is about Deutsche Telek...,"Provide a short,precise and structured compreh...",The following part is about the independence o...,"Provide a short,precise and structured compreh...",The following part is about the training progr...
