# This notebook contains a script for fine-tuning the gpt-3.5-turbo-0125 model on the Environmental Injustice datasets
## Installing the required packages

In [13]:
!pip install openai==1.12.0
!pip install pandas
!pip install beautifulsoup4
!mkdir train_data
!pip install python-dotenv
!pip install nltk


mkdir: train_data: File exists


## Importing required packages

In [15]:
import ssl
import nltk
import pandas as pd
import re
import requests
from bs4 import BeautifulSoup
from dotenv import load_dotenv
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from openai import OpenAI
import os
import json

# Download NLTK resources.
# The downloads are attempted with an unverified HTTPS context (when the running
# Python exposes one). The override is limited to the downloads themselves: the
# previous default context factory is restored afterwards so certificate
# verification is not left disabled for the rest of the kernel session.
_previous_https_context = getattr(ssl, "_create_default_https_context", None)
try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    # No unverified-context factory available; download with the defaults.
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

try:
    nltk.download('punkt')
    nltk.download('stopwords')
finally:
    if _previous_https_context is not None:
        ssl._create_default_https_context = _previous_https_context

# Load environment variables from .env file
load_dotenv()


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/rajashreedahal/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/rajashreedahal/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## Loading the training data and extracting content from the source URLs

In [20]:

# Access the API key from the environment
openai_api_key = os.getenv("OPENAI_API_KEY")
train_data=pd.read_csv("EJ_datasets.csv",encoding='latin1')

# Positional indices of the CSV columns used below.
# NOTE(review): the columns are addressed by position, so any change to the CSV
# layout silently shifts these fields -- confirm against the file's header.
_URL_COLUMN = 11
_FIELD_COLUMNS = {
    "Indicators": 1,
    "Geographic_Coverage": 4,
    "Format": 5,
    "Spatial_Resolution": 7,
    "Temporal_Resolution": 8,
    "Temporal_Extent": 9,
}
# Upper bound (seconds) for each page fetch, so one unresponsive host cannot
# hang the whole cell.
_REQUEST_TIMEOUT_SECONDS = 30


def _extract_clean_text(html_page, stop_words):
    """Return the visible text of an HTML page as a cleaned, space-joined token string.

    Args:
        html_page: Raw HTML markup of the page.
        stop_words: Collection of words to drop from the token stream.

    Returns:
        Lower-cased alphanumeric tokens that are not stop words, joined by
        single spaces.
    """
    soup = BeautifulSoup(html_page, "html.parser")
    text = soup.get_text().lower()
    # Tokenization; keep only purely alphanumeric tokens (drops punctuation).
    tokens = [word.lower() for word in word_tokenize(text) if word.isalnum()]
    # Remove stopwords
    tokens = [word for word in tokens if word not in stop_words]
    text = " ".join(tokens)
    text = re.sub(r'[\t\n\r\f\v]+', '', text)
    text = re.sub(r'[^\w\s]', '', text)
    return text


# Keep only the rows whose source URL contains "doi", keyed by that URL.
final_data={}
for _, row in train_data.iterrows():
    url = row.iloc[_URL_COLUMN]
    # Empty cells are read by pandas as NaN (a float); a substring test on a
    # float raises TypeError, so non-string values are skipped explicitly.
    if isinstance(url, str) and "doi" in url:
        final_data[url] = {field: row.iloc[position] for field, position in _FIELD_COLUMNS.items()}

# The stop-word set does not change between pages, so build it once.
stop_words = set(stopwords.words('english'))

for url, value in final_data.items():
    response = requests.get(url, timeout=_REQUEST_TIMEOUT_SECONDS)
    if not response.ok:
        # The page body is still stored (as before), but make it visible that
        # the text for this entry comes from an error response.
        print(f"Warning: {url} returned HTTP {response.status_code}; extracted text may be an error page")
    value["text"] = _extract_clean_text(response.text, stop_words)



## Prompt preparation for creating the training file needed for fine-tuning. This requires writing the training records to a JSONL file (`EJ_datasets_finetuning3.jsonl`) and uploading that file to OpenAI with the `fine-tune` purpose

In [17]:
# System prompt sent with every training example.
# The regex fragments are written as raw strings so that sequences such as \d
# are not treated as (invalid) string escapes; the resulting text is unchanged.
# NOTE(review): the pieces are concatenated without separators, so consecutive
# sentences run together (e.g. "...requests.Metadata extraction..."). The text is
# left exactly as-is here because it is the prompt the model is trained with;
# any change must be mirrored wherever the fine-tuned model is queried.
_SYSTEM_PROMPT = (
    "Extract metadata and entities details accurately from my requests."
    "Metadata extraction for Indicators should clearly be one of the element in the list ['Disasters','Human Dimensions','Food Availability','Health & Air Quality','Water Availability', 'Extreme Heat','Urban Flooding','Climate Change']. Do not create new element for indicators "
    "Geographic_Coverage: If multiple countries, write global, else write name of location"
    "Make sure the following fields follow the following regex pattern"
    r"Spatial_Resolution: ^(\d+(\.\d+)? [a-zA-Z]+|varies|N/A)$"
    r"Temporal_Resolution: ^(\d+(\.\d+)? [a-zA-Z]+|N/A|varies|weekly|monthly|daily|yearly|varies-multiple datasets included|Daily < Weekly|Hourly < Daily|Weekly < Monthly|Monthly < Yearly|1 minute)$"
    r"Temporal_Extent: ^(\d{4}-\d{2}-\d{2} to present|present|\d{4}-\d{2}-\d{2} (?:to|until) present|\d{4}-\d{2}-\d{2} to \d{4}-\d{2}-\d{2}|varies(?:- multiple datasets (?:included|available))?)$."
)

# Metadata fields written into the assistant message, in output order.
_OUTPUT_FIELDS = (
    "Indicators",
    "Geographic_Coverage",
    "Format",
    "Spatial_Resolution",
    "Temporal_Resolution",
    "Temporal_Extent",
    "Latency",
)
_TRAINING_FILE_PATH = "EJ_datasets_finetuning3.jsonl"

# One chat-format record per scraped page: system prompt, page text as the user
# message, and the known metadata as the expected assistant answer.
train_data=[]
for url,value in final_data.items():
    # Only include fields that are present for this entry. Indexing a missing
    # key directly (e.g. "Latency", which the loading cell does not populate)
    # raised KeyError and aborted the whole cell.
    output_content = {field: value[field] for field in _OUTPUT_FIELDS if field in value}
    # NOTE(review): str() produces a Python dict repr (single-quoted keys), not
    # JSON. Kept unchanged because it defines the answer format the model learns.
    output_content = str(output_content)
    data={"messages": [{"role": "system", "content":_SYSTEM_PROMPT},{"role": "user", "content":str(value["text"])}, {"role": "assistant", "content": output_content}]}
    train_data.append(data)

# Write the records as JSON Lines: one JSON object per line.
with open(_TRAINING_FILE_PATH, 'w') as jsonl_file:
    for record in train_data:
        jsonl_file.write(json.dumps(record) + '\n')

client = OpenAI(api_key=openai_api_key)

# Upload the training file; the context manager closes the handle once the
# upload call returns instead of leaving it open for the kernel's lifetime.
with open(_TRAINING_FILE_PATH, "rb") as training_file_handle:
    uploaded_file = client.files.create(
        file=training_file_handle,
        purpose="fine-tune"
    )

# Display the returned file object (its id is needed to start the fine-tuning job).
uploaded_file

FileObject(id='file-x2zHzXBaBuQmB1I1TBMsdBCY', bytes=380447, created_at=1711047797, filename='EJ_datasets_finetuning3.jsonl', object='file', purpose='fine-tune', status='processed', status_details=None)

## Using the uploaded file ID as `training_file` to fine-tune the gpt-3.5-turbo-0125 model for 10 epochs

In [18]:

# Start the fine-tuning job on the uploaded training file.
# The file ID below is the one shown in the output of the upload cell.
TRAINING_FILE_ID = "file-x2zHzXBaBuQmB1I1TBMsdBCY"
BASE_MODEL = "gpt-3.5-turbo-0125"
fine_tune_hyperparameters = {"n_epochs": 10}

client.fine_tuning.jobs.create(
    training_file=TRAINING_FILE_ID,
    model=BASE_MODEL,
    hyperparameters=fine_tune_hyperparameters,
)

FineTuningJob(id='ftjob-W7w5swDcBEcBUlACxmoXMhZZ', created_at=1711047822, error=Error(code=None, message=None, param=None, error=None), fine_tuned_model=None, finished_at=None, hyperparameters=Hyperparameters(n_epochs=10, batch_size='auto', learning_rate_multiplier='auto'), model='gpt-3.5-turbo-0125', object='fine_tuning.job', organization_id='org-KVpMEM9Q8Xm2pc8YuAMYw8r3', result_files=[], status='validating_files', trained_tokens=None, training_file='file-x2zHzXBaBuQmB1I1TBMsdBCY', validation_file=None, user_provided_suffix=None)