# The aim of this notebook is to provide a tutorial on fine-tuning the metadata extraction process for Environmental Justice datasets
## Installing the required packages

In [1]:
!pip install openai==1.12.0
!pip install pandas
!pip install beautifulsoup4
!pip install python-dotenv
!pip install nltk



## Importing required packages

In [2]:
import ssl
import nltk
import pandas as pd
import re
import requests
from bs4 import BeautifulSoup
from dotenv import load_dotenv
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from openai import OpenAI
import os
import json

# Download NLTK resources
# The block below swaps the ssl module's default HTTPS context factory for the
# unverified one so the NLTK downloads do not fail on certificate errors.
# NOTE(review): this turns off certificate verification for every HTTPS
# connection in this kernel that relies on that default factory; prefer fixing
# the local certificate store where possible.
try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    # This Python build does not expose the helper; keep the default context.
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

# Resources needed by word_tokenize and the stopword filter used later on.
# 'punkt_tab' is the tokenizer data that recent NLTK releases load; 'punkt' is
# kept so older releases keep working as well.
for nltk_resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(nltk_resource)

# Load environment variables from .env file
load_dotenv()


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/rajashreedahal/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/rajashreedahal/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# Downloading the Environmental Justice dataset from the S3 bucket and looking at its contents

In [3]:
!aws s3 cp s3://llmworkshop/EJ_datasets.csv .
train_data=pd.read_csv("EJ_datasets.csv",encoding='latin1')
train_data.head()

download: s3://llmworkshop/EJ_datasets.csv to ./EJ_datasets.csv     


Unnamed: 0,Dataset,Indicators,Description,Description Simplified,Geographic Coverage,Format,Spatial Resolution,Spatial Resolution (Standard),Temporal Resolution,Temporal Extent,Latency,Source/Link,Project,Strengths,Limitations,Data Visualization,Intended Use,Tab Name Indicator,Indicators (Select from drop-down list),sde_link
0,ABoVE: Landsat-derived Burn Scar dNBR across A...,Disasters,This dataset contains differenced Normalized B...,The ABoVE: Landsat-derived Burn Scar dNBR acro...,"Alaska, Canada",GeoTIFF,30 meters,30 meters,,1985-01-01 to 2015-12-31,,https://dx.doi.org/10.3334/ORNLDAAC/1564,ABoVE - Arctic-Boreal Vulnerability Experiment,30-year temporal extent,Lacks recent data,,Path C,Disasters,Disasters,https://sciencediscoveryengine.nasa.gov/app/na...
1,AERONET Ground-based AOD Measurements,Health & Air Quality,The AERONET Data Display Interface allows user...,The AERONET Data Display Interface allows user...,Global,,,,,1993-01-01 to Present,NRT,https://aeronet.gsfc.nasa.gov/cgi-bin/draw_map...,AERONET - Aerosol Robotic Network,"NRT, 19-year temporal extent",,Map Viewer,Path C,Health & Air Quality,Health & Air Quality,
2,ARIA (Advanced Rapid Imaging and Analysis) DPM...,"Disasters,Urban Flooding",The Advanced Rapid Imaging and Analysis (ARIA)...,The ARIA (Advanced Rapid Imaging and Analysis)...,Puerto Rico,KML,30 meters,30 meters,,"2017-03-25, 2017-09-21",,https://appliedsciences.nasa.gov/our-impact/ne...,,Visualization available through ArcGIS viewer,Data only available for case study of Puerto R...,ArcGIS viewer,Path B,Urban Flooding,Disasters,
3,ARIA (Advanced Rapid Imaging and Analysis) Dat...,Disasters,"The ARIA Project, a joint effort of the Califo...","The ARIA Project, a joint effort of the Califo...",Global,"GeoTIFF, KMZ",Varies,Varies,Varies,Varies,NRT,https://aria-share.jpl.nasa.gov/,ARIA - Advanced Rapid Imaging and Analysis,NRT,No visualization available,,Path C,Disasters,Disasters,
4,ASTER L2 Surface Temperature V003,"Climate Change,Extreme Heat",The ASTER Surface Kinetic Temperature (AST_08)...,The ASTER Surface Kinetic Temperature (AST_08)...,Global,HDF-EOS2,90 m,90 meters,,2000-03-04 to Present,,https://dx.doi.org/10.5067/ASTER/AST_08.003,"Terra - Earth Observing System (EOS), Terra",22-year temporal extent,Only HDF-EOS2 format available,,Path C,Climate Change,Extreme Heat,https://sciencediscoveryengine.nasa.gov/app/na...


# We will carry out the following preprocessing steps:
1. Only consider those source links which have 'doi' in them
2. We extract the features "Indicators", "Geographic Coverage", "Format", "Spatial Resolution", "Temporal Resolution", and "Temporal Extent" for our metadata extraction process
3. We request each source URL, extract the text from the response, and preprocess it before feeding it to the model for fine-tuning

In [4]:

# Access the API key from the environment
openai_api_key = os.getenv("OPENAI_API_KEY")

# Positional column indexes into each row of `train_data`.
# NOTE(review): presumably 11 is "Source/Link" and 7 is
# "Spatial Resolution (Standard)" -- confirm against train_data.columns.
SOURCE_LINK_POS = 11
METADATA_FIELD_POS = {
    "Indicators": 1,
    "Geographic_Coverage": 4,
    "Format": 5,
    "Spatial_Resolution": 7,
    "Temporal_Resolution": 8,
    "Temporal_Extent": 9,
}
# Upper bound (seconds) on each page request so one stalled server cannot hang the cell.
REQUEST_TIMEOUT_SECONDS = 30


def _page_to_clean_text(html_page, stop_words):
    """Turn an HTML page into a space-joined string of lowercase word tokens.

    Args:
        html_page: Raw HTML of the page.
        stop_words: Collection of lowercase words to drop.

    Returns:
        The page text, lowercased, tokenized, with non-alphanumeric tokens and
        stopwords removed, joined by single spaces.
    """
    page_text = BeautifulSoup(html_page, "html.parser").get_text().lower()
    # Keeping only alphanumeric tokens already discards punctuation and
    # whitespace, so no further regex clean-up of the joined text is needed.
    kept_tokens = [
        token
        for token in word_tokenize(page_text)
        if token.isalnum() and token not in stop_words
    ]
    return " ".join(kept_tokens)


# Map each DOI source link to the metadata fields we want the model to extract.
final_data = {}
for _, row in train_data.iterrows():
    source_link = row.iloc[SOURCE_LINK_POS]
    # An empty cell is read as NaN (a float); `"doi" in NaN` would raise TypeError.
    if isinstance(source_link, str) and "doi" in source_link:
        final_data[source_link] = {
            field: row.iloc[pos] for field, pos in METADATA_FIELD_POS.items()
        }

# Built once: the stopword list does not change between pages.
english_stop_words = set(stopwords.words('english'))

# Fetch every source page and attach its preprocessed text to the record.
unreachable_urls = []
for url, value in final_data.items():
    try:
        response = requests.get(url, timeout=REQUEST_TIMEOUT_SECONDS)
        # An HTTP error page is not dataset documentation; do not train on it.
        response.raise_for_status()
    except requests.RequestException as exc:
        print(f"Skipping {url}: {exc}")
        unreachable_urls.append(url)
        continue
    value["text"] = _page_to_clean_text(response.text, english_stop_words)

# Drop records without page text so every remaining entry has a "text" key.
# (Done after the loop because a dict must not change size while iterated.)
for url in unreachable_urls:
    del final_data[url]


## Training file preparation steps:
1. Define a _SYSTEM_PROMPT: This instructs the GPT model on the extraction process. Please read through _SYSTEM_PROMPT for the detailed instructions
2. data_format: The training data should be in .jsonl format. Here, each record has the form:
   
   {
    "messages": [
   
        {"role": "system", "content": "_SYSTEM_PROMPT"},
   
        {"role": "user", "content": preprocessed text from source url},
   
        {"role": "assistant", "content": "output_format"}
   
    ]
}

This makes the GPT model aware of the different roles: 'system', 'user', and 'assistant'.

3. Create a file at OpenAI for fine-tune purpose

In [5]:
# System prompt sent with every training example.
# The regex fragments are raw strings so their backslashes stay literal without
# invalid-escape warnings; the resulting prompt text is unchanged.
# NOTE(review): the pieces are joined without separators (e.g.
# "requests.Metadata", "locationMake"). Left as-is on purpose: a model
# fine-tuned with this prompt must later be queried with the identical text.
_SYSTEM_PROMPT = (
    "Extract metadata and entities details accurately from my requests."
    "Metadata extraction for Indicators should clearly be one of the element in the list ['Disasters','Human Dimensions','Food Availability','Health & Air Quality','Water Availability', 'Extreme Heat','Urban Flooding','Climate Change']. Do not create new element for indicators "
    "Geographic_Coverage: If multiple countries, write global, else write name of location"
    "Make sure the following fields follow the following regex pattern"
    r"Spatial_Resolution: ^(\d+(\.\d+)? [a-zA-Z]+|varies|N/A)$"
    r"Temporal_Resolution: ^(\d+(\.\d+)? [a-zA-Z]+|N/A|varies|weekly|monthly|daily|yearly|varies-multiple datasets included|Daily < Weekly|Hourly < Daily|Weekly < Monthly|Monthly < Yearly|1 minute)$"
    r"Temporal_Extent: ^(\d{4}-\d{2}-\d{2} to present|present|\d{4}-\d{2}-\d{2} (?:to|until) present|\d{4}-\d{2}-\d{2} to \d{4}-\d{2}-\d{2}|varies(?:- multiple datasets (?:included|available))?)$."
)

# One chat-format record per source URL. Kept under its own name so the
# `train_data` DataFrame from the loading cell is not overwritten and the
# preprocessing cell can still be re-run.
training_records = []
for url, value in final_data.items():
    output_content = {
        "Indicators": value["Indicators"],
        "Geographic_Coverage": value["Geographic_Coverage"],
        "Format": value["Format"],
        "Spatial_Resolution": value["Spatial_Resolution"],
        "Temporal_Resolution": value["Temporal_Resolution"],
        "Temporal_Extent": value["Temporal_Extent"],
    }
    # NOTE(review): str() yields a Python dict repr (single quotes, `nan` for
    # empty cells), not JSON. Kept because it defines the format the
    # fine-tuned model learns to emit.
    output_content = str(output_content)
    training_records.append({
        "messages": [
            {"role": "system", "content": _SYSTEM_PROMPT},
            {"role": "user", "content": value["text"]},
            {"role": "assistant", "content": output_content},
        ]
    })

# Write one JSON object per line (.jsonl).
with open("EJ_datasets_finetuning4.jsonl", 'w') as jsonl_file:
    for record in training_records:
        jsonl_file.write(json.dumps(record) + '\n')


client = OpenAI(api_key=openai_api_key)

# Upload inside a context manager so the file handle is closed afterwards.
with open("EJ_datasets_finetuning4.jsonl", "rb") as training_fh:
    uploaded_training_file = client.files.create(
        file=training_fh,
        purpose="fine-tune",
    )

# Display the returned file object (its id identifies the upload at OpenAI).
uploaded_training_file

FileObject(id='file-DmF8QRHE5l8DLWpZJOp2BIR1', bytes=379274, created_at=1711122028, filename='EJ_datasets_finetuning4.jsonl', object='file', purpose='fine-tune', status='processed', status_details=None)

# Now, we use the file object generated at OpenAI to create a fine-tuning job using the gpt-3.5-turbo-0125 model for 10 epochs


In [6]:

# Look up the uploaded training file by name instead of pasting a file ID from
# an earlier run: every upload gets a new ID, and IDs are specific to the
# account that uploaded the file.
TRAINING_FILE_NAME = "EJ_datasets_finetuning4.jsonl"
matching_files = [
    uploaded
    for uploaded in client.files.list(purpose="fine-tune")
    if uploaded.filename == TRAINING_FILE_NAME
]
if not matching_files:
    raise RuntimeError(
        f"No uploaded fine-tune file named {TRAINING_FILE_NAME!r}; "
        "run the upload cell first."
    )
# If the file was uploaded several times, train on the newest copy.
latest_training_file = max(matching_files, key=lambda uploaded: uploaded.created_at)

# Start the fine-tuning job; the returned job object is the cell output.
client.fine_tuning.jobs.create(
  training_file=latest_training_file.id,
  model="gpt-3.5-turbo-0125",
  hyperparameters={
    "n_epochs":10
  }
)

FineTuningJob(id='ftjob-4KHnDC5jYAVU38XQfZ9DvDsq', created_at=1711122038, error=Error(code=None, message=None, param=None, error=None), fine_tuned_model=None, finished_at=None, hyperparameters=Hyperparameters(n_epochs=10, batch_size='auto', learning_rate_multiplier='auto'), model='gpt-3.5-turbo-0125', object='fine_tuning.job', organization_id='org-KVpMEM9Q8Xm2pc8YuAMYw8r3', result_files=[], status='validating_files', trained_tokens=None, training_file='file-DmF8QRHE5l8DLWpZJOp2BIR1', validation_file=None, user_provided_suffix=None)

# Listing finetuning jobs

In [7]:
# Request a single entry from the fine-tuning job listing; the displayed job
# shows its status and, once finished, the name of the fine-tuned model.
latest_jobs_page = client.fine_tuning.jobs.list(limit=1)
latest_jobs_page

SyncCursorPage[FineTuningJob](data=[FineTuningJob(id='ftjob-4KHnDC5jYAVU38XQfZ9DvDsq', created_at=1711122038, error=Error(code=None, message=None, param=None, error=None), fine_tuned_model='ft:gpt-3.5-turbo-0125:nasa::95blPXsF', finished_at=1711123830, hyperparameters=Hyperparameters(n_epochs=10, batch_size=1, learning_rate_multiplier=2), model='gpt-3.5-turbo-0125', object='fine_tuning.job', organization_id='org-KVpMEM9Q8Xm2pc8YuAMYw8r3', result_files=['file-uUDPVa6eHOWoqXJjwDbDzZzN'], status='succeeded', trained_tokens=747980, training_file='file-DmF8QRHE5l8DLWpZJOp2BIR1', validation_file=None, user_provided_suffix=None)], object='list', has_more=True)

### Here, the fine-tuning job id is: ftjob-4KHnDC5jYAVU38XQfZ9DvDsq and the fine-tuned model is: ft:gpt-3.5-turbo-0125:nasa::95blPXsF. We will use this model for the metadata extraction process for Environmental Justice datasets