In [45]:
import openai
from openai import OpenAI
import os
from dotenv import load_dotenv, find_dotenv
import pandas as pd
import ast


# Configure OpenAI Credentials

In [46]:
# Load variables from the nearest .env file into the process environment.
load_dotenv(find_dotenv())

SECRET_KEY = os.environ.get("SECRET_KEY")
DATABASE_PASSWORD = os.environ.get("DATABASE_PASSWORD")

# The key must come from the environment / .env file only. Never paste a literal
# key into the notebook, not even in a comment: notebooks get shared and committed.
# NOTE(review): a literal key was previously present in this cell; treat it as
# compromised and revoke it in the OpenAI dashboard.
OPENAI_SECRET_KEY = os.getenv('OPENAI_SECRET_KEY')
if not OPENAI_SECRET_KEY:
    # Fail early with an actionable message instead of the opaque TypeError that
    # assigning None to os.environ would raise.
    raise RuntimeError(
        "OPENAI_SECRET_KEY is not set; define it in your .env file or environment."
    )

# Expose the key under the name the OpenAI client looks up when no key is passed.
os.environ["OPENAI_API_KEY"] = OPENAI_SECRET_KEY

client = OpenAI()

# Prepare Data

In [47]:
# Read the annotation workbook and keep positional rows 1..98 with all columns.
# NOTE(review): row 0 is skipped — presumably it holds the real column headers; confirm.
df = pd.read_excel('data.xlsx').iloc[1:99]

In [48]:
# Preview the first rows of the loaded slice.
df.head()

Unnamed: 0,all_annotations_cleaned,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9
1,annotator_10,607-Spoorwegwet,De ontheffing of vrijstelling kan onder beperk...,"['De', 'ontheffing', 'of', 'vrijstelling', 'ka...","['Object', 'Object', 'Object', 'Object', 'Acti...",DS,,NOT_AVAILABLE_THIS_ROUND,False,Spoorwegwet
2,annotator_14,326-Wet_rechterlijke_organisatie,Degene die zitting heeft in de enkelvoudige ka...,"['Degene', 'die', 'zitting', 'heeft', 'in', 'd...","['Actor', 'Actor', 'Actor', 'Actor', 'Actor', ...",DS,,NOT_AVAILABLE_THIS_ROUND,False,Wet_rechterlijke_organisatie
3,annotator_3,536-Politiewet_2012,De examinatoren verstrekken de examencommissie...,"['De', 'examinatoren', 'verstrekken', 'de', 'e...","['Actor', 'Actor', 'Action', 'Recipient', 'Rec...",DS,,NOT_AVAILABLE_THIS_ROUND,False,Politiewet_2012
4,annotator_13,400-Vreemdelingenwet_2000,Indien de aanvraag tot het verlenen van een ve...,"['Indien', 'de', 'aanvraag', 'tot', 'het', 've...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ...",DS,,NOT_AVAILABLE_THIS_ROUND,False,Vreemdelingenwet_2000
5,annotator_14,406-Wet_op_de_jeugdzorg,"De griffier zendt , onverminderd , een afsc...","['De', 'griffier', 'zendt', ',', 'onverminderd...","['Actor', 'Actor', 'Action', ',', 'O', ',', 'O...",DS,Omdat er hierna een dubbele punt komt weet ik ...,NOT_AVAILABLE_THIS_ROUND,False,Wet_op_de_jeugdzorg


In [49]:
# Summarize the columns: names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 98 entries, 1 to 98
Data columns (total 10 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   all_annotations_cleaned  98 non-null     object
 1   Unnamed: 1               98 non-null     object
 2   Unnamed: 2               98 non-null     object
 3   Unnamed: 3               98 non-null     object
 4   Unnamed: 4               98 non-null     object
 5   Unnamed: 5               98 non-null     object
 6   Unnamed: 6               2 non-null      object
 7   Unnamed: 7               98 non-null     object
 8   Unnamed: 8               98 non-null     object
 9   Unnamed: 9               98 non-null     object
dtypes: object(10)
memory usage: 7.8+ KB


In [54]:
# "Unnamed: 3" holds the tokenized sentences and "Unnamed: 4" the per-token
# labels; both are stored as strings containing Python list literals.
tokenized_sentences = df.loc[:, "Unnamed: 3"]
labels = df.loc[:, "Unnamed: 4"]

# Take the row with index label 1 (the first row of the slice) and parse the
# stored strings into real Python lists.
sentence = ast.literal_eval(tokenized_sentences.loc[1])
label = ast.literal_eval(labels.loc[1])



# Create Prompts

In [55]:
# Read the system prompt from disk. The context manager closes the file handle
# as soon as the text has been read (a bare open(...).read() leaves it open).
with open('prompt_1.txt', 'r') as prompt_file:
    prompt1 = prompt_file.read()

# Predict / Label 

In [18]:
class SemanticRoleLabeler:
  """Send a sentence to an OpenAI chat model together with a fixed system prompt.

  The model's reply is returned verbatim as a string; this class does not parse
  the reply into per-token labels.

  Args:
    prompt: System prompt sent with every request.
    model: Chat model name. Defaults to "gpt-3.5-turbo-1106";
      "gpt-4-1106-preview" can be passed instead but is expensive.
    seed: Sampling seed forwarded to the API to make results reproducible.
    temperature: Sampling temperature forwarded to the API (0.0 by default,
      to keep results deterministic).
    client: Any object exposing `chat.completions.create(...)`. When omitted,
      a new `OpenAI()` client is created.
  """

  DEFAULT_MODEL = "gpt-3.5-turbo-1106"
  DEFAULT_SEED = 42
  DEFAULT_TEMPERATURE = 0.0

  def __init__(
    self,
    prompt: str,
    model: str = DEFAULT_MODEL,
    seed: int = DEFAULT_SEED,
    temperature: float = DEFAULT_TEMPERATURE,
    client=None,
  ):
    self._seed = seed
    self._temperature = temperature
    # Only build a real client when none was injected, so the class can be
    # exercised without network access or credentials.
    self._client = client if client is not None else OpenAI()
    self._model = model
    self.system_prompt = prompt

  # MARK: - Public Methods
  def label(self, sentence: str) -> str:
    """Return the model's raw text reply for `sentence`.

    Args:
      sentence: Text sent as the user message.

    Returns:
      The content of the first choice in the API response.
    """
    response = self._compute_response(sentence)
    return self._extract_message_from_response(response)

  # MARK: - Private Methods
  def _compute_response(self, sentence):
    """Call the chat-completions endpoint with the system prompt and `sentence`."""
    return self._client.chat.completions.create(
      seed=self._seed,
      temperature=self._temperature,
      model=self._model,
      # response_format={ "type": "json_object" },
      messages=[
        {"role": "system", "content": self.system_prompt},
        {"role": "user", "content": sentence},
      ],
      # stream=True,
    )

  # Backward-compatible alias for the original (misspelled) method name.
  _computeReponse = _compute_response

  def _extract_message_from_response(self, response) -> str:
    """Return the message content of the first choice in `response`."""
    return response.choices[0].message.content


In [56]:
# Build the labeler with the system prompt read from prompt_1.txt.
# NOTE(review): `Labeler` is an instance but is capitalized like a class name.
Labeler = SemanticRoleLabeler(prompt=prompt1)


In [57]:
# Send the token list, serialized with str(), as the user message and show the
# model's raw text reply.
Labeler.label(str(sentence))

'Het werkwoord in de zin is "verleend".\n\nDe zin formuleert een preconditie voor de actie "verleend". De preconditie valt onder label 10, namelijk de contextuele omstandigheden die moeten zijn voldaan voordat de handeling geldig wordt.'

In [62]:
# Show the parsed token list of the example sentence.
sentence

['De',
 'ontheffing',
 'of',
 'vrijstelling',
 'kan',
 'onder',
 'beperkingen',
 'worden',
 'verleend',
 '.']

In [64]:
# Show the gold label for each token of the example sentence.
label

['Object',
 'Object',
 'Object',
 'Object',
 'Action',
 'O',
 'O',
 'Action',
 'Action',
 '.']

# Visualize

In [69]:
import spacy
from spacy import displacy
from spacy.tokens import Span

class SpacyLabelVisualizer:
    """Render per-token labels as highlighted spans with displaCy's manual "ent" mode.

    Args:
        label_color_dict: Mapping from label name to the color passed to displaCy.
        nil_label: Label marking tokens that should not be highlighted.
    """

    def __init__(self, label_color_dict: dict, nil_label: str = "O"):
        self.nil_label = nil_label
        self._label_color_dict = label_color_dict
        self._possible_labels = list(label_color_dict.keys())

    def create_displacy_dict(self, words, labels):
        """Build the dict that `displacy.render(..., manual=True)` consumes.

        Args:
            words: Tokens of one sentence, in order.
            labels: One label per token, aligned with `words`.

        Returns:
            A dict with "text" (the tokens joined by single spaces), "ents"
            (one {"start", "end", "label"} entry per token whose label is not
            the nil label, using character offsets into "text") and "title"
            (always None).

        Raises:
            ValueError: If `words` and `labels` differ in length.
        """
        # Materialize both inputs so one-shot iterators are not exhausted by the
        # join below before the offsets are computed.
        words = list(words)
        labels = list(labels)
        if len(words) != len(labels):
            # zip() would silently drop the surplus items and yield entities
            # that do not cover the whole text.
            raise ValueError(
                f"words and labels must have the same length "
                f"(got {len(words)} words and {len(labels)} labels)"
            )

        sentence = ' '.join(words)

        entities = []
        start = 0
        for word, label in zip(words, labels):
            end = start + len(word)
            if label != self.nil_label:
                entities.append({"start": start, "end": end, "label": label})
            start = end + 1  # skip the single space inserted by the join

        return {
            "text": sentence,
            "ents": entities,
            "title": None,
        }

    def display(self, words, labels):
        """Render `words` with their `labels` using the configured colors.

        Returns:
            Whatever `displacy.render` returns for the built dict.
        """
        displacy_dict = self.create_displacy_dict(words, labels)
        colors = self._label_color_dict
        html = displacy.render(displacy_dict, style="ent", manual=True, options={"colors": colors})
        return html



# Highlight the gold labels of the example sentence.
# NOTE(review): tokens labeled "O" are skipped when the entities are built, so the
# "O" color is never used, while "Action" (present in `label`) has no color
# assigned here — confirm the intended palette.
visualizer = SpacyLabelVisualizer(label_color_dict={"Object": "aquamarine", "O": "tomato"})
visualizer.display(sentence, label)




In [26]:
# NOTE(review): the recorded output of this cell is a string, whereas the parsing
# cell above produces a list. The execution count (26) suggests this output comes
# from an earlier run in which `sentence` still held the unparsed text; re-run
# the notebook top to bottom or remove this cell.
sentence

"['De', 'ontheffing', 'of', 'vrijstelling', 'kan', 'onder', 'beperkingen', 'worden', 'verleend', '.']"

In [None]:


# # Example usage
# words = ["But", "Google", "is", "starting", "from", "behind"]
# labels = ["O", "ORG", "O", "O", "O", "O"]

# result_dict = visualizer.create_displacy_dict(words, labels)

# html = displacy.render(result_dict, style="ent", manual=True)


# Compute Confusion Matrices