## relation extraction via linking phrase parsing

In [15]:
import os
import re
import torch
from torch import nn
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import openai
import backoff
import time

openai.api_key = '' # put your key here

In [6]:
def match_sentence_fragments(phrase1, phrase2, events):
    """Assign each event to whichever sentence fragment it resembles more.

    A bag-of-words vocabulary is fitted on both fragments plus all events,
    and every event is compared to each fragment by cosine similarity of
    their count vectors.

    Args:
        phrase1: Text of the first sentence fragment.
        phrase2: Text of the second sentence fragment.
        events: List of event strings to distribute between the fragments.

    Returns:
        A tuple ``(events1, events2)``: events strictly more similar to
        ``phrase1``, and all remaining events (ties go to ``phrase2``).
    """
    bow = CountVectorizer().fit([phrase1, phrase2] + events)
    first_vec = bow.transform([phrase1])
    second_vec = bow.transform([phrase2])

    events1, events2 = [], []
    for candidate in events:
        candidate_vec = bow.transform([candidate])
        # cosine_similarity returns a 1x1 matrix here; take the scalar score.
        score_first = cosine_similarity(candidate_vec, first_vec)[0, 0]
        score_second = cosine_similarity(candidate_vec, second_vec)[0, 0]
        bucket = events1 if score_first > score_second else events2
        bucket.append(candidate)
    return events1, events2


def extract_events_from_llama_labeled_events(string):
    """Parse a numbered list ("1. ... 2. ...") into a list of event strings.

    Each item is the text between one ``<digits>.`` marker and the next
    marker (or the end of the string). Surrounding spaces, newlines, periods
    and ``#`` characters are trimmed from every item.

    Args:
        string: Raw numbered-list text. Non-string values (e.g. ``None`` or a
            NaN read from an empty CSV cell) are treated as "no events".

    Returns:
        A list of non-empty, trimmed event strings, in order of appearance.
    """
    if not isinstance(string, str):
        return []
    # Lazily capture everything after "<n>." up to the next "<n>." or the end.
    matches = re.findall(r'\d+\.\s*(.*?)\s*(?=\d+\.|$)', string, re.DOTALL)
    # Trim first, then filter: an item made only of trimmed characters
    # (e.g. "#") must be dropped rather than kept as an empty string.
    trimmed = (match.strip(" \n.#") for match in matches)
    return [event for event in trimmed if event]


def prompt_chatgpt(prompt):
    """Send a single-turn user prompt to gpt-3.5-turbo and return the reply text.

    If the first request raises any exception, the error is printed, the
    function sleeps for 6 seconds and the request is issued one more time.
    An exception from that second request propagates to the caller.

    Args:
        prompt: Text sent as the content of the single "user" message.

    Returns:
        The message content of the first choice in the completion.
    """
    def _request():
        return openai.ChatCompletion.create(
            model="gpt-3.5-turbo",
            messages=[{"role": "user", "content": prompt}],
        )

    try:
        reply = _request()
    except Exception as err:
        print(str(err))
        time.sleep(6)
        # Single retry; a second failure is intentionally not caught here.
        reply = _request()
    return reply["choices"][0]["message"]["content"]

@backoff.on_exception(backoff.expo, openai.error.RateLimitError)
def prompt_chatgpt_with_backoff(prompt):
    """Call ``prompt_chatgpt`` and retry it when a rate-limit error escapes.

    The ``backoff.on_exception`` decorator re-invokes this function, using the
    ``backoff.expo`` wait generator, whenever ``openai.error.RateLimitError``
    is raised. Other exception types propagate to the caller.

    Args:
        prompt: Prompt text forwarded unchanged to ``prompt_chatgpt``.

    Returns:
        Whatever ``prompt_chatgpt`` returns for the prompt.
    """
    return prompt_chatgpt(prompt)


def cause_classification_via_chatgpt(sentence, cause_candidates, effect_candidates):
    """Ask ChatGPT which (cause, effect) candidate pairs the sentence supports.

    Every combination of one cause candidate and one effect candidate is sent
    as a Yes/No question. Pairs answered "yes" are kept.

    Args:
        sentence: The sentence the candidates were extracted from.
        cause_candidates: Event strings to try as causes.
        effect_candidates: Event strings to try as effects.

    Returns:
        A newline-joined string of accepted pairs, each formatted as
        ``"<cause> ==> <effect>"``; the empty string if none was accepted.
        Newlines are removed from candidates so each pair stays on one line.
    """
    results = []
    for raw_cause in cause_candidates:
        cause = raw_cause.replace("\n", "")
        for raw_effect in effect_candidates:
            effect = raw_effect.replace("\n", "")
            prompt = f" Given the sentence {sentence}, can we infer that {cause} is a cause of {effect} ? Answer Yes or No."
            response = prompt_chatgpt_with_backoff(prompt)
            # Strip whitespace and periods in one pass so the result does not
            # depend on their order (e.g. "Yes. " or "\nYes. \n" both count).
            if response.strip(" \t\r\n\f\v.").lower() == "yes":
                results.append(f"{cause} ==> {effect}")
    return "\n".join(results)


In [None]:
import pandas as pd
import re
from tqdm import tqdm

# Linking phrases of the form "<effect> <phrase> <cause>".
cause_words = ['because', 'because of', 'owing to', 'due to', 'caused by']
# Linking phrases of the form "<cause> <phrase> <effect>".
effect_words = ['so', 'therefore', 'hence', 'thus', 'as a result', ' as a consequence', 'consequently', 'resulting in', 'resulted in']
cause_list = []
effect_list = []


def _split_on_linking_phrase(sentence, phrase):
    """Split ``sentence`` around exactly one whole-word occurrence of ``phrase``.

    Returns ``(before, after)``, or ``None`` when the phrase is absent or
    occurs more than once. Word boundaries keep short phrases such as "so"
    from matching inside other words ("also", "reason").
    """
    pattern = r"\b" + re.escape(phrase.strip()) + r"\b"
    parts = re.split(pattern, sentence)
    if len(parts) != 2:
        return None
    return parts[0], parts[1]


def _collect_causal_pairs(rows, linking_words, wiki_page, cause_follows_phrase, records):
    """Append ChatGPT-confirmed cause/effect records for ``rows`` to ``records``.

    ``cause_follows_phrase`` is True when the text after the linking phrase is
    the cause ("because ..."), False when it is the effect ("so ...").
    """
    for _, row in rows.iterrows():
        raw_events = row.llama_labeled_events
        # An empty CSV cell is read as NaN (a float); nothing to extract.
        if not isinstance(raw_events, str):
            continue
        sentence = row.sentence.lower()
        events = extract_events_from_llama_labeled_events(raw_events)

        for word in linking_words:
            fragments = _split_on_linking_phrase(sentence, word)
            if fragments is None:
                continue
            events1, events2 = match_sentence_fragments(fragments[0], fragments[1], events)

            # Both sides of the linking phrase need at least one event.
            if not events1 or not events2:
                continue

            if cause_follows_phrase:
                cause_candidates, effect_candidates = events2, events1
            else:
                cause_candidates, effect_candidates = events1, events2

            # NOTE: pairs are labelled by ChatGPT only; no rule-based pairing
            # of single events is applied.
            inference = cause_classification_via_chatgpt(
                sentence,
                cause_candidates=cause_candidates,
                effect_candidates=effect_candidates,
            )
            if inference == "":
                continue
            for cause_effect_pair in inference.split("\n"):
                cause, effect = cause_effect_pair.split(" ==> ")
                records.append({
                    "wiki_page": wiki_page,
                    "sentence": sentence,
                    "cause": cause,
                    "effect": effect,
                    "ChatGPT_label": 1,
                })


files = os.listdir("../Corpus/llama_labeled_events")
for file in tqdm(files):
    # Drop only the final extension so page names containing dots survive.
    wiki_page = os.path.splitext(file)[0]
    print("Processing wiki page:",wiki_page)
    try:
        df = pd.read_csv(f"../Corpus/llama_labeled_events/{file}",index_col=0)
    except UnicodeDecodeError:
        continue
    if df.empty:
        continue
    # na=False: rows with a missing sentence count as "no match" instead of
    # producing NaN entries that break boolean indexing.
    df_causes_filtered = df[df['sentence'].str.contains('[Bb]ecause|[Bb]ecause of|[Oo]wing to|[Dd]ue to|[Cc]aused by', regex=True, na=False)]
    df_effects_filtered = df[df['sentence'].str.contains(' [Ss]o |[Tt]herefore|[Hh]ence |[Tt]hus |[Aa]s a result|[Aa]s a consequence|[Cc]onsequently|[Rr]esulting in|[Rr]esulted in', regex=True, na=False)]

    _collect_causal_pairs(df_causes_filtered, cause_words, wiki_page, True, cause_list)
    _collect_causal_pairs(df_effects_filtered, effect_words, wiki_page, False, effect_list)

df_merged = pd.DataFrame(cause_list + effect_list)
df_merged.to_csv("../Dataset/EconNLI_train_tmp.csv")