In [23]:
#Load the necessary libraries
import time
import os
from dotenv import load_dotenv
import openai
import tiktoken
import cleantext
import json
import sys
import pandas as pd
import ast
from fuzzywuzzy import fuzz
from multiprocessing import Pool
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

#from ipynb.fs.full.gpt_curation_custom_functions import send_batch_request
#from ipynb.fs.full.gpt_curation_custom_functions import string_process_no_stp_words, process_team_names
#from ipynb.fs.full.gpt_curation_custom_functions import fuzzy_match, fuzzy_search

# Tokenizer for the gpt-3.5-turbo encoding, used when counting tokens
tokenizer = tiktoken.encoding_for_model("gpt-3.5-turbo")

# Load variables from the .env file, then read the OpenAI API key from the
# environment so it is never hard-coded in the notebook
load_dotenv()
openai.api_key = os.environ.get('OPENAI_API_KEY')

# English stop words from NLTK
stp_words = stopwords.words('english')

# Run statistics, filled in by the request loop further down
stats = pd.DataFrame()

In [24]:

#Version is custom - based on my prompt styles. "specific" is one-shot with particular explanation on the task
version = "specific"

#Setting the output format as a list of lists ("lol")
output_format = "lol"


#Read the prompt text file; the with-block closes the file handle as soon as the text is loaded
with open("prompts/prompt_intra-team_version_" + version + "_" + output_format , "r") as f:
    prompt = f.read().rstrip()

#Team metadata
meta = pd.read_csv("../data/raw/team_meta.csv")

#Model used for the requests; it is also written into the output file names
model = "gpt-4-turbo"
#model = "gpt-4-0125-preview"
#model = "gpt-3.5-turbo-16k"


#The level of creativity in the gpt response - for the extraction task set at 0.3
temperature = 0.3


#Years to process, kept as strings because they are concatenated into file paths
#years = list(range(2015, 2019))
years = ["2015", "2016", "2017", "2018"]

#Manual annotations; the request loop takes its list of teams per year from this table
manual = pd.read_csv("../data/processed/manually_curated/random_per_year_intra_team/new_intra_team_annotation_partial.csv")

In [25]:
# Inline prompt sent as the first user message of every request.
# NOTE: this assignment replaces the `prompt` value read from the prompt file
# in the configuration cell, so the file contents are not what gets sent.
# The text asks for [["PERSON", "ACTIVITY"]] pairs and gives two worked examples;
# the chunk to analyse is sent as a separate message after "The text:".
prompt = """

Extrapolate all relationships between people and activities they were involved in from the text provided. 

Provide the output as a list of lists - [["PERSON", "ACTIVITY"]]. 

"PERSON" is the name of the person.
"ACTIVITY" is the activity they performed as described in the text.

If a person is involved in multiple activities, report each activity as a separate relationship.

example: "Project colycyclin: Marcus North, Andrea Belotti"
relationships: [["Marcus North", "Project colycyclin"],["Andrea Belotti", "Project colycyclin"]]

example: "We are team Mideastern. Our team members Brad Hogg and Sophie Devine performed wet lab experiments. Andrew Symonds was responsible for collaborating with other iGEM teams and maintaining our wiki."
relationships: [["Brad Hogg","performed wet lab experiments"],["Sophie Devine","performed wet lab experiments"],["Andrew Symonds","responsible for collaborating with other teams"],["Andrew Symonds", "maintaining our wiki"]]

Please make sure there is no additional text in the response other than the relationships in the prescribed format.

The text: 

"""


In [26]:
def compute_n_tokens(text, tokenizer = tokenizer):
    """Return the number of tokens `tokenizer` produces when encoding `text`.

    `tokenizer` defaults to the module-level tokenizer and only needs to
    provide an `encode` method that returns a sized sequence.
    """
    return len(tokenizer.encode(text))


def send_request_chat_competion(arguments, model = "gpt-4-turbo", temperature = 0.3, output_format = "lol"):
    """Send one chat-completion request and wrap the raw reply in a one-row DataFrame.

    The request data arrives packed in a single `arguments` sequence so the
    function can be mapped over a list of requests by a process pool.

    Parameters
    ----------
    arguments : sequence
        `(messages, variables, chunk)`:
        `messages` is the list of chat messages sent to the API,
        `variables` is a dict whose items are added as extra columns to the result,
        `chunk` is the identifier handed back to the caller when the request fails.
    model : str
        Name of the chat model to query.
    temperature : float
        Sampling temperature passed to the API.
    output_format : str
        Only "lol" is supported.

    Returns
    -------
    tuple
        `(DataFrame, None)` on success, where the frame has a "text" column
        holding the stripped reply plus one column per key in `variables`;
        `(empty DataFrame, chunk)` when the request or reply handling fails.

    Raises
    ------
    ValueError
        If `output_format` is not supported. Checked before the request is
        sent, so no API call is spent on a result that cannot be built.
    """
    if output_format != "lol":
        raise ValueError("Unsupported output_format: " + repr(output_format))

    message = arguments[0]
    variables = arguments[1]
    chunk = arguments[2]

    try:
        response = openai.ChatCompletion.create(model = model, messages = message, temperature = temperature, request_timeout = 60)
        final_response = response.choices[0].message["content"].strip()

    except Exception:
        # Best effort: any failure (timeout, API error, malformed reply) is
        # reported as a miss. `Exception` rather than a bare except keeps
        # KeyboardInterrupt / SystemExit able to stop the run.
        return(pd.DataFrame(), chunk)

    processed_df = pd.DataFrame([[final_response]], columns=["text"])

    # Attach the caller-supplied metadata as constant columns
    for key, value in variables.items():
        processed_df[key] = value

    return(processed_df, None)

def process_request_regex(text, output_format = "lol", columns = ["member", "activity"]):
    """Extract ["PERSON", "ACTIVITY"] pairs from a raw model reply.

    Bracketed pairs of double-quoted strings are located with a regular
    expression and each match is parsed with `ast.literal_eval`. Matches that
    do not evaluate to exactly two elements are discarded.

    Parameters
    ----------
    text : str
        Raw reply text to parse.
    output_format : str
        Only "lol" (list of lists) is supported.
    columns : list of str
        Column names for the two elements of every pair.

    Returns
    -------
    tuple
        `(DataFrame, status)` where status is
        1 - at least one pair was extracted (one row per pair),
        2 - nothing usable was found (empty DataFrame),
        0 - parsing raised an error (empty DataFrame).

    Raises
    ------
    ValueError
        If `output_format` is not supported.
    """
    if output_format != "lol":
        raise ValueError("Unsupported output_format: " + repr(output_format))

    try:
        # The pattern allows at most one space after the separating comma
        candidates = re.findall(r'\[["].*?["],[ ]?["].*?["]\]', text)
        pairs = [ast.literal_eval(candidate) for candidate in candidates]

        # The lazy match can swallow lists with more than two items; drop those
        pairs = [pair for pair in pairs if len(pair) == 2]

        if len(pairs) == 0:
            return(pd.DataFrame(), 2)

        processed_df = pd.DataFrame(pairs, columns = columns)

    except Exception:
        # A single unparsable match invalidates the whole reply (status 0)
        return(pd.DataFrame(), 0)

    return(processed_df, 1)


In [27]:

# For every year: build one request per text chunk of the manually annotated
# teams, send the requests in parallel, and save the raw replies to CSV.
for year in years:

    start = time.time()
    year = str(year)

    teams = manual['Team'][manual['Year'] == int(year)].unique()
    text = pd.read_csv("../data/processed/text_batches_intra-team/" + year + ".csv")

    # Each entry is [messages, metadata columns, position in all_messages];
    # the position is what the worker hands back when a request fails.
    all_messages = []

    for team in teams:

        team_text = text[text['team'] == team]

        for i in range(len(team_text)):

            messages = [
                {"role": "user", "content": prompt},
                {"role": "user", "content": team_text['chunk'].iloc[i]},
            ]
            variables = {"year": year, "team": team, 'page': team_text['page'].iloc[i]}
            all_messages.append([messages, variables, len(all_messages)])

    # Pass the configured model / temperature / output format explicitly so the
    # requests really use the settings that end up in the output file name.
    with Pool(processes = 8) as pool:
        zipped_output = pool.starmap(
            send_request_chat_competion,
            [(entry, model, temperature, output_format) for entry in all_messages],
        )

    # Collect the per-request frames and concatenate once; DataFrame.append
    # is gone in pandas 2.x and re-copies the frame on every call.
    response_frames = []
    missed_entries = []

    for output, miss in zipped_output:

        if len(output) > 0:
            response_frames.append(output)

        if miss is not None:
            # extend() stores the individual message dicts (prompt + chunk) of
            # the failed request, i.e. two list items per miss.
            missed_entries.extend(all_messages[miss][0])

    df = pd.concat(response_frames) if response_frames else pd.DataFrame()

    end = time.time()
    time_elapsed = (end-start)

    stats = pd.concat([stats, pd.DataFrame([[year, time_elapsed, missed_entries]], columns=["year", "time_secs", "misses"])])

    print("Completed year " + year + " in " + str(time_elapsed) + " seconds")

    df.to_csv("../data/processed/gpt_curated/intra-team/raw_" + year + "_"+ version + "_" + model + "_" + str(temperature).replace(".","-") + ".csv", index=False)
    time.sleep(1)
    
                  

Completed year 2015 in 81.67648911476135 seconds
Completed year 2016 in 103.7580053806305 seconds
Completed year 2017 in 75.94142413139343 seconds
Completed year 2018 in 75.35301470756531 seconds


In [28]:
# Parse the raw replies of every year into member/activity rows and record a
# parsing status per chunk (see process_request_regex for the status codes).
stats_processing = pd.DataFrame()
status_rows = []

for year in years:

    df_interactions = pd.DataFrame()

    df = pd.read_csv("../data/processed/gpt_curated/intra-team/raw_" + str(year) + "_"+ version + "_" + model + "_" + str(temperature).replace(".","-") + ".csv")

    if (len(df) > 0):

        # Parsed frames are collected and concatenated once; DataFrame.append
        # is gone in pandas 2.x and re-copies the frame on every call.
        parsed_frames = []

        for loop_var in range(len(df)):

            t, flag = process_request_regex(str(df['text'].iloc[loop_var]), output_format = output_format)

            if (len(t) > 0):

                t['team'] = df['team'].iloc[loop_var]
                t['year'] = year
                t['page'] = df['page'].iloc[loop_var]
                #t['batch_id'] = batch_no
                t['chunk_id'] = loop_var #df['chunk'][loop_var]

                parsed_frames.append(t)

            status_rows.append([year, loop_var, flag])

        if parsed_frames:
            df_interactions = pd.concat(parsed_frames)

        df_interactions.to_csv("../data/processed/gpt_curated/intra-team/processed_" + str(year) + "_" + version + "_" + model + "_" + str(temperature).replace(".","-") + ".csv", index=False)

    print("Completed year " + str(year))

# One row per parsed chunk across all years
stats_processing = pd.DataFrame(status_rows, columns=["year", "chunk_no", "status"])

Completed year 2015
Completed year 2016
Completed year 2017
Completed year 2018


In [18]:
# Merge the per-year processed files into a single CSV.
years = ["2015", "2016", "2017", "2018"]

# Read every yearly file, then concatenate once (DataFrame.append is gone in pandas 2.x)
yearly_frames = [
    pd.read_csv("../data/processed/gpt_curated/intra-team/processed_" + year + "_" + version + "_" + model + "_" + str(temperature).replace(".","-") + ".csv")
    for year in years
]
df = pd.concat(yearly_frames)

# The index is written as well, as before; it restarts at 0 for every year
df.to_csv("../data/processed/gpt_curated/intra-team/processed_" + version + "_" + model + "_" + str(temperature).replace(".","-") + ".csv")

Unnamed: 0,year,time_secs,misses
0,2015,81.676489,"[{'role': 'user', 'content': ' Extrapolate al..."
0,2016,103.758005,"[{'role': 'user', 'content': ' Extrapolate al..."
0,2017,75.941424,"[{'role': 'user', 'content': ' Extrapolate al..."
0,2018,75.353015,[]


In [21]:
# Number of items stored in `misses` for the first row of `stats`.
# The request loop extends that list with the individual message dicts of each
# failed request, so this counts messages rather than failed chunks.
len(stats['misses'].iloc[0])

28