In [2]:
import time
import os
from dotenv import load_dotenv
import openai
import tiktoken
import cleantext
import json
import sys
import pandas as pd
import ast
from fuzzywuzzy import fuzz
from nltk.corpus import stopwords
from multiprocessing import Pool
import re
from nltk.tokenize import word_tokenize

# Module-level setup shared by every cell below.

# tiktoken encoder for "gpt-4". It is not referenced by any of the cells visible in this notebook.
# NOTE(review): requests are sent with "gpt-3.5-turbo-instruct" -- confirm this is the intended encoding.
tokenizer = tiktoken.encoding_for_model("gpt-4")

# Must run before os.getenv below so that variables from a local .env file are visible.
load_dotenv()

# Read the OpenAI API key from the OPENAI_API_KEY environment variable (never hard-code it).

openai.api_key = os.getenv('OPENAI_API_KEY')

# Per-year run statistics (year, time_elapsed, failed_chunks); rows are added by the request loop cell.
stats = pd.DataFrame()

# English stop-word list from NLTK, used by string_process_no_stp_words.
stp_words = stopwords.words('english')


In [3]:
def send_batch_request(arguments, model = "gpt-3.5-turbo-instruct", temperature = 0.3, output_format = "lol", columns = ["team", "context", "target"]):
    """Send one prompt to the OpenAI completion endpoint and wrap the reply in a DataFrame.

    Parameters
    ----------
    arguments :
        The prompt passed to ``openai.Completion.create`` (the caller in this
        notebook passes a single prompt string).
    model : str
        Engine name forwarded to the API.
    temperature : float
        Sampling temperature forwarded to the API.
    output_format, columns :
        Unused here; kept so existing callers keep working. ``columns`` is
        never mutated, so the list default is harmless.

    Returns
    -------
    tuple (pandas.DataFrame, int)
        ``(frame, 1)`` on success, where ``frame`` has one row and a single
        ``"text"`` column holding the stripped completion text;
        ``(empty frame, 0)`` when the request fails or the reply has no usable
        completion.
    """
    import warnings

    try:
        response = openai.Completion.create(
            prompt = arguments,
            engine = model,
            temperature = temperature,
            max_tokens = 2000,
            top_p = 1,
            frequency_penalty = 0,
            presence_penalty = 0,
            timeout = 200
        )
        # Reading the reply is guarded as well: a response without choices
        # counts as a failed request instead of crashing the calling loop.
        completion_text = response.choices[0].text.strip()

    except Exception as exc:
        # `except Exception` (not a bare `except`) lets KeyboardInterrupt
        # through, so a long run can still be stopped by the user.
        warnings.warn("send_batch_request failed: " + repr(exc))
        return (pd.DataFrame(), 0)

    return (pd.DataFrame([[completion_text]], columns = ["text"]), 1)


def string_process_no_stp_words(name):
    """Return four normalised variants of ``name``.

    The input is stripped and its commas are replaced by spaces. From that
    cleaned string the function derives:

    1. the cleaned string itself (original casing),
    2. the cleaned string in lower case,
    3. the cleaned string tokenised with ``word_tokenize`` and re-joined with
       single spaces after dropping tokens found in ``stp_words``,
    4. an abbreviation made of the first character of each kept token.

    Stop-word matching is done on the tokens as they are (case-sensitive).
    """
    cleaned = name.strip().replace(",", " ")

    kept_tokens = []
    for token in word_tokenize(cleaned):
        if token not in stp_words:
            kept_tokens.append(token)

    initials = "".join(token[0] for token in kept_tokens)

    return (cleaned, cleaned.lower(), " ".join(kept_tokens), initials)


def process_team_names(name):
    """Clean a team name while keeping its original casing.

    The name is passed through ``string_process_no_stp_words`` and the first
    element of its result (stripped, commas replaced by spaces) is returned
    after a final strip. Casing is preserved on purpose, to keep contextual
    information about proper nouns.

    Returns ``None`` when ``name`` is empty or nothing is left after cleaning.
    """
    if len(name) == 0:
        return None

    cleaned = string_process_no_stp_words(name)[0]

    if len(cleaned) == 0:
        return None

    return cleaned.strip()
        

def process_request_regex(text, output_format = "lol", columns = ["team", "matching"]):
    """Extract ``["a", "b"]`` pairs from a model reply into a two-column DataFrame.

    Parameters
    ----------
    text : str
        Raw completion text. Every substring shaped like ``["...", "..."]``
        is picked up with a regular expression and parsed with
        ``ast.literal_eval``; parsed items that do not have exactly two
        elements are dropped.
    output_format : str
        Only ``"lol"`` (list of lists) is supported.
    columns : list of str
        Column names for the resulting frame. Never mutated, so the list
        default is harmless.

    Returns
    -------
    tuple (pandas.DataFrame, int)
        ``(frame, 1)`` when at least one pair was found,
        ``(empty frame, 2)`` when the text contains no pair,
        ``(empty frame, 0)`` when parsing failed or ``output_format`` is not
        supported.
    """
    if output_format != "lol":
        # Previously this path hit an unbound `processed_df` at the return
        # statement; report it as a failure instead.
        return (pd.DataFrame(), 0)

    try:
        candidates = re.findall(r'\[["].*?["],[ ]?["].*?["]\]', text)
        pairs = [ast.literal_eval(candidate) for candidate in candidates]
        pairs = [pair for pair in pairs if len(pair) == 2]

        if len(pairs) == 0:
            return (pd.DataFrame(), 2)

        processed_df = pd.DataFrame(pairs, columns = columns)

    except Exception:
        # Malformed literals, non-string input, or a column-count mismatch.
        # `except Exception` keeps KeyboardInterrupt deliverable.
        return (pd.DataFrame(), 0)

    return (processed_df, 1)


In [4]:
# Run configuration. The values below are combined into input and output file names by later cells.

# Tag of the curation run whose output is read as input here.
version = "two-step"
# "lol" = list of lists. Not referenced by the visible cells (the helper functions default to "lol").
output_format = "lol"
# Temperature used by the upstream curation step; only used to build file names here.
temperature_curation = 0.3

# Alternative ways of listing the batches from disk (disabled):
#batches = os.listdir("../data/processed/batches_inter-team_collab/")
#batches = os.listdir("../data/processed/all_batches_inter-team_fuzzy_select_freq/")
#batches = [batch.replace(".csv", "") for batch in batches]

# Batch identifiers; each one maps to one curated CSV file loaded below.
batches = ["1", "2"]

# NOTE(review): `model` and `temperature` are only used in file names in the visible cells;
# send_batch_request is called with its own defaults (which currently have the same values).
model = "gpt-3.5-turbo-instruct"

temperature = 0.3

# Number of relationship descriptions sent per prompt.
breaks = 10

# NOTE(review): `meta` is loaded but not used by any visible cell -- confirm it is still needed.
meta = pd.read_csv("../data/raw/team_meta.csv")


In [5]:
# Few-shot prompt. The comma-separated relationship descriptions of one chunk are appended
# to it, and the model is expected to answer with [["relationship", "category"], ...] pairs
# that the parsing cell extracts with a regular expression.
# The first example input now reads "pcr amplification" so that it agrees with its expected
# matching (it previously read "pct amplification").

prompt = """

Match each relationship name to the closest possible category listed below:

Provide each matching as [[RELATIONSHIP, MATCHING CATEGORY]]

The possible categories are:

"work": Teams worked together or collaborated on aspects of their projects
"material transfer": One team shared information, data, synthetic biology parts or laboratory materials with the other. 
"advice": One team gave advice or support to the other team concerning their project or about the competition.
"meetup": Teams met each other at a meetup or in a social setting and discussed their project.
"other": Contexts not fitting to the categories listed above.

example: "provided substrates to, participated in synthetic biology day with, did pcr amplification for"
Matching: [["provided substrates to", "material transfer"], ["participated in synthetic biology day with","meetup"], ["did pcr amplification for","work"]]

example: "provided thoughts and suggestions to"
Matching: [["provided thoughts and suggestions to","advice"]]

The following are the list of relationships to match:

"""


In [6]:
# Load the curated relationship files of every batch and keep only the relationship
# description ("context") and the competition year.
# The frames are collected in a list and concatenated once: DataFrame.append was removed
# in pandas 2.0 and re-copied the accumulated frame on every iteration.
curated_frames = []

for batch_no in batches:

    curated_path = "../data/processed/gpt_curated/inter-team/processed_" + str(batch_no) + "_" + version + "_" + model + "_" + str(temperature_curation).replace(".","-") + ".csv"

    curated_batch = pd.read_csv(curated_path)

    curated_frames.append(curated_batch[["context", "year"]])

# pd.concat raises on an empty list, so fall back to an empty frame when no batch is configured.
df_names = pd.concat(curated_frames) if curated_frames else pd.DataFrame()
    

In [None]:
# Ask the model to categorise every distinct relationship description, one year at a time.
#
# Requests are sent sequentially: after support for the old endpoint ended in early 2024,
# parallel processing (multiprocessing.Pool) failed much more often than before.
#
# Results are collected in lists and assembled in the `finally` clause, so whatever has
# been retrieved so far is still available in `df_matching` / `stats` if the cell is
# interrupted or a request raises.
matching_frames = []   # one single-row frame (text, year) per successful request
stats_rows = []        # one [year, time_elapsed, failed_chunks] row per finished year

try:

    for year in sorted(df_names['year'].unique()):

        start = time.time()
        print("Started year " + str(year))

        # Distinct, non-missing relationship descriptions of this year.
        contexts_year = df_names.loc[df_names['year'] == year, 'context'].dropna().unique().tolist()

        # One prompt per chunk of `breaks` descriptions.
        all_messages = [
            prompt + ', '.join(contexts_year[j:j + breaks])
            for j in range(0, len(contexts_year), breaks)
        ]

        # Positions (within all_messages) of the chunks whose request failed.
        missed_entries = []

        for chunk_no, message in enumerate(all_messages):

            output, status = send_batch_request(message)

            if status == 0:
                missed_entries.append(chunk_no)
            else:
                output['year'] = year
                matching_frames.append(output)

            time.sleep(0.01)

        end = time.time()
        time_elapsed = (end-start)
        print("finished year " + str(year) + " in " + str(time_elapsed) + " seconds")

        stats_rows.append([year, time_elapsed, str(missed_entries)])
        time.sleep(1)

finally:

    df_matching = pd.concat(matching_frames) if matching_frames else pd.DataFrame()

    if stats_rows:
        new_stats = pd.DataFrame(stats_rows, columns=["year", "time_elapsed", "failed_chunks"])
        # Skip the concat when `stats` is still the empty placeholder frame.
        stats = new_stats if stats.empty else pd.concat([stats, new_stats])
    

Started year 2009
finished year 2009 in 1.4602768421173096 seconds
Started year 2011
finished year 2011 in 0.9632911682128906 seconds
Started year 2012
finished year 2012 in 0.7175722122192383 seconds
Started year 2013
finished year 2013 in 1.4542913436889648 seconds
Started year 2014
finished year 2014 in 1.2094769477844238 seconds
Started year 2015
finished year 2015 in 14.971787452697754 seconds
Started year 2016
finished year 2016 in 26.7225284576416 seconds
Started year 2017


In [7]:
# Parse every raw model reply into (context, category) rows.
#
# The parsing itself is delegated to process_request_regex, which implements exactly the
# regex + literal_eval logic this cell used to duplicate. Its status code is logged per
# reply: 1 = pairs extracted, 2 = no pair found in the reply, 0 = parsing failed.
columns = ["context", "category"]

parsed_frames = []     # parsed frames of the replies with status 1
status_records = []    # [row_no, status] for every reply

for i in range(len(df_matching)):

    processed_df, status = process_request_regex(df_matching['text'].iloc[i], columns = columns)

    status_records.append([i, status])

    if status == 1:
        parsed_frames.append(processed_df)

# Assemble once at the end: DataFrame.append was removed in pandas 2.0 and re-copied the
# accumulated frame on every iteration.
df = pd.concat(parsed_frames) if parsed_frames else pd.DataFrame()
stats_processing = pd.DataFrame(status_records, columns = ["row_no", "status"])

In [8]:
# Write the raw replies, the request statistics, the parsing statistics and the parsed
# matchings. All four files share the same directory and the same run-specific suffix
# (<version>_<model>_<temperature>_<temperature_curation>.csv, with "." replaced by "-").
def _versioned_csv_path(prefix):
    """Return the output path for `prefix` inside the versioned results directory."""
    run_suffix = "_".join([
        version,
        model,
        str(temperature).replace(".","-"),
        str(temperature_curation).replace(".","-"),
    ])
    return "../data/processed/curated/collab_inter-team_versions/" + prefix + run_suffix + ".csv"


df_matching.to_csv(_versioned_csv_path("context_matching_"), index = False)
stats.to_csv(_versioned_csv_path("stats_context_matching_"), index = False)
stats_processing.to_csv(_versioned_csv_path("stats_context_matching_processing_"), index = False)
df.to_csv(_versioned_csv_path("processed_context_matching_"), index = False)