In [5]:
import time
import os
from dotenv import load_dotenv
import openai
import tiktoken
import cleantext
import json
import sys
import pandas as pd
import ast
from fuzzywuzzy import fuzz
from nltk.corpus import stopwords
from multiprocessing import Pool
import re
from nltk.tokenize import word_tokenize


# Tokenizer matching the "gpt-4" encoding.
# NOTE(review): not referenced by any of the cells visible here - confirm it is still needed.
tokenizer = tiktoken.encoding_for_model("gpt-4")

# Read variables from a local .env file into the process environment so the
# API key does not have to be written into the notebook.
load_dotenv()

# Take the OpenAI API key from the OPENAI_API_KEY environment variable.
# os.getenv returns None when the variable is not set.

openai.api_key = os.getenv('OPENAI_API_KEY')

# Run statistics; the per-year matching loop adds one row per processed year
# (year, elapsed seconds, failed chunks).
stats = pd.DataFrame()

# English stop-word list from NLTK, consulted by string_process_no_stp_words.
stp_words = stopwords.words('english')


In [6]:
def send_batch_request(arguments, model = "gpt-3.5-turbo-instruct", temperature = 0.3, output_format = "lol", columns = ["team", "context", "target"]):
    """Send one prompt to the OpenAI completions endpoint and wrap the reply.

    NOTE: uses the legacy ``openai.Completion`` interface, which only exists
    in openai-python releases before 1.0.

    Parameters
    ----------
    arguments : str
        The prompt forwarded to ``openai.Completion.create``.
    model : str
        Engine name passed as ``engine``.
    temperature : float
        Sampling temperature.
    output_format, columns
        Not used by this function; kept so existing callers keep working.

    Returns
    -------
    tuple(pandas.DataFrame, int)
        ``(frame, 1)`` on success, where ``frame`` has a single row and a
        single column ``"text"`` holding the stripped text of the first
        choice. ``(empty frame, 0)`` when the request or the extraction of
        the reply fails.
    """
    try:
        response = openai.Completion.create(
            prompt = arguments,
            engine = model,
            temperature = temperature,
            max_tokens = 2000,
            top_p = 1,
            frequency_penalty = 0,
            presence_penalty = 0,
            timeout = 200
        )
        # Kept inside the guarded region so a reply without choices is
        # reported as a failed request instead of crashing the caller's loop.
        completion_text = response.choices[0].text.strip()

    except Exception:
        # `Exception` (not a bare except) so Ctrl-C / kernel interrupts still
        # stop a long run.
        return (pd.DataFrame(), 0)

    # Build the one-row frame directly; DataFrame.append no longer exists in
    # pandas >= 2.0.
    return (pd.DataFrame([[completion_text]], columns=["text"]), 1)


def string_process_no_stp_words(name):
    """Derive several normalised variants of a name.

    Leading/trailing whitespace is stripped and commas are replaced by
    spaces; underscores and hyphens are left untouched. The cleaned string
    is then tokenised and tokens found in ``stp_words`` are dropped. The
    stop-word comparison is case-sensitive and runs on the original-case
    tokens.

    Returns a 4-tuple:
        (cleaned name,
         cleaned name in lower case,
         cleaned name without stop words, tokens joined by single spaces,
         first characters of the remaining tokens joined together)
    """
    cleaned = name.strip().replace(",", " ")
    lowered = cleaned.lower()

    kept_tokens = []
    initials = []
    for token in word_tokenize(cleaned):
        if token in stp_words:
            continue
        kept_tokens.append(token)
        initials.append(token[0])

    return (cleaned, lowered, " ".join(kept_tokens), "".join(initials))


def process_team_names(name):
    """Return the cleaned form of a team name, or None when nothing is left.

    The cleaning is delegated to string_process_no_stp_words; only its first
    result (the stripped, comma-free name in its original case) is used.
    None is returned both for an empty input and for an input that is empty
    after cleaning.
    """
    if not len(name) > 0:
        return None

    # Original case is kept on purpose: it carries information about proper nouns.
    cleaned = string_process_no_stp_words(name)[0]

    if not len(cleaned) > 0:
        return None

    return cleaned.strip()
        

def process_request_regex(text, output_format = "lol", columns = ["team", "matching"]):
    """Extract two-element string lists from a model reply.

    The reply is scanned for fragments of the form ``["...", "..."]`` (with
    an optional single space after the comma). Each fragment is parsed with
    ``ast.literal_eval`` and only lists of exactly two elements are kept.
    Because ``.`` does not match newlines, a pair that spans several lines
    is not found.

    Parameters
    ----------
    text : str
        Raw model reply.
    output_format : str
        Only ``"lol"`` (list of lists) is supported.
    columns : list of str
        Column names of the resulting frame; must have two entries.

    Returns
    -------
    tuple(pandas.DataFrame, int)
        ``(frame, 1)``  - at least one pair was extracted.
        ``(empty, 2)``  - the reply contained no usable pair.
        ``(empty, 0)``  - the reply could not be parsed.

    Raises
    ------
    ValueError
        If ``output_format`` is not ``"lol"``. (Previously this path failed
        with an UnboundLocalError at the final return.)
    """
    if output_format != "lol":
        raise ValueError("Unsupported output_format: " + repr(output_format))

    try:
        candidates = re.findall(r'\[["].*?["],[ ]?["].*?["]\]', text)
        pairs = [ast.literal_eval(candidate) for candidate in candidates]

        # The lazy pattern can swallow a `", "` separator and match a longer
        # list; keep genuine pairs only.
        pairs = [pair for pair in pairs if len(pair) == 2]

        if len(pairs) == 0:
            return (pd.DataFrame(), 2)

        processed_df = pd.DataFrame(pairs, columns = columns)

    except Exception:
        # Any malformed fragment invalidates the whole reply. `Exception`
        # instead of a bare except keeps kernel interrupts working.
        return (pd.DataFrame(), 0)

    return (processed_df, 1)


In [7]:
# Run configuration. `version` and `temperature_curation` identify the curated
# input files read in the next cell; all of these values also end up in the
# names of the output files.
version = "two-step"
output_format = "lol"
temperature_curation = 0.3

# Alternative ways of listing the batches to process:
#batches = os.listdir("../data/processed/batches_inter-team_collab/")
#batches = os.listdir("../data/processed/all_batches_inter-team_fuzzy_select_freq/")
#batches = [batch.replace(".csv", "") for batch in batches]

batches = ["1", "2"]

model = "gpt-3.5-turbo-instruct"

temperature = 0.3

# Number of team names placed in a single prompt.
breaks = 10

# Team metadata; the matching loop reads its 'Year' and 'Team' columns.
meta = pd.read_csv("../data/raw/team_meta.csv")

# Read the prompt template inside a context manager so the file handle is
# closed again (it was previously left open).
with open("prompts/prompt_inter-team_matching_" + output_format, "r") as f:
    prompt = f.read().rstrip()

In [8]:

# Collect every team name mentioned in the curated batches, together with its
# year: both the reporting team ('team') and the team it refers to ('target').
# The pieces are gathered in a list and concatenated once, because
# DataFrame.append no longer exists in pandas >= 2.0.
name_frames = []

for batch_no in batches:

    df = pd.read_csv("../data/processed/gpt_curated/inter-team/processed_" + batch_no + "_" + version + "_" + model + "_" + str(temperature_curation).replace(".","-") + ".csv")

    name_frames.append(df[["team", "year"]])

    # Targets are stored under the same 'team' column so both roles can be
    # handled uniformly downstream.
    name_frames.append(pd.DataFrame({"team":df['target'], "year":df['year']}))

# With no batches, fall back to an empty frame that still has the expected
# columns so the following cells do not fail on a missing 'year' column.
df_names = pd.concat(name_frames) if name_frames else pd.DataFrame(columns=["team", "year"])

In [9]:
# For every year, ask the model to match the team names found in the curated
# data against the official team list of that year (from `meta`).
#
# Successful replies are collected in `matching_frames` and concatenated once
# at the end; if the run is interrupted, the partial results are still
# available in that list.
matching_frames = []

for year in sorted(df_names['year'].unique()):

    start = time.time()
    print("Started year " + str(year))

    # Names whose request failed for this year (so they can be retried).
    missed_entries = []

    temp = df_names[df_names['year'] == year]

    # Official team names of this year; names that clean to nothing are
    # dropped so the join below cannot hit a None.
    list_of_names = list(meta[meta['Year'] == year]['Team'])
    list_of_names_edited = [process_team_names(str(name)) for name in list_of_names]
    list_of_names_edited = [name for name in list_of_names_edited if name is not None]

    # Only names that are not already an exact official name need matching.
    l = list(temp['team'].unique())
    l = [str(element) for element in l if not any(element == name for name in list_of_names)]

    l_edited = [process_team_names(str(element)) for element in l]
    l_edited = [i for i in l_edited if i is not None]

    # Split the cleaned names into groups of `breaks`. The split is driven by
    # l_edited itself (not by `l`, which can be longer), so no prompt with an
    # empty name list is sent.
    chunks = [l_edited[j:j + breaks] for j in range(0, len(l_edited), breaks)]

    known_names = ", ".join(list_of_names_edited)
    all_messages = ["The following list contains the names of teams: " + known_names + prompt + ', '.join(chunk) for chunk in chunks]

    # Requests are sent sequentially: after support for the old endpoint ended
    # early 2024, parallel processing seems to fail much more than before.
    for chunk, message in zip(chunks, all_messages):

        # Pass the configured model/temperature explicitly so the request
        # always agrees with the values encoded in the output file names.
        output, miss = send_batch_request(message, model = model, temperature = temperature)

        if miss == 0:
            # Remember the names of the chunk that actually failed.
            missed_entries.extend(chunk)
        else:
            output['year'] = year
            matching_frames.append(output)

        time.sleep(0.01)

    end = time.time()
    time_elapsed = (end-start)
    print("finished year " + str(year) + " in " + str(time_elapsed) + " seconds")

    # `stats` is created in the setup cell; pd.concat replaces the removed
    # DataFrame.append.
    stats = pd.concat([stats, pd.DataFrame([[year, time_elapsed, str(missed_entries)]], columns=["year", "time_elapsed", "failed_chunks"])], ignore_index=True)
    time.sleep(1)

df_matching = pd.concat(matching_frames, ignore_index=True) if matching_frames else pd.DataFrame(columns=["text", "year"])
    

Started year 2009
finished year 2009 in 3.125364303588867 seconds
Started year 2011
finished year 2011 in 20.414579153060913 seconds
Started year 2012
finished year 2012 in 2.444157123565674 seconds
Started year 2013
finished year 2013 in 3.0469248294830322 seconds
Started year 2014
finished year 2014 in 1.4547080993652344 seconds
Started year 2015
finished year 2015 in 31.438679933547974 seconds
Started year 2016
finished year 2016 in 38.56688833236694 seconds
Started year 2017
finished year 2017 in 61.172534465789795 seconds
Started year 2018
finished year 2018 in 53.55965065956116 seconds


In [11]:
# Write the raw model replies and the per-year run statistics to disk. Both
# file names carry the same run tag:
# <breaks>_<version>_<model>_<temperature>_<temperature_curation>, with the
# decimal point of the temperatures replaced by "-".
run_tag = "_".join([
    str(breaks),
    version,
    model,
    str(temperature).replace(".","-"),
    str(temperature_curation).replace(".","-"),
])

df_matching.to_csv("../data/processed/gpt_curated/inter-team/team_name_matchings_" + run_tag + ".csv", index = False)
stats.to_csv("../data/processed/gpt_curated/inter-team/stats_team_name_matchings_" + run_tag + ".csv", index = False)

In [12]:
# Parse the saved model replies into (team, matching) pairs, one frame per
# reply, and write the combined result next to the raw replies.
run_tag = "_".join([
    str(breaks),
    version,
    model,
    str(temperature).replace(".","-"),
    str(temperature_curation).replace(".","-"),
])

df = pd.read_csv("../data/processed/gpt_curated/inter-team/team_name_matchings_" + run_tag + ".csv")

# Frames and status records are gathered in plain lists and turned into
# DataFrames once; DataFrame.append no longer exists in pandas >= 2.0.
interaction_frames = []
status_records = []

for row_no, (raw_text, year) in enumerate(zip(df['text'], df['year'])):

    # flag: 1 = pairs extracted, 2 = no usable pair, 0 = reply could not be parsed
    t, flag = process_request_regex(str(raw_text))

    if (len(t) > 0):

        t['year'] = year
        interaction_frames.append(t)

    status_records.append({"row_no": row_no, "status": flag})

df_interactions = pd.concat(interaction_frames) if interaction_frames else pd.DataFrame()
stats_processing = pd.DataFrame(status_records, columns=["row_no", "status"])

df_interactions.to_csv("../data/processed/gpt_curated/inter-team/processed_team_name_matchings_" + run_tag + ".csv", index=False)
