In [11]:
import time
import os
from dotenv import load_dotenv
import openai
import tiktoken
import cleantext
import json
import sys
import pandas as pd
import numpy as np
import ast
from fuzzywuzzy import fuzz
from nltk.corpus import stopwords
from multiprocessing import Pool
import re
from nltk.tokenize import word_tokenize

#from ipynb.fs.full.gpt_curation_custom_functions import string_process_no_stp_words, process_team_names
#from ipynb.fs.full.gpt_curation_custom_functions import fuzzy_match, fuzzy_search

# Token encoder that tiktoken associates with the GPT-4 model name.
tokenizer = tiktoken.encoding_for_model("gpt-4")

# Read a local .env file into the process environment, then hand the key
# stored under OPENAI_API_KEY to the openai client (None when it is unset).
load_dotenv()
openai.api_key = os.environ.get('OPENAI_API_KEY')

# English stop-word list from the NLTK corpus.
stp_words = stopwords.words('english')

# Per-year timing / failure log; it is filled in by the matching loop further down.
stats = pd.DataFrame()

In [12]:
def send_request_chat_competion(arguments, model = "gpt-3.5-turbo-16k", temperature = 0.3, output_format = "lol", columns = ["team", "matching"]):
    """Send one chat-completion request and wrap the reply in a one-row DataFrame.

    Parameters
    ----------
    arguments : sequence of three items
        ``(messages, variables, chunk)``: ``messages`` is the list of chat
        messages sent to the API, ``variables`` is a dict whose items are added
        as constant columns to the result, and ``chunk`` is an identifier that
        is handed back to the caller when the request fails.
    model : str
        Model name forwarded to ``openai.ChatCompletion.create``.
    temperature : float
        Sampling temperature forwarded to ``openai.ChatCompletion.create``.
    output_format : str
        Only ``"lol"`` is implemented: the stripped reply text is stored as-is
        in a ``text`` column.
    columns : list
        Not used by this function; kept so existing callers keep working.

    Returns
    -------
    tuple
        ``(DataFrame, None)`` on success, ``(empty DataFrame, chunk)`` when the
        request or the extraction of the reply text fails.

    Raises
    ------
    ValueError
        If ``output_format`` is not supported. This is checked before the API
        is called, so an unsupported format never costs a request.
    """

    # Previously an unsupported format fell through to the return statement
    # with `processed_df` unbound and crashed with UnboundLocalError.
    if output_format != "lol":
        raise ValueError("Unsupported output_format: " + repr(output_format) + " (only 'lol' is implemented)")

    message = arguments[0]
    variables = arguments[1]
    chunk = arguments[2]

    try:
        response = openai.ChatCompletion.create(model = model, messages = message, temperature = temperature, request_timeout = 60)
        final_response = response.choices[0].message["content"].strip()

    except Exception as error:
        # Best effort: report the chunk as missed instead of aborting the whole
        # batch, but say why, and let KeyboardInterrupt/SystemExit propagate.
        print("Request for chunk " + str(chunk) + " failed: " + repr(error), file = sys.stderr)
        return (pd.DataFrame(), chunk)

    processed_df = pd.DataFrame([[final_response]], columns = ["text"])

    for key, value in variables.items():
        processed_df[key] = value

    return (processed_df, None)


In [37]:
# --- Identification of the curated input files --------------------------------
# `version`, `model` and `temperature_curation` are all part of the file name
# of the curated batches that get loaded below.
version = "two-step"
temperature_curation = 0.9

# Alternative: derive the batch list from the files on disk.
#batches = os.listdir("../data/processed/text_batches_inter-team_filtered/")
#batches = [batch.replace(".csv", "") for batch in batches]
batches = ["1", "2"]

# --- Settings for this context-matching run -----------------------------------
#model = "gpt-4-0125-preview"
model = "gpt-3.5-turbo-16k"
temperature = 0.3
output_format = "lol"

# Chunk size: number of relationship names joined into a single request.
breaks = 10

# Team metadata table.
meta = pd.read_csv("../data/raw/team_meta.csv")


In [38]:

# Gather the (context, year) columns of every curated batch file that exists.
# Frames are collected in a list and concatenated once: DataFrame.append was
# removed in pandas 2.0, and the former bare `except:` would have hidden that
# failure and left `df_names` silently empty.
curated_frames = []

for batch_no in batches:

    curated_path = "../data/processed/gpt_curated/inter-team/processed_" + str(batch_no) + "_" + version + "_" + model + "_" + str(temperature_curation).replace(".","-") + ".csv"

    try:
        batch_df = pd.read_csv(curated_path)

    except FileNotFoundError:
        # A batch without a curated file is skipped, but visibly.
        print("Skipping batch " + str(batch_no) + ": " + curated_path + " not found")
        continue

    # A file lacking these columns is a real schema problem and raises KeyError.
    curated_frames.append(batch_df[["context", "year"]])

if curated_frames:
    df_names = pd.concat(curated_frames)
else:
    # Keep the expected columns so later `df_names['year']` lookups still work.
    df_names = pd.DataFrame(columns = ["context", "year"])

In [39]:
# Prompt sent ahead of every chunk of relationship names. It lists the five
# target categories and shows, by example, the [[RELATIONSHIP, CATEGORY]] reply
# layout that the parsing step later extracts with a regular expression.
# The relationship strings in each example input and its "Matching" answer must
# be identical ("pcr" was previously misspelled "pct" in the input only), so
# that the examples never show a relationship name being rewritten.

prompt = """

Match each relationship name to the closest possible category listed below:

Provide each matching as [[RELATIONSHIP, MATCHING CATEGORY]]

The possible categories are:

"work": Teams worked together or collaborated on aspects of their projects
"material transfer": One team shared information, data, synthetic biology parts or laboratory materials with the other. 
"advice": One team gave advice or support to the other team concerning their project or about the competition.
"meetup": Teams met each other at a meetup or in a social setting and discussed their project.
"other": Contexts not fitting to the categories listed above.

example: "provided substrates to, participated in synthetic biology day with, did pcr amplification for"
Matching: [["provided substrates to", "material transfer"], ["participated in synthetic biology day with","meetup"], ["did pcr amplification for","work"]]

example: "provided thoughts and suggestions to"
Matching: [["provided thoughts and suggestions to","advice"]]

The following are the list of relationships to match:

"""


In [40]:
# Classify the relationship names year by year.
# For each year the distinct names are split into chunks of `breaks` names,
# every chunk becomes one chat request, and the requests of a year are sent
# through a pool of 8 worker processes. Replies are gathered in `df_matching`;
# per-year timing and the messages of failed chunks are gathered in `stats`.
# NOTE(review): worker processes must be able to resolve the notebook-defined
# request function; this presumably relies on the "fork" start method, so
# confirm on other platforms.
matched_frames = []
stats_rows = []

for year in sorted(df_names['year'].unique()):

    start = time.time()
    print("Started year " + str(year))

    year_rows = df_names[df_names['year'] == year]

    # Distinct relationship names of this year, without missing values.
    contexts = list(year_rows['context'].dropna().unique())

    # One request per chunk: [messages, extra columns, chunk number].
    all_messages = []
    for chunk_no, first in enumerate(range(0, len(contexts), breaks)):

        messages = [
            {"role": "user", "content": prompt},
            {"role": "user", "content": ', '.join(contexts[first:first + breaks])},
        ]
        all_messages.append([messages, {"year": year}, chunk_no])

    missed_entries = []

    if all_messages:

        # Pass the configured model / temperature / output format explicitly.
        # With a plain `pool.map` the request function fell back to its own
        # defaults, so changing the configuration only renamed the output files.
        # The tuple order matches the function's positional parameters.
        call_arguments = [(request, model, temperature, output_format) for request in all_messages]

        with Pool(processes = 8) as pool:
            results = pool.starmap(send_request_chat_competion, call_arguments)

        for output, miss in results:

            if miss is None:
                output['year'] = year
                matched_frames.append(output)
            else:
                # Record the messages of the failed chunk for the stats file.
                missed_entries.extend(all_messages[miss][0])

    end = time.time()
    time_elapsed = (end-start)
    print("finished year " + str(year) + " in " + str(time_elapsed) + " seconds")

    stats_rows.append([year, time_elapsed, str(missed_entries)])

    # Short pause before the next year's requests (presumably to ease API rate limits).
    time.sleep(1)

# Build both frames once. DataFrame.append was removed in pandas 2.0, and
# rebuilding `stats` here keeps the cell idempotent when it is re-run.
if matched_frames:
    df_matching = pd.concat(matched_frames, ignore_index = True)
else:
    df_matching = pd.DataFrame(columns = ["text", "year"])

stats = pd.DataFrame(stats_rows, columns = ["year", "time_elapsed", "failed_chunks"])
    

Started year 2009
finished year 2009 in 1.991706132888794 seconds
Started year 2011
finished year 2011 in 1.1019747257232666 seconds
Started year 2012
finished year 2012 in 0.9438886642456055 seconds
Started year 2014
finished year 2014 in 1.3459153175354004 seconds
Started year 2015
finished year 2015 in 6.0601646900177 seconds
Started year 2016
finished year 2016 in 4.832598924636841 seconds
Started year 2017
finished year 2017 in 6.0291595458984375 seconds
Started year 2018
finished year 2018 in 5.185008764266968 seconds


In [41]:
# Turn the raw model replies in `df_matching['text']` into a flat table of
# (context, category) pairs and log a status per reply row:
#   1 = at least one pair extracted, 2 = no pair found, 0 = parsing failed.
columns = ["context", "category"]


def _extract_pairs(text):
    """Return the two-element list literals found in one model reply.

    Every substring shaped like ``["...", "..."]`` is parsed with
    ``ast.literal_eval`` (which only accepts literals and executes no code);
    parsed lists that do not have exactly two elements are dropped.
    Raises whatever ``re.findall`` / ``ast.literal_eval`` raise on bad input.
    """
    candidates = re.findall(r'\[["].*?["],[ ]?["].*?["]\]', text)
    parsed = [ast.literal_eval(candidate) for candidate in candidates]
    return [pair for pair in parsed if len(pair) == 2]


matched_pairs = []
status_records = []

for i in range(len(df_matching)):

    try:
        pairs = _extract_pairs(df_matching['text'].iloc[i])

    except Exception:
        # Best effort: an unparsable reply is logged and skipped, not fatal.
        status_records.append([i, 0])
        continue

    if pairs:
        matched_pairs.extend(pairs)
        status_records.append([i, 1])
    else:
        status_records.append([i, 2])

# Build each frame once instead of growing it row by row with
# DataFrame.append, which was removed in pandas 2.0.
df = pd.DataFrame(matched_pairs, columns = columns)
stats_processing = pd.DataFrame(status_records, columns = ["row_no", "status"])

In [42]:
# Write the four result tables. All file names share one suffix built from the
# run settings; `breaks` is an int and must be converted before being joined
# (concatenating it directly to a str raised TypeError).
output_dir = "../data/processed/gpt_curated/inter-team/"
run_suffix = "_".join([
    version,
    str(breaks),
    model,
    str(temperature).replace(".","-"),
    str(temperature_curation).replace(".","-"),
]) + ".csv"

df_matching.to_csv(output_dir + "context_matching_" + run_suffix, index = False)
stats.to_csv(output_dir + "stats_context_matching_" + run_suffix, index = False)
stats_processing.to_csv(output_dir + "stats_context_matching_processing_" + run_suffix, index = False)
df.to_csv(output_dir + "processed_context_matching_" + run_suffix, index = False)