In [5]:
#Load the necessary libraries
import time
import os
from dotenv import load_dotenv
import openai
import tiktoken
import cleantext
import json
import sys
import pandas as pd
import ast
from fuzzywuzzy import fuzz
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from multiprocessing import Pool

#from ipynb.fs.full.gpt_curation_custom_functions import send_batch_request
#from ipynb.fs.full.gpt_curation_custom_functions import string_process_no_stp_words, process_team_names
#from ipynb.fs.full.gpt_curation_custom_functions import fuzzy_match, fuzzy_search

#Token encoder associated with the gpt-4 model
tokenizer = tiktoken.encoding_for_model("gpt-4")

#Read variables from a local .env file into the process environment
load_dotenv()

#The API key is taken from the environment rather than being written in the notebook
openai.api_key = os.environ.get('OPENAI_API_KEY')

#English stop-word list from nltk
stp_words = stopwords.words('english')

#Collects one row of extraction statistics per batch (filled in by the extraction loop)
stats = pd.DataFrame()

In [6]:

#Version label for this run; it is embedded in the name of every output file written by this notebook.
#The label is custom - based on my prompt styles. "Specific" is one-shot with a particular explanation of the task.
version = "two-step"

#Read the prompt text; the context manager closes the file handle as soon as the text is loaded
with open("prompts/prompt_inter-team_version_specific_lol", "r") as f:
    prompt = f.read().rstrip()

meta = pd.read_csv("../data/raw/team_meta.csv")

#batches = os.listdir("../data/processed/text_batches_inter-team_filtered/")
#batches = [batch.replace(".csv", "") for batch in batches]
#batches = sorted(batches)

#The first 2 collaboration batches are the ones manually curated
batches = ["1", "2"]

#model = "gpt-4-turbo"
#model = "gpt-4-0125-preview"
model = "gpt-3.5-turbo-16k"

#The level of creativity in the gpt response.
#NOTE(review): this comment used to say the extraction task is set at 0.3, but the value assigned here is 0.7 - confirm which one is intended.
temperature = 0.7


In [7]:
def send_request_chat_competion(arguments, model = "gpt-3.5-turbo-16k", temperature = 0.7, output_format = "lol", columns = ['team', 'context', 'target']):
    """Send one chat-completion request and wrap the raw reply in a one-row DataFrame.

    Parameters
    ----------
    arguments : sequence of length 3
        ``(message, variables, chunk)``: ``message`` is the list of chat
        messages handed to ``openai.ChatCompletion.create``; ``variables`` is a
        dict whose items are added to the result as extra columns; ``chunk`` is
        an identifier that is returned unchanged so the caller can match the
        reply to its request.
    model : str
        Model name passed to the API.
    temperature : float
        Sampling temperature passed to the API.
    output_format, columns
        Not used by this function; kept so existing callers keep working.

    Returns
    -------
    tuple
        ``(df, status, chunk)``. On success ``status`` is 1 and ``df`` has a
        ``text`` column holding the stripped reply plus one column per key of
        ``variables``. If the request or the reply extraction raises, ``status``
        is 0 and ``df`` is an empty DataFrame.
    """
    message, variables, chunk = arguments[0], arguments[1], arguments[2]

    try:
        response = openai.ChatCompletion.create(model = model, messages = message, temperature = temperature, request_timeout = 60)
        final_response = response.choices[0].message["content"].strip()

    except Exception as err:
        # Only ordinary errors are treated as a failed request; KeyboardInterrupt
        # and SystemExit propagate so the run can actually be stopped.
        # The reason is reported instead of being silently dropped.
        print("Request for chunk " + str(chunk) + " failed: " + repr(err), file = sys.stderr)
        return(pd.DataFrame(), 0, chunk)

    df = pd.DataFrame([[final_response]], columns=["text"])

    # Attach the caller-supplied metadata as additional columns of the single row
    for key, value in variables.items():
        df[key] = value

    return(df, 1, chunk)


def process_request_regex(text, output_format = "lol", columns = ["team", "context", "target"]):
    """Extract three-element string lists from a model reply into a DataFrame.

    Parameters
    ----------
    text : str
        Raw reply text to scan.
    output_format : str
        Only ``"lol"`` is supported: every ``["a", "b", "c"]``-shaped
        fragment on a single line of ``text`` is picked up by a regular
        expression and parsed with ``ast.literal_eval``.
    columns : list of str
        Column names for the resulting DataFrame (one per list element).

    Returns
    -------
    tuple
        ``(df, status)`` where ``status`` is
        1 - at least one three-element list was parsed (``df`` holds them),
        2 - no three-element list was found (``df`` is empty),
        0 - parsing or DataFrame construction raised (``df`` is empty).

    Raises
    ------
    ValueError
        If ``output_format`` is not ``"lol"``. Previously this case failed with
        an ``UnboundLocalError`` on the return statement.
    """
    if output_format != "lol":
        raise ValueError("Unsupported output_format: " + repr(output_format))

    try:
        candidates = re.findall(r'\[["].*?["],[ ]?["].*?["],[ ]?["].*?["]\]', text)
        parsed = [ast.literal_eval(candidate) for candidate in candidates]

        # The lazy pattern can still swallow a longer list; keep exact triples only
        triples = [item for item in parsed if len(item) == 3]

        if len(triples) == 0:
            return(pd.DataFrame(), 2)

        processed_df = pd.DataFrame(triples, columns = columns)

    except Exception:
        # A fragment that matched the pattern but is not a valid literal (or a
        # columns/shape mismatch) marks the whole reply as unparseable.
        return(pd.DataFrame(), 0)

    return(processed_df, 1)


In [8]:


for batch_no in batches:

    #Raw GPT replies for this batch and run configuration are written here; an existing file means the batch is skipped
    output_path = "../data/processed/gpt_curated/inter-team/" + batch_no + "_" + version + "_" + model + "_" + str(temperature).replace(".","-") + ".csv"

    if (not os.path.isfile(output_path)):

        print("Started batch " + batch_no)
        start = time.time()

        batch = pd.read_csv("../data/processed/text_batches_inter-team_collab/" + batch_no + ".csv")

        #Row positions of the chunks whose request failed
        gpt_mistakes = []

        #Build one request per row of the batch: [messages, metadata to attach to the reply, row position]
        all_messages = []

        for i in range(0,len(batch)):

            messages = [{"role": "user", "content": prompt}]
            messages.append({"role": "user", "content": "\nWe are team " + batch['team'].iloc[i] + "\n." + "The following text describes our activities in a scientific competition called iGEM and could also describe our relationships with other iGEM teams:"})
            messages.append({"role": "user", "content": batch['text'].iloc[i]})
            all_messages.append([messages, {"source_team": batch['team'].iloc[i], "year": batch['year'].iloc[i]}, i])

        #Forward the configured model and temperature explicitly. With a plain pool.map the
        #function would silently fall back to its own defaults, while the output file is
        #named after the configured values.
        request_args = [(message, model, temperature) for message in all_messages]

        #Collect the per-chunk frames and concatenate once (DataFrame.append was removed in pandas 2.0)
        outputs = []

        with Pool(processes = 8) as pool:

            for output, status, chunk in pool.starmap(send_request_chat_competion, request_args):

                output['chunk'] = chunk
                outputs.append(output)

                #status 0 means the request failed
                if status == 0:
                    gpt_mistakes.append(chunk)

        end = time.time()
        time_elapsed = (end-start)

        #An empty batch yields no frames; pd.concat would raise on an empty list
        df = pd.concat(outputs) if outputs else pd.DataFrame()

        stats = pd.concat([stats, pd.DataFrame([["extraction", batch_no, time_elapsed, str(gpt_mistakes)]], columns=["type", "batch_no", "time_secs", "failed_chunks"])])

        print("Completed batch " + batch_no + " in " + str(time_elapsed) + " seconds")

        df.to_csv(output_path, index=False)
        time.sleep(1)


Started batch 1
Completed batch 1 in 33.790950536727905 seconds
Started batch 2
Completed batch 2 in 7.660872220993042 seconds


In [9]:
#One single-row frame per processed chunk: batch number, chunk position and parsing status
stats_frames = []

for batch_no in batches:

    #Shared file-name suffix of the raw and the processed file of this batch
    file_suffix = batch_no + "_" + version + "_" + model + "_" + str(temperature).replace(".","-") + ".csv"

    df = pd.read_csv("../data/processed/gpt_curated/inter-team/" + file_suffix)

    #Parsed interactions of every chunk in this batch; concatenated once after the loop
    #(DataFrame.append was removed in pandas 2.0 and copies the whole frame on every call)
    interaction_frames = []

    for loop_var in range(len(df)):

        t, flag = process_request_regex(str(df['text'][loop_var]))

        if (len(t) > 0):

            #Attach the provenance of the chunk to every extracted interaction
            t['source_team'] = df['source_team'][loop_var]
            t['year'] = df['year'][loop_var]
            t['batch_id'] = batch_no
            t['chunk_id'] = df['chunk'][loop_var]

            interaction_frames.append(t)

        stats_frames.append(pd.DataFrame([[batch_no, loop_var, flag]], columns=["batch_no", "chunk_no", "status"]))

    #No parsed interactions -> keep writing an empty frame, as before
    df_interactions = pd.concat(interaction_frames) if interaction_frames else pd.DataFrame()

    df_interactions.to_csv("../data/processed/gpt_curated/inter-team/processed_" + file_suffix, index=False)
    print("Completed batch " + batch_no)

stats_processing = pd.concat(stats_frames) if stats_frames else pd.DataFrame()

Completed batch 1
Completed batch 2


In [12]:
#Write the extraction statistics and the parsing statistics; both file names carry the run configuration
stats.to_csv(f"../data/processed/gpt_curated/inter-team/stats_extraction_{version}_{model}_{str(temperature).replace('.', '-')}.csv")
stats_processing.to_csv(f"../data/processed/gpt_curated/inter-team/stats_processing_{version}_{model}_{str(temperature).replace('.', '-')}.csv")

In [13]:
#stats

Unnamed: 0,type,batch_no,time_secs,failed_chunks
0,extraction,1,33.790951,[]
0,extraction,2,7.660872,"[3, 7, 10, 11, 14, 15, 19, 21, 22, 26, 29, 30,..."


In [144]:
batches = ["1", "2"]

#Processed frames of the batches that could be read; concatenated once after the loop
#(DataFrame.append was removed in pandas 2.0)
processed_frames = []

for batch_no in batches:

    processed_path = "../data/processed/gpt_curated/inter-team/processed_" + batch_no + "_" + version + "_" + model + "_" + str(temperature).replace(".","-") + ".csv"

    try:
        processed_frames.append(pd.read_csv(processed_path))

    except Exception as err:
        #Still best-effort (a missing or empty processed file just skips the batch),
        #but the reason is shown and KeyboardInterrupt is no longer swallowed
        print("Skipping batch " + batch_no + ": " + repr(err))
        continue

df_interactions = pd.concat(processed_frames) if processed_frames else pd.DataFrame()

In [145]:
#Write the interactions of all batches to a single file named after the run configuration
df_interactions.to_csv(
    f"../data/processed/gpt_curated/inter-team/processed_{version}_{model}_{str(temperature).replace('.', '-')}.csv",
    index=False,
)