In [6]:
#Load the necessary libraries
import time
import os
from dotenv import load_dotenv
import openai
import tiktoken
import cleantext
import json
import sys
import pandas as pd
import ast
from fuzzywuzzy import fuzz
from multiprocessing import Pool
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

#Tokenizer for the gpt-4 encoding
tokenizer = tiktoken.encoding_for_model("gpt-4")

#Accumulates one timing row per processed batch
stats = pd.DataFrame()

#Read the .env file first, then pick up the api key from the environment
load_dotenv()
openai.api_key = os.environ.get('OPENAI_API_KEY')

#English stop words from nltk
stp_words = stopwords.words('english')

In [7]:

#Version is custom - based on my prompt styles. Specific is one-shot with particular explanation on the task
#NOTE(review): version is "two-step" while the prompt file loaded below is the "specific" variant - confirm this is intended
version = "two-step"

#Setting the output format as a list of lists
output_format = "lol"


#The text file holding the prompt; the context manager closes the handle as soon as the prompt is read
with open("prompts/prompt_inter-team_version_specific_lol", "r") as f:
    prompt = f.read().rstrip()

meta = pd.read_csv("../data/raw/team_meta.csv")

#Batch names are the csv file names without their extension, sorted as strings.
#Directory entries that are not csv files are skipped, since they cannot be read as a batch later.
#(alternative input directory noted in the original comment: all_batches_inter-team_fuzzy_select_freq/)
batches = os.listdir("../data/processed/text_batches_inter-team_collab/")
batches = sorted(batch[:-len(".csv")] for batch in batches if batch.endswith(".csv"))

model = "gpt-3.5-turbo-instruct"

#The level of creativity in the gpt response - for the extraction task set at 0.3
temperature = 0.3

#The number of parallel chunks to evaluate at the same time: gpt-instruct limit is 20
jump = 20


In [8]:
def send_batch_request(arguments, model = "gpt-3.5-turbo-instruct", temperature = 0.3, output_format = "lol", columns = ["team", "context", "target"]):
    """Send one group of prompts in a single completion request and tabulate the replies.

    Parameters
    ----------
    arguments : sequence with three items
        arguments[0] - list of prompt strings sent together in one request.
        arguments[1] - dict mapping a column name to a list of values aligned
        with the prompts; each becomes a column of the result.
        arguments[2] - identifier of the group, used only in the failure message.
    model : str
        Passed to the endpoint as ``engine``.
    temperature : float
        Sampling temperature passed to the endpoint.
    output_format, columns
        Kept for backward compatibility; not used by the function.

    Returns
    -------
    (pandas.DataFrame, int)
        On success: one row per prompt with the stripped completion in ``text``
        plus one column per key of ``arguments[1]``, and the flag 1.
        On a failed request: an empty DataFrame and the flag 0.
    """

    chunks = arguments[0]
    variables = arguments[1]
    ids = arguments[2]

    try:

        response = openai.Completion.create(
                    prompt = chunks ,
                    engine = model,
                    temperature = temperature,
                    max_tokens = 2000,
                    top_p = 1,
                    frequency_penalty = 0,
                    presence_penalty = 0,
                    timeout = 200
                    )

    except Exception as error:
        #Best effort: a failed request is reported and skipped by the caller.
        #Catching Exception (not a bare except) keeps Ctrl-C / kernel interrupt working.
        print("Request for chunk group " + str(ids) + " failed: " + repr(error), file = sys.stderr)
        return(pd.DataFrame(), 0)

    #With several prompts per request the completions are matched to the prompts
    #through their index field rather than through their position in the list
    choices = sorted(response.choices, key = lambda choice: choice.index)

    rows = []

    for j in range(0, len(chunks)):

        row = pd.DataFrame([[choices[j].text.strip()]], columns=["text"])

        for key, value in variables.items():
            row[key] = value[j]

        rows.append(row)

    if not rows:
        return(pd.DataFrame(), 1)

    #DataFrame.append was removed in pandas 2.0; a single concat also avoids re-copying the frame per row
    return(pd.concat(rows), 1)


In [10]:
#Token limit = 1,000,000 per minute (Tier 4 account)
#Each chunk = 500 tokens (max)
#Chunk + Prompt ~ 1000 tokens
#Expected output token size ~ 1000 tokens (max)

#Each request = 2000 tokens. Max requests per minute = 200 
#Number of requests allowed by gpt-instruct in parallel = 20 (20*2000 = 40000 tokens)
#Using multiprocessing - starting with 8 processes (GPT limit not breached)

#NOTE(review): this replaces the batch list built from the input directory - looks like a test-run leftover, confirm
batches = ["1", "2"]

input_dir = "../data/processed/text_batches_inter-team_collab/"
output_dir = "../data/processed/gpt_curated/inter-team/"

#to_csv does not create missing directories; make sure the results can be written before paying for requests
os.makedirs(output_dir, exist_ok = True)

for batch_no in batches:

    #One path serves both the resume check and the final write. The check used to be built
    #from output_format while the file was written with model in its name, so finished
    #batches were never recognised and were sent again on every run.
    output_path = output_dir + batch_no + "_" + version + "_" + model + "_" + str(temperature).replace(".","-") + ".csv"

    if os.path.isfile(output_path):
        continue

    print("Started batch " + batch_no)
    start = time.time()

    batch = pd.read_csv(input_dir + batch_no + ".csv")

    #NOTE(review): the prompt is joined to "We are team" without a separator and "\n." puts the
    #full stop after the line break - left untouched because it changes what the model sees; confirm
    batch['processed_text'] =  prompt + "We are team " + batch['team'] + "\n." + "The following text describes our activities in a scientific competition called iGEM including our relationships with other teams:" +  batch['text']

    #Group the rows into requests of at most `jump` prompts; i is the offset of the group within the batch
    all_messages = []

    for i in range(0, len(batch), jump):

        part = batch.iloc[i:i+jump]
        all_messages.append([list(part['processed_text']), {"source_team": list(part['team']), "year": list(part['year'])}, i])

    #After the new updates - parallel processing seems to falter with the complete endpoint, with many chunks not executed
    #(the earlier variant mapped send_batch_request over all_messages with Pool(processes = 8)), hence the sequential loop
    outputs = []

    for message in all_messages:

        #Pass the configured model and temperature explicitly so the requests always agree with the output file name
        output, miss = send_batch_request(message, model = model, temperature = temperature)
        print(miss)

        #Failed groups (miss == 0) come back as empty frames and are left out
        if miss:
            outputs.append(output)

        time.sleep(0.01)

    #DataFrame.append was removed in pandas 2.0
    df = pd.concat(outputs) if outputs else pd.DataFrame()

    end = time.time()
    time_elapsed = (end-start)

    stats = pd.concat([stats, pd.DataFrame([[batch_no, time_elapsed]], columns=["batch_no", "time_secs"])])

    print("Completed batch " + batch_no + " in " + str(time_elapsed) + " seconds")

    if outputs:
        df.to_csv(output_path, index=False)
    else:
        #Writing an empty file would mark the batch as done for the resume check above
        print("No chunk group of batch " + batch_no + " succeeded - nothing written, the batch is retried on the next run")

    time.sleep(1)
        

Started batch 1
1
1
0
1
1
Completed batch 1 in 49.08243250846863 seconds
Started batch 2
1
1
1
1
0
Completed batch 2 in 24.289158582687378 seconds


relationships: [["CCU_Taiwan","collaborated","BIT iGEM Team"],["CCU_Taiwan","collaborated","MINGDAO iGEM team"],["CCU_Taiwan","exchanged ideas with","BIT iGEM Team"],["CCU_Taiwan","exchanged ideas with","MINGDAO iGEM team"],["BIT iGEM Team","provided suggestions to","CCU_Taiwan"],["MINGDAO iGEM team","held debate with","CCU_Taiwan"],["MINGDAO iGEM team","provided venue for debate","CCU_Taiwan"]]

398

In [None]:
#Persist the per-batch timings, placed after the rows an earlier run already stored in the same file.
#NOTE(review): this directory differs from the gpt_curated/inter-team/ directory used for the results - confirm
#NOTE(review): stats is rebound to old + new rows, so running this cell twice stores the new rows twice
fn = "../data/processed/curated/fuzzy_select_inter_team_raw/stats_" + version + "_" + output_format + "_" + str(temperature).replace(".","-") + ".csv"

#to_csv does not create missing directories
os.makedirs(os.path.dirname(fn), exist_ok = True)

if os.path.isfile(fn):

    stats_old = pd.read_csv(fn)
    #DataFrame.append was removed in pandas 2.0; concat keeps the stored rows first, as before
    stats = pd.concat([stats_old, stats])

stats.to_csv(fn, index = False)