In [29]:
import time
import os
from dotenv import load_dotenv
import openai
import tiktoken
import cleantext
import json
import sys
import pandas as pd
import numpy as np
import ast
from fuzzywuzzy import fuzz
from nltk.corpus import stopwords
from multiprocessing import Pool
import re
from nltk.tokenize import word_tokenize

#from ipynb.fs.full.gpt_curation_custom_functions import string_process_no_stp_words, process_team_names
#from ipynb.fs.full.gpt_curation_custom_functions import fuzzy_match, fuzzy_search

# Tokenizer matching the GPT-4 family; it is not referenced again in the
# visible cells (presumably kept for token counting -- TODO confirm).
tokenizer = tiktoken.encoding_for_model("gpt-4")

# Populate os.environ from a local .env file, if one is present.
load_dotenv()

# Read the API key from the environment instead of hardcoding it.
# NOTE(review): os.getenv returns None when OPENAI_API_KEY is unset, so a
# missing key only surfaces later as failed requests.

openai.api_key = os.getenv('OPENAI_API_KEY')

# Run log: the request loop adds one row per year with the columns
# "year", "time_elapsed" and "failed_chunks".
stats = pd.DataFrame()

# English stop words from NLTK; not used in the visible cells.
stp_words = stopwords.words('english')

In [30]:
def send_request_chat_competion(arguments, model = "gpt-4-0125-preview", temperature = 0.3, output_format = "lol", columns = ["relationship", "matching"]):
    """Send one chat-completion request and wrap the raw reply in a DataFrame.

    Takes a single packed argument so it can be used directly with
    ``Pool.map``.

    Parameters
    ----------
    arguments : sequence
        ``[messages, variables, chunk]`` where ``messages`` is the chat message
        list passed to the API, ``variables`` is a dict whose items are added
        as constant columns to the result, and ``chunk`` is an identifier
        handed back to the caller when the request fails.
    model : str
        Model name passed to the API.
    temperature : float
        Sampling temperature passed to the API.
    output_format : str
        Only ``"lol"`` is supported.
    columns : list
        Unused; kept for interface compatibility.

    Returns
    -------
    tuple
        ``(DataFrame, None)`` on success, where the frame has one row with the
        stripped reply in a ``"text"`` column plus one column per entry of
        ``variables``; ``(empty DataFrame, chunk)`` when the request fails.

    Raises
    ------
    ValueError
        If ``output_format`` is not ``"lol"``.
    """
    # Reject unsupported formats before spending an API call; previously this
    # case fell through to an unbound `processed_df` after the request was made.
    if output_format != "lol":
        raise ValueError("Unsupported output_format: %r (only 'lol' is implemented)" % (output_format,))

    message = arguments[0]
    variables = arguments[1]
    chunk = arguments[2]

    try:
        response = openai.ChatCompletion.create(model = model, messages = message, temperature = temperature, request_timeout = 60)
        final_response = response.choices[0].message["content"].strip()

    except Exception:
        # Best effort: report the failed chunk to the caller instead of
        # aborting the whole pool. Catching Exception (not a bare except)
        # lets KeyboardInterrupt / SystemExit still stop the run.
        return(pd.DataFrame(), chunk)

    # The reply is stored verbatim; parsing into pairs happens downstream.
    processed_df = pd.DataFrame([[final_response]], columns=["text"])

    for key, value in variables.items():
        processed_df[key] = value

    return(processed_df, None)

def process_request_regex(text, output_format = "lol", columns = ["member", "matching"]):
    """Extract ``["...", "..."]`` pairs from a model reply into a DataFrame.

    Parameters
    ----------
    text : str
        Raw reply text that should contain a list of two-element lists.
    output_format : str
        Only ``"lol"`` is supported.
    columns : list
        Column names for the two extracted values.

    Returns
    -------
    tuple
        ``(DataFrame, status)`` where ``status`` is
        ``1`` -- at least one pair was extracted,
        ``2`` -- the text contained no usable pair (empty frame),
        ``0`` -- extraction or parsing raised an error (empty frame).

    Raises
    ------
    ValueError
        If ``output_format`` is not ``"lol"``.
    """
    # Previously an unsupported format reached the return with `processed_df`
    # unbound; fail with a clear message instead.
    if output_format != "lol":
        raise ValueError("Unsupported output_format: %r (only 'lol' is implemented)" % (output_format,))

    try:
        # Each match is one bracketed pair of double-quoted strings.
        candidates = re.findall(r'\[["].*?["],[ ]?["].*?["]\]', text)
        pairs = [ast.literal_eval(candidate) for candidate in candidates]

        # The lazy regex can still span more than two quoted items; keep
        # only true pairs.
        pairs = [pair for pair in pairs if len(pair) == 2]

        if len(pairs) == 0:
            return(pd.DataFrame(), 2)

        processed_df = pd.DataFrame(pairs, columns = columns)

    except Exception:
        # Any failure (non-string input, unparsable literal, wrong column
        # count) invalidates the whole reply, as before.
        return(pd.DataFrame(), 0)

    return(processed_df, 1)

In [31]:
# Run configuration. `version`, `model` and the two temperatures are embedded
# in the input and output CSV file names used by later cells.
version = "specific"
# Not passed to the helper functions in the visible cells; they use their own
# `output_format` default. Presumably "lol" stands for "list of lists" -- TODO confirm.
output_format = "lol"
# Temperature tag of the upstream curation run; selects which curated input
# files are read.
temperature_curation = 0.3

#batches = os.listdir("../data/processed/batches_inter-team_collab/")
#batches = [batch.replace(".csv", "") for batch in batches]

# NOTE(review): the request loop calls send_request_chat_competion through
# pool.map without passing `model` or `temperature`, so the API call uses that
# function's own defaults; these two values only name the files.
#model = "gpt-3.5-turbo-16k"
model = "gpt-4-0125-preview"
#model = "gpt-4-turbo"

temperature = 0.3

# Number of phrases per request; assigned again in the request-loop cell.
breaks = 10

# Team metadata; not referenced in the visible cells.
meta = pd.read_csv("../data/raw/team_meta.csv")

#f = open("prompts/prompt_inter-team_matching_" + output_format, "r")
#prompt = f.read().rstrip() 

In [32]:
years = [2015, 2016, 2017, 2018]

# Collect the per-year frames and concatenate once. DataFrame.append was
# removed in pandas 2.0, and appending inside a loop re-copies the growing
# frame on every iteration.
yearly_frames = []

for year in years:

    df = pd.read_csv("../data/processed/gpt_curated/intra-team/processed_" + str(year) + "_" + version + "_" + model + "_" + str(temperature_curation).replace(".","-") + ".csv")

    # Only the activity text and its year are needed for category matching.
    yearly_frames.append(df[["activity", "year"]])

df_names = pd.concat(yearly_frames)
    

In [33]:
#manual = pd.read_csv("../data/processed/manually curated/random_per_year_intra_team/intra-team_manual_annotated.csv")
#df_names = manual.filter(items=['activities', 'Year'])
#df_names = df_names.rename(columns={"activities": "activity", "Year": "year"})

In [34]:
#df_interactions.to_csv("../data/processed/test_matching_activities.csv", index=False)

In [35]:
# Prompt: task description, the allowed categories and three few-shot examples.
# Every label used in the examples must match a category name from the list
# exactly, otherwise the model is taught to emit labels outside the allowed set.

prompt = """

Each phrase describes activities performed as a part of a team participating in the synthetic biology iGEM competition. 

Given a list of phrases, match each phrase with the categories listed below. Some phrases can be matched to multiple categories.

Provide the output as a list of lists - [["PHRASE", "MATCHING CATEGORY"]].

The possible categories, along with their description are:
Design: Conceptualising, doing background research and/or designing the teams' project idea.
Experiments: Performed synthetic biology experiments in the wet laboratory as a part of the teams' project.
Documentation: Creating, managing and editing the teams' wiki website, report writing and scientific documentation
Interlab: Performed the interlab study.
Modelling: Performed mathematical models, computer simulations and/or used engineering principles to model their project.
Analysis: Performed dry lab work, data curation, data analysis and/or bioinformatics as a part of the teams' project
Parts: Was responsible for creating, characterising and documenting basic or composite synthetic biology parts, also called biobricks.
Safety: Was responsible for addressing safety considerations of the teams' project.
Entrepreneurship: Was involved with building a business case and/or commercializing the teams' project.
Hardware: Worked with or built hardware components for their teams' project.
Software: Created computational tools and/or software as a part of their teams' project.
Human Practices: Was responsible to understand the ethical, legal, economic and social considerations of the teams' project. Sometimes abbreviated as 'HP'.
Public Engagement: Established a public dialogue through outreach, educational tools and/or social media to discuss their project, science and synthetic biology with people outside the lab.
Collaboration: Was responsible for collaborating with other teams participating in the iGEM competition.
Fundraising: Was responsible in fundraising and/or finding sponsors for the teams' project.
Creative Contributions: Making presentations, designing team logos and suits, creating art pieces, and producing promotional materials.
Administration: Was responsible for management, organisation and coordinating the laboratory and/or activities of the teams' project.
Material Supply: Providing laboratory space, equipment, supplies and providing technical materials or reagents
Supervision: Provided advice, feedback, support, guidance, assistance or help in various aspects of a teams' project by being a Principal Investigator (PI), advisor or instructor.
Training: Conducting educational workshops, courses and/or teaching lab techniques.
Other: For descriptions that are not matched to any of the categories above.

example: '"project conceptualization and working in the wetlab", "outreach"'
Matching: [["project conceptualization and working in the wetlab", "Design"], ["project conceptualization and working in the wetlab", "Experiments"], ["outreach","Public Engagement"]]

example: '"flux balance analysis", "Advisor for modelling tasks"'
Matching: [["flux balance analysis","Experiments"],["Advisor for modelling tasks","Supervision"], ["Advisor for modelling tasks","Modelling"]]

example: '"assisted the team on many technical concepts", "made drawn images on the wiki", "absolutely love food and sleep"'
Matching: [["assisted the team on many technical concepts", "Supervision"],["made drawn images on the wiki", "Documentation"], ["made drawn images on the wiki", "Creative Contributions"],["absolutely love food and sleep", "Other"]]

The following are the list of phrases:

"""

# Include more context for GPT to understand the iGEM-specific background
# Ask GPT to refine the prompt
# Groupings in confusion matrix - dendrogram
# Examples for each category


In [36]:
# Number of phrases sent to the model per request.
breaks = 10

# Successful replies are collected in a list and concatenated once per year.
# DataFrame.append was removed in pandas 2.0, and appending inside a loop
# re-copies the accumulated frame every time.
matching_frames = []
df_matching = pd.DataFrame()

for year in sorted(df_names['year'].unique()):

    start = time.time()
    print("Started year " + str(year))

    missed_entries = []

    year_rows = df_names[df_names['year'] == year]

    # NaN is the only value that is not equal to itself, so `x == x` drops
    # missing activities before lower-casing.
    activities = [x.lower() for x in year_rows['activity'].unique() if x == x]

    # One request per chunk of `breaks` phrases; chunk_id is the position of
    # the request in all_messages so failures can be traced back.
    all_messages = []

    for chunk_id, j in enumerate(range(0, len(activities), breaks)):

        messages = [{"role": "user", "content": prompt}]

        messages.append({"role": "user", "content": ', '.join(("\"" + x + "\"") for x in activities[j:j+breaks])})

        all_messages.append([messages, {"year": year}, chunk_id])

    with Pool(processes = 8) as pool:

        zipped_output = pool.map(send_request_chat_competion, all_messages)

    for output, miss in zipped_output:

        # A non-None `miss` is the chunk id of a failed request; keep its
        # messages so the failure is recorded in `stats`.
        if miss is not None:
            missed_entries.extend(all_messages[miss][0])

        if not output.empty:
            matching_frames.append(output)

    # Refresh the combined frame after every year so partial results are
    # available if the run is interrupted.
    if matching_frames:
        df_matching = pd.concat(matching_frames)

    end = time.time()
    time_elapsed = (end-start)
    print("finished year " + str(year) + " in " + str(time_elapsed) + " seconds")

    year_stats = pd.DataFrame([[year, time_elapsed, str(missed_entries)]], columns=["year", "time_elapsed", "failed_chunks"])
    stats = year_stats if stats.empty else pd.concat([stats, year_stats])

    # Short pause between years before the next burst of parallel requests.
    time.sleep(1)
    

Started year 2015
finished year 2015 in 42.650538206100464 seconds
Started year 2016
finished year 2016 in 63.40220403671265 seconds
Started year 2017
finished year 2017 in 40.27959966659546 seconds
Started year 2018
finished year 2018 in 57.97981309890747 seconds


In [37]:
columns = ["Activity", "Matching"]

# Parse every raw reply into (Activity, Matching) rows tagged with its year.
# The extraction is delegated to process_request_regex, which implements the
# same regex + literal_eval logic, and the pieces are concatenated once
# because DataFrame.append was removed in pandas 2.0.
parsed_frames = []

for i in range(len(df_matching)):

    parsed, status = process_request_regex(df_matching['text'].iloc[i], columns = columns)

    # Status 1 means at least one pair was extracted; 0 (parse error) and
    # 2 (no pairs found) contribute nothing.
    if status != 1:
        continue

    parsed['year'] = df_matching['year'].iloc[i]
    parsed_frames.append(parsed)

df = pd.concat(parsed_frames) if parsed_frames else pd.DataFrame()

In [38]:
#df.to_csv("../data/processed/curated/intra-team_verify/relationship_matching_manual.csv", index = False)

In [39]:
#df_matching.to_csv("../data/processed/curated/intra-team_verify/raw_relationship_matching_manual.csv", index = False)

In [40]:
# Both exports go to the same directory and share one run-identifying suffix
# (version, model, request temperature, curation temperature); only the
# "processed_" / "raw_" prefix differs.
output_dir = "../data/processed/gpt_curated/intra-team/"
run_suffix = "_".join([
    version,
    model,
    str(temperature).replace(".","-"),
    str(temperature_curation).replace(".","-"),
]) + ".csv"

# Parsed pairs first, then the unparsed model replies.
for frame, prefix in ((df, "processed_relationship_matching_"), (df_matching, "raw_relationship_matching_")):
    frame.to_csv(output_dir + prefix + run_suffix, index=False)