In [6]:
import time
import os
from dotenv import load_dotenv
import openai
import tiktoken
import cleantext
import json
import sys
import pandas as pd
import ast
from fuzzywuzzy import fuzz
from nltk.corpus import stopwords
from unidecode import unidecode
from multiprocessing import Pool
import re
from nltk.tokenize import word_tokenize


# Tokenizer for the GPT-4 model family; not referenced by the other cells visible in this notebook.
tokenizer = tiktoken.encoding_for_model("gpt-4")

# Presumably loads variables from a local .env file into the process environment
# so that the key lookup below can resolve -- TODO confirm where the .env lives.
load_dotenv()

# Read the OpenAI API key from the OPENAI_API_KEY environment variable.
# os.getenv returns None when the variable is unset, so a missing key is not detected here.

openai.api_key = os.getenv('OPENAI_API_KEY')

# Run statistics; the request loop further down adds one row per processed year.
stats = pd.DataFrame()

# English stop words consumed by string_process_no_stp_words
# (presumably requires the NLTK "stopwords" corpus to be downloaded -- verify).
stp_words = stopwords.words('english')


In [7]:
def send_request_chat_competion(arguments, model = "gpt-4-turbo", temperature = 0.3, output_format = "lol", columns = ["team", "matching"]):
    """Send one chat-completion request and wrap the raw reply in a DataFrame.

    The three per-request values are packed into a single ``arguments``
    sequence so the function can be mapped over a worker pool.

    Parameters
    ----------
    arguments : sequence
        ``(messages, variables, chunk)``: the chat message list passed to the
        API, a dict of constant columns to attach to the result, and the
        caller's identifier for this request.
    model : str
        Model name forwarded to ``openai.ChatCompletion.create``.
    temperature : float
        Sampling temperature forwarded to the API.
    output_format : str
        Only ``"lol"`` (list of lists) is supported.
    columns : list
        Unused; kept so existing callers keep working.

    Returns
    -------
    tuple
        ``(DataFrame, None)`` on success, where the one-row frame holds the
        stripped reply in column ``text`` plus one column per key of
        ``variables``; ``(empty DataFrame, chunk)`` when the request failed.

    Raises
    ------
    ValueError
        If ``output_format`` is not supported. Raised before the API is called
        so an unsupported format never costs a request.
    """

    if output_format != "lol":
        raise ValueError("unsupported output_format: " + repr(output_format))

    message = arguments[0]
    variables = arguments[1]
    chunk = arguments[2]

    try:
        response = openai.ChatCompletion.create(model = model, messages = message, temperature = temperature, request_timeout = 60)
        final_response = response.choices[0].message["content"].strip()

    except Exception as error:
        # Best effort: hand the chunk id back so the caller can record the
        # failure, but say why it failed. Catching Exception (not a bare
        # except) lets KeyboardInterrupt / SystemExit stop the run.
        print("chat completion failed for chunk " + str(chunk) + ": " + repr(error), file = sys.stderr)
        return(pd.DataFrame(), chunk)

    # The reply is stored verbatim; parsing into pairs happens in a later step.
    processed_df = pd.DataFrame([[final_response]], columns=["text"])

    for key, value in variables.items():
        processed_df[key] = value

    return(processed_df, None)


def string_process_no_stp_words(name):
    """Normalise a name and derive stop-word-free variants of it.

    Commas and slashes are replaced by spaces; underscores and hyphens are
    left untouched.

    Parameters
    ----------
    name : str
        Raw name.

    Returns
    -------
    tuple of str
        ``(cleaned, cleaned_lower, without_stop_words, abbreviation)`` where
        ``without_stop_words`` keeps the original casing of the surviving
        tokens and ``abbreviation`` joins their first characters.
    """

    processed_name = name.strip().replace(",", " ").replace("/", " ")
    processed_name_lower = processed_name.lower()

    # The stop-word list is lowercase, so compare case-insensitively; otherwise
    # capitalised stop words ("The", "Of") slip through. A set gives O(1) lookups.
    stop_set = set(stp_words)
    kept_tokens = [word for word in word_tokenize(processed_name) if word.lower() not in stop_set]

    abbr = "".join(word[0] for word in kept_tokens)
    processed_sans_sw = " ".join(kept_tokens)

    return(processed_name, processed_name_lower, processed_sans_sw, abbr)


def process_team_names(name):
    """Return the cleaned form of ``name``, keeping its original casing.

    The cleaning is the first element returned by
    ``string_process_no_stp_words``. The result is ``None`` when ``name`` is
    empty or when nothing is left after cleaning.
    """

    # Nothing to clean.
    if len(name) == 0:
        return None

    # Casing is kept on purpose: it carries information about proper nouns.
    cleaned = string_process_no_stp_words(name)[0]

    if len(cleaned) == 0:
        return None

    return cleaned.strip()
        
        
def process_request_regex(text, output_format = "lol", columns = ["member", "matching"]):
    """Extract ``["a", "b"]`` pairs from a model reply into a DataFrame.

    Every substring that looks like a two-string list is located with a regex
    and parsed with ``ast.literal_eval``; parsed items that do not have exactly
    two elements are discarded.

    Parameters
    ----------
    text : str
        Raw reply text.
    output_format : str
        Only ``"lol"`` (list of lists) is supported.
    columns : list
        Column names of the resulting frame.

    Returns
    -------
    tuple
        ``(DataFrame, 1)`` when at least one pair was parsed,
        ``(empty DataFrame, 2)`` when no pair was found, and
        ``(empty DataFrame, 0)`` when parsing failed.

    Raises
    ------
    ValueError
        If ``output_format`` is not supported.
    """

    if output_format != "lol":
        raise ValueError("unsupported output_format: " + repr(output_format))

    try:

        candidates = re.findall(r'\[["].*?["],[ ]?["].*?["]\]', text)
        pairs = [ast.literal_eval(candidate) for candidate in candidates]

        # A lazy match can swallow more than two strings; keep true pairs only.
        pairs = [pair for pair in pairs if len(pair) == 2]

        if len(pairs) == 0:
            return(pd.DataFrame(), 2)

        processed_df = pd.DataFrame(pairs, columns = columns)

    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
        # These are the failures literal_eval and the frame construction can
        # raise on malformed input; one bad candidate rejects the whole reply.
        return(pd.DataFrame(), 0)

    return(processed_df, 1)


In [8]:
# Run configuration. `version`, `model` and the two temperatures are embedded
# in the file names that the later cells read and write.
version = "specific"
output_format = "lol"
temperature_curation = 0.3

#batches = os.listdir("../data/processed/batches_inter-team_collab//")
#batches = [batch.replace(".csv", "") for batch in batches]

model = "gpt-4-turbo"
#model = "gpt-4-0125-preview"
#model = "gpt-3.5-turbo-16k"

temperature = 0.3

# Only referenced by a commented-out output path further down.
breaks = 10

meta = pd.read_csv("../data/raw/team_meta.csv")
roster = pd.read_csv("../data/raw/teams_info_members_db.tsv", sep="\t")

# Integers, not strings: the later cells compare these against the numeric
# `year` / `Year` columns, which string years would never match.
years = [2015, 2016, 2017, 2018]
#years = meta['Year'].unique()
#years = sorted(years)

#f = open("prompts/prompt_intra-team_matching_" + output_format, "r")
#prompt = f.read().rstrip() 

In [9]:
# Collect the curated (member, team, year) rows of every year into `df_names`.
years = [2015, 2016, 2017, 2018]

# Frames are gathered in a list and concatenated once: DataFrame.append was
# removed in pandas 2.0 and re-copied the whole frame on every call.
curated_frames = []

for year in years:

    df = pd.read_csv("../data/processed/gpt_curated/intra-team/processed_" + str(year) + "_" + version + "_" + model + "_" + str(temperature_curation).replace(".","-") + ".csv")

    curated_frames.append(df[["member", "team", "year"]])

# pd.concat raises on an empty list, so fall back to an empty frame.
df_names = pd.concat(curated_frames) if curated_frames else pd.DataFrame()
    

In [10]:
# Instruction message sent ahead of the two name lists. The worked example must
# itself be a well-formed list of lists, because the reply is parsed as Python
# literals downstream and the model tends to imitate the example verbatim.
prompt = """List 1 contains the names of members of a team. List 2 contains a list of names which may or may not be a member of the team.
Find the closest match of each name in list 2 with list 1. If there is no possible good match - please mention 'other'.
         
Provide the output as a list of lists: [["name from the list 2", "closest match from list 1"]]         
            
example: 
List 1: "'Andrew Scott', 'Thomas Alves', 'Min Jang'"
List 2: "'Andrew', 'Tommy Alves', 'Charles Jang', 'Sue Perkins'"
matches: [["Andrew", "Andrew Scott"],["Tommy Alves", "Thomas Alves"],["Charles Jang", "Min Jang"], ["Sue Perkins","other"]]

Please make sure there is no extra text or explanation before or after the formatted output."""

In [11]:
# For every year and team, ask the model to match the names that are not
# already on the official roster against the roster, one request per team.
# Raw replies accumulate in `df_matching`; timing and failures in `stats`.
df_matching = pd.DataFrame()

# Replies are gathered in a list and concatenated, because DataFrame.append
# was removed in pandas 2.0.
matching_frames = []

for year in years:

    start = time.time()
    print("Started year " + str(year))

    missed_entries = []

    temp = df_names[(df_names['year'] == year)]

    teams = temp['team'].unique()

    all_messages = []

    for team in teams:

        # dropna: unidecode cannot handle missing values.
        members = list(roster['UserName'][(roster['Year'] == year) & (roster['Team'] == team)].dropna().unique())
        members = [unidecode(x) for x in members]

        names = list(temp['member'][temp['team'] == team].dropna().unique())
        names = [unidecode(x) for x in names]
        names = [process_team_names(x) for x in names if x not in members]
        # process_team_names yields None / "" for names that clean to nothing;
        # those cannot be quoted below and carry no information.
        names = [x for x in names if x]

        if not names:
            # Every name is already an exact roster member: no request needed.
            continue

        messages = [{"role": "user", "content": prompt}]
        messages.append({"role": "user", "content": "List 1:"})
        messages.append({"role": "user", "content": ", ".join(("\"" + x + "\"") for x in members)})
        messages.append({"role": "user", "content": "List 2:"})
        messages.append({"role": "user", "content": ", ".join(("\"" + x + "\"") for x in names)})

        # The chunk id is the position in all_messages so a failure can be
        # looked up again after the pool returns.
        all_messages.append([messages, {"year": year, "team": team}, len(all_messages)])

    with Pool(processes = 8) as pool:

        # starmap forwards the configured model and temperature instead of
        # silently relying on the function's defaults.
        zipped_output = pool.starmap(send_request_chat_competion, [(arguments, model, temperature) for arguments in all_messages])

    for output, miss in zipped_output:

        if miss is not None:
            missed_entries.extend(all_messages[miss][0])
        else:
            matching_frames.append(output)

    # Refreshed every year so partial results survive an interrupted run.
    if matching_frames:
        df_matching = pd.concat(matching_frames)

    end = time.time()
    time_elapsed = (end-start)
    print("finished year " + str(year) + " in " + str(time_elapsed) + " seconds")

    year_stats = pd.DataFrame([[year, time_elapsed, str(missed_entries)]], columns=["year", "time_elapsed", "failed_chunks"])
    stats = year_stats if stats.empty else pd.concat([stats, year_stats])
    time.sleep(1)
    

Started year 2015
finished year 2015 in 23.031932830810547 seconds
Started year 2016
finished year 2016 in 25.827670097351074 seconds
Started year 2017
finished year 2017 in 14.734761714935303 seconds
Started year 2018
finished year 2018 in 16.4371919631958 seconds


In [14]:
# Persist the raw model replies; the file name encodes the run configuration.
df_matching.to_csv(
    "../data/processed/gpt_curated/intra-team/raw_member_matching_"
    + "_".join([
        version,
        model,
        str(temperature).replace(".","-"),
        str(temperature_curation).replace(".","-"),
    ])
    + ".csv",
    index = False,
)
#stats.to_csv("../data/processed/curated/fuzzy_select_inter_team_raw/stats_team_name_matchings_new_" + str(breaks) + "_" + version + "_" + output_format + "_" + str(temperature_curation).replace(".","-") + ".csv", index = False)

In [15]:
# Parse every stored reply into (member, matching) pairs tagged with the
# year and team of the request, and record the parse status of each row.
df = pd.read_csv("../data/processed/gpt_curated/intra-team/raw_member_matching_" + version + "_" + model + "_" + str(temperature).replace(".","-")+ "_" + str(temperature_curation).replace(".","-") + ".csv")

# Results are gathered in lists and assembled once: DataFrame.append was
# removed in pandas 2.0 and re-copied the whole frame on every call.
interaction_frames = []
status_records = []

for loop_var, (reply_text, reply_year, reply_team) in enumerate(zip(df['text'], df['year'], df['team'])):

    # flag is the status returned by process_request_regex:
    # 1 = pairs parsed, 2 = no pairs found, 0 = parsing failed.
    pairs, flag = process_request_regex(str(reply_text))

    if (len(pairs) > 0):

        pairs['year'] = reply_year
        pairs['team'] = reply_team

        interaction_frames.append(pairs)

    status_records.append({"row_no": loop_var, "status": flag})

# pd.concat raises on an empty list, so fall back to an empty frame.
df_interactions = pd.concat(interaction_frames) if interaction_frames else pd.DataFrame()
stats_processing = pd.DataFrame(status_records, columns=["row_no", "status"])

df_interactions.to_csv("../data/processed/gpt_curated/intra-team/processed_member_matching_" + version + "_" + model + "_" + str(temperature).replace(".","-")+ "_" + str(temperature_curation).replace(".","-") + ".csv", index=False)
