In [1]:
import os
import ast
import uuid
import pandas as pd
from model import *
from prompt import *
from batch import *
from generate_json import *

### Generate author information

In [2]:
df = pd.read_csv('cleaned_papers.csv')

def extract_institutions(row):
    parts = row.split('|')
    affliation_countries = []
    affliation_insitutions = []
    affliation_types = []
    for part in parts:
        try:
            data = ast.literal_eval(part)
            affliation_countries.append([item['country_code'] for item in data][0])
            affliation_insitutions.append([item['display_name'] for item in data][0])
            affliation_types.append([item['type'] for item in data][0])
        except:
            pass
    return affliation_countries, affliation_insitutions, affliation_types


def extract_departments(row):
    departments = []
    parts = row.split('|')
    for part in parts:
        try:
            institutions = ast.literal_eval(part)
            institution = institutions[0]
            department = institution.split(',')[0]
            departments.append(department)
        except:
            pass
    return departments


df['authorships.institutions'] = df['authorships.institutions'].astype(str)
df['affiliation_countries'], df['affiliation_institutions'], df['affiliation_types'] = zip(*df['authorships.institutions'].apply(extract_institutions))
df['authorships.raw_affiliation_strings'] = df['authorships.raw_affiliation_strings'].astype(str)
df['author_raw_affiliation'] = df['authorships.raw_affiliation_strings'].apply(extract_departments)

all_departments = set(discipline for disciplines in df['author_raw_affiliation'] for discipline in disciplines)
department_df = pd.DataFrame(all_departments, columns=['raw_affiliation'])
department_df['id'] = [str(uuid.uuid4()) for _ in range(len(department_df))]

department_df = department_df[['id', 'raw_affiliation']]
department_df.to_csv('author_affiliations.csv', index=False)

  df = pd.read_csv('cleaned_papers.csv')


### Read file and generate prompts

In [4]:
filepath = 'author_affiliations.csv'
filename = filepath.split('.')[0]

df = pd.read_csv(filepath)
df['raw_affiliation'] = df['raw_affiliation'].astype(str)
df = df.dropna(subset=['raw_affiliation'])
df['prompt'] = df.apply(create_discipline_prompt, axis=1)
df.head(5)

Unnamed: 0,id,raw_affiliation,prompt
0,4987f933-26d1-431b-b699-eaf635d33b15,,Please extract the discipline-related informat...
1,4bf4d451-8a9f-4bd8-b642-cc6a27c6f655,Morsani College of Medicine,Please extract the discipline-related informat...
2,b52bf1d6-4c65-4c09-a91b-c742fd7d76b5,UCSF Cardiovascular Research Institute,Please extract the discipline-related informat...
3,c7b59df4-1192-42d0-830e-d4e519c5392a,Albert Einstein College of Medicine,Please extract the discipline-related informat...
4,f0f786e9-7cde-43dd-9ac6-1e1c0a10a64d,School of Information Science and Engineering ...,Please extract the discipline-related informat...


In [5]:
# this is a test
prompt = df['prompt'].iloc[50]
print(prompt)
print(call_gpt4(prompt))

Please extract the discipline-related information from the following author's affiliation: Department of Molecular Medicine and Medical Biotechnology
For example, for 'Department of Biological Science, Joseph Ayo Babalola University, Nigeria', return 'Biological Science'. If it is not written in English, translate it to English and return. If you cannot identify any discipline-related information, return 'None'. Please do not return other output or explanation. 

Molecular Medicine and Medical Biotechnology


### Generate chunked jsonl file for batch input

In [6]:
def process_df_in_chunks(df, model_name, custom_id_column, prompt_column, sav_dir, save_filename, chunk_size):
    num_chunks = (len(df) + chunk_size - 1) // chunk_size
    for i in range(num_chunks):
        start = i * chunk_size
        end = min((i + 1) * chunk_size, len(df))
        chunk = df.iloc[start:end]
        save_file = os.path.join(sav_dir, f'{save_filename}_{i+1}.jsonl')
        generate_jsonl(chunk, model_name, custom_id_column, prompt_column, save_file)
        print(f"Generated {save_file}")

sav_dir = filename
save_filename = filename
if not os.path.exists(sav_dir):
    os.makedirs(sav_dir, exist_ok=True)

model_name = "gpt-4o"
custom_id_column = 'id'
prompt_column = 'prompt'
process_df_in_chunks(df, model_name, custom_id_column, prompt_column, sav_dir, save_filename, chunk_size=200)

Data has been written to author_affiliations/author_affiliations_1.jsonl
Generated author_affiliations/author_affiliations_1.jsonl
Data has been written to author_affiliations/author_affiliations_2.jsonl
Generated author_affiliations/author_affiliations_2.jsonl
Data has been written to author_affiliations/author_affiliations_3.jsonl
Generated author_affiliations/author_affiliations_3.jsonl
Data has been written to author_affiliations/author_affiliations_4.jsonl
Generated author_affiliations/author_affiliations_4.jsonl
Data has been written to author_affiliations/author_affiliations_5.jsonl
Generated author_affiliations/author_affiliations_5.jsonl
Data has been written to author_affiliations/author_affiliations_6.jsonl
Generated author_affiliations/author_affiliations_6.jsonl
Data has been written to author_affiliations/author_affiliations_7.jsonl
Generated author_affiliations/author_affiliations_7.jsonl
Data has been written to author_affiliations/author_affiliations_8.jsonl
Generated 

### Process batch input to GPT

In [7]:
openai_key = get_openai_key(SECRET_FILE)
processor = OpenAIBatchProcessor(openai_key)

endpoint = "/v1/chat/completions"
completion_window = "24h"

read_dir = filename
save_dir = filename + "_response"

if not os.path.exists(save_dir):
    os.makedirs(save_dir, exist_ok=True)

read_files = os.listdir(read_dir)
read_files = [file for file in read_files if file.endswith('jsonl')]
read_files = sorted(read_files, key=lambda x: int(x.split('_')[-1].split('.')[0]))

for file in read_files:
    read_file = os.path.join(read_dir, file)
    save_file = os.path.join(save_dir, file)
    results = processor.process_batch(read_file, endpoint, completion_window, save_file)
    print(f"Processed {read_file} and saved to {save_file}")

Processed author_affiliations/author_affiliations_1.jsonl and saved to author_affiliations_response/author_affiliations_1.jsonl
Processed author_affiliations/author_affiliations_2.jsonl and saved to author_affiliations_response/author_affiliations_2.jsonl
Processed author_affiliations/author_affiliations_3.jsonl and saved to author_affiliations_response/author_affiliations_3.jsonl
Processed author_affiliations/author_affiliations_4.jsonl and saved to author_affiliations_response/author_affiliations_4.jsonl
Processed author_affiliations/author_affiliations_5.jsonl and saved to author_affiliations_response/author_affiliations_5.jsonl
Processed author_affiliations/author_affiliations_6.jsonl and saved to author_affiliations_response/author_affiliations_6.jsonl
Processed author_affiliations/author_affiliations_7.jsonl and saved to author_affiliations_response/author_affiliations_7.jsonl
Processed author_affiliations/author_affiliations_8.jsonl and saved to author_affiliations_response/auth

### Extract content

In [8]:
files = os.listdir(save_dir)
files = [file for file in files if file.endswith('jsonl')]
files = sorted(files, key=lambda x: int(x.split('_')[-1].split('.')[0]))

data = []

for file in files:
    read_file = os.path.join(save_dir, file)
    with open(read_file, 'r', encoding='utf-8') as f:
        for line in f:
            json_obj = json.loads(line.strip())
            try:
                custom_id = json_obj['custom_id']
                content = json_obj['response']['body']['choices'][0]['message']['content']
                data.append({'custom_id': custom_id, 'content': content})
            except KeyError:
                continue

df = pd.DataFrame(data)
df = df.drop_duplicates(subset=['custom_id'])
print(len(df))
df.head(5)

25541


Unnamed: 0,custom_id,content
0,4987f933-26d1-431b-b699-eaf635d33b15,
1,4bf4d451-8a9f-4bd8-b642-cc6a27c6f655,Medicine
2,b52bf1d6-4c65-4c09-a91b-c742fd7d76b5,Cardiovascular Research
3,c7b59df4-1192-42d0-830e-d4e519c5392a,
4,f0f786e9-7cde-43dd-9ac6-1e1c0a10a64d,Information Science and Engineering


In [9]:
df1 = pd.read_csv(filepath)
df1 = df1.drop_duplicates(subset=['id'])
df2 = df.merge(df1, left_on='custom_id', right_on='id', how='left')
df2 = df2.drop(columns=['custom_id'])
df2.head(5)

Unnamed: 0,content,id,raw_affiliation
0,,4987f933-26d1-431b-b699-eaf635d33b15,
1,Medicine,4bf4d451-8a9f-4bd8-b642-cc6a27c6f655,Morsani College of Medicine
2,Cardiovascular Research,b52bf1d6-4c65-4c09-a91b-c742fd7d76b5,UCSF Cardiovascular Research Institute
3,,c7b59df4-1192-42d0-830e-d4e519c5392a,Albert Einstein College of Medicine
4,Information Science and Engineering,f0f786e9-7cde-43dd-9ac6-1e1c0a10a64d,School of Information Science and Engineering ...


In [10]:
output_filepath = filename + '.json'
data_dict = pd.Series(df2.content.values, index=df2.raw_affiliation).to_dict()

with open(output_filepath, 'w') as json_file:
    json.dump(data_dict, json_file, indent=4)