In [7]:
import os
from dotenv import load_dotenv
load_dotenv()
import openai
import json
import ast
import time
from tqdm import tqdm
import re
import datetime

In [2]:
# Set openai.api_key to the OPENAI environment variable
openai.api_key = os.environ["api_key"]

# Things to Note

- Replace the key with your own key
- Note that free api key is only limited to 3 calls per minute

# Extract Education Background from Biography

- system_msg and prompt to instruct LLM to return information in dictionary format

In [5]:
def extract_education_background(text):
    system_msg = "I am an AI assistant designed to help you extract educational background information from a given text. \
    I understand that the text contains a biography and I am capable of identifying and extracting details about the person's education, \
    including their Bachelor's, Master's, and PhD degrees. I also understand that after extracting the following educational information from the text\
    I have to dispaly it in the following format: {'Bachelor Degree': 'Institution', 'Master Degree': 'Institution', 'PhD': 'Institution'}. \
    If there is no information about a particular degree in the text, please return None for the corresponding key."

    prompt = f"""
    Based on the biography below, tell me where did this person got his Bachelor's Degree, Master's Degree and PhD from. Return the results strictly in a dictionary format as such, {{"Bachelor Degree": "Institution", "Master Degree": "Institution", "PhD": "Institution"}}. If there is no information or if information is ambiguous about a particular degree in the text, please return "None" for the corresponding key. Also please make sure key and value is between double quote "string".

    Biography:

    {text}
    """
    response = openai.ChatCompletion.create(
            model="gpt-3.5-turbo",
            messages=[{"role": "system", "content": system_msg },
                    {"role": "user", "content": prompt}
            ],
            temperature=0)
    return response['choices'][0]['message']['content']

In [222]:
text =  "Jie Zhang is a Professor of the School of Computer Science and Engineering at NTU Singapore, leading the Computational Intelligence Group. He obtained Ph.D. in Cheriton School of Computer Science from University of Waterloo and was the recipient of Alumni Gold Medal in 2009. Then he joined NTU as an Assistant Professor and was promoted to Associate Professor in 2015. From 2017-2018, he was appointed as Tan Chin Tuan Exchange Fellowship, New York University. He was also an Adjunct Fellow, Singapore Institute of Manufacturing Technology (SIMTech), A*STAR, from 2020-2021. His papers have been published by top journals and conferences and won several best paper awards. Jie Zhang is also active in serving research communities."

response = extract_education_background(text=text)


In [223]:
print(response)

{"Bachelor Degree": "None", "Master Degree": "None", "PhD": "University of Waterloo"}


In [None]:
raw_dir = './prof_raw_data/'
prefix = 'education'
gpt_dir = './gpt_data/'

dr_ntu_files = [os.path.join(raw_dir, f) for f in os.listdir(raw_dir) if os.path.isfile(os.path.join(raw_dir, f)) and 'dr_ntu' in f]

for file in tqdm(dr_ntu_files[82:],position=0,leave=True):
    with open(file,'r') as f:
        prof_dict = json.load(f)
    biography = prof_dict['biography']
    name = prof_dict['full_name'].lower().replace(' ','_')
    # pass in biography to gpt3.5
    education_detail_str = extract_education_background(biography)
    #convert string to dictionary
    education_detail_dict = ast.literal_eval(education_detail_str.strip())
    for key,value in education_detail_dict.items():
        if value =="None":
            education_detail_dict[key] = None
    with open(f"{gpt_dir}{prefix}_{name}.json",'w') as f:
        json.dump(education_detail_dict,f)


# Extract Recent Research Interest

- For each Professor, pass in last 3 years worth of paper to chatgpt and return a topic for each publication

In [3]:
def extract_research_interest(title,description,temp):
    system_msg = "I am an AI assistant designed to help you identify the field of research for a given publication based on its text description and title. \
    I understand that the text and title contains important information regarding the field of research for that publication and I will use those to help me identify a single research topic for that publication from a list of possible research topic.\
    I understand that the research topic that I return can only come from the following list; Artificial Intelligence,Machine Learning,Federated Learning,Reinforcement Learning,Multimodal learning,Natural Language Processing,Cybersecurity,Deep Learning,Quantum Computing,Computer Vision,Blockchain Technology,Internet of Things,Robotics,Human-Computer Interaction,Data-Mining\
    The research topic should be None if there is not enough information to determine which research topic it fall under in the list. \
    I understand that the response has to be between two tags, <answer> followed by </answer>, below is an example of what i should return. \
    <answer>Artificial Intelligence</answer>"
    
    prompt = f"""
    Based on the Title and Description of the publication provided, choose a research topic from the following list that best matches the publication. 
    Artificial Intelligence,Machine Learning,,Federated Learning,Reinforcement Learning,Natural Language Processing,Multimodal learning,Cybersecurity,Deep Learning,Quantum Computing,Computer Vision,Blockchain Technology,Internet of Things,Robotics,Human-Computer Interaction,Data-Mining
    The research topic should be None if there is not enough information to determine which research topic it fall under in the list.
    Your response has to be between two tags, <answer> followed by </answer>, below is an example of what you should return.
    <answer>Artificial Intelligence</answer>. 

    Title: {title}
    Description :{description}
    """
    while True:
        try:
                response = openai.ChatCompletion.create(
                        model="gpt-3.5-turbo",
                        messages=[{"role": "system", "content": system_msg },
                                {"role": "user", "content": prompt}
                        ],
                        temperature=temp,
                        request_timeout=15)
                
                return response['choices'][0]['message']['content']
        except:
             pass

In [4]:
raw_dir = './prof_raw_data/'
prefix = 'interest_'
gpt_dir = './gpt_data/'

goog_sch_files = [os.path.join(raw_dir, f) for f in os.listdir(raw_dir) if os.path.isfile(os.path.join(raw_dir, f)) and 'goog_sch' in f]

In [6]:
topic_list= ['Artificial Intelligence','Machine Learning','Natural Language Processing','Cybersecurity','Quantum Computing','Computer Vision','Blockchain Technology','Internet of Things','Robotics','Human-Computer Interaction','Data-Mining','None']
for filepath in tqdm(goog_sch_files[14:],position=0,leave=True):
    filename = filepath[25:-5]
    with open(filepath,'r')as f:
        profile = json.load(f)
    
    # Initialise dictionary of list to store recent field of research for each author
    research_interests = {'interests':[]}

    if 'articles' in profile:
        articles = profile['articles']

        # append recent_articles from last three years
        cur_year = datetime.date.today().year
        recent_articles = []
        for article in articles:
            if 'publication_date' in article:
                if int(article['publication_date'].split('/')[0])>=(cur_year-2):
                    recent_articles.append(article)

        # For each recent_article, pass in title and description as inputs to gpt3.5
        for recent_article in recent_articles:
            temp =0 
            # loop until gpt3.5 returns a output in the pre-defined format
            while True:
                # check if title and description exist for current article
                if 'title' in recent_article and 'description' in recent_article:
                    response = extract_research_interest(recent_article['title'],recent_article['description'],temp)
                    # extract the answer between the tags <answer> </answer>
                    research_topic = re.search(r"(?<=<answer>).*?(?=</answer>)",response)
                    if research_topic:
                        temp=0
                        research_interests['interests'].append(research_topic[0])
                        break
                    # increase temperature if output is not in valid format
                    else:
                        temp+=0.1
                        print(response)
                else:
                    #skip the current article if no title or description
                    break        
        
    with open(f"{gpt_dir}{prefix}{filename}.json",'w') as f:
        json.dump(research_interests,f)
                

100%|██████████| 72/72 [2:45:17<00:00, 137.74s/it]  
