In [42]:
import os
import pandas as pd
import json
import numpy as np
import datetime
import ast

# Individual Profile Page

- Processes the raw data sources into the per-professor JSON files used by the Individual Profile Page

In [43]:
# directories (all relative to the notebook's working directory)
# raw inputs: scse_profiles.csv plus the goog_sch_*/dr_ntu_* JSON files
raw_dir = './prof_raw_data/'
# GPT-generated education_*/interest_* JSON files
gpt_dir = './gpt_data/'
# output location of the merged per-professor JSON files
processed_dir = './processed_data/'
# per-professor co-author network JSON files
co_author_dir = './co_author_data/'
# NOTE(review): not referenced anywhere in the visible notebook cells - confirm it is still needed
profile_image_dir = './profile_image/'

# prefixes: file names are built as <dir><prefix><name in lower_snake_case>.json
goog_prefix = 'goog_sch_'
dr_ntu_prefix = 'dr_ntu_'
education_prefix = 'education_'
interest_prefix = 'interest_'


In [44]:
# Faculty list; its 'name' column drives every per-professor file lookup below.
scse_profile = pd.read_csv('./prof_raw_data/scse_profiles.csv')

## Publications per Year

In [45]:
def calc_pubs(articles):
    """Count publications per calendar year.

    Parameters
    ----------
    articles : iterable of dict
        Article records. Only records with a 'publication_date' key are used;
        the year is the text before the first '/' of that value.

    Returns
    -------
    pub_yearly : dict
        ``{year: publication count}`` for every year from the earliest
        publication year up to the current year, in chronological order.
        Years without papers are present with a count of 0. Empty when no
        article carries a publication date.
    min_year : int or float
        Earliest publication year seen (future years included), or ``np.inf``
        when no article carries a publication date.
    """
    cur_year = datetime.date.today().year
    min_year = np.inf
    counts = {}
    for article in articles:
        if 'publication_date' not in article:
            continue
        pub_year = int(article['publication_date'].split('/')[0])
        min_year = min(min_year, pub_year)
        # A year in the future is treated as an unknown publish date and is not counted.
        if pub_year <= cur_year:
            counts[pub_year] = counts.get(pub_year, 0) + 1

    # No dated article at all: range() cannot take np.inf, so stop here
    # instead of raising TypeError.
    if min_year == np.inf:
        return {}, min_year

    # Every counted year lies in [min_year, cur_year]; years without papers get 0.
    pub_yearly = {year: counts.get(year, 0) for year in range(min_year, cur_year + 1)}
    return pub_yearly, min_year

## Citations Per Year

In [46]:
def calc_citation(articles):
    """Sum the citations received per calendar year across all articles.

    Parameters
    ----------
    articles : iterable of dict
        Article records. Only records that have both 'citation_graph'
        (a ``{year: citations}`` mapping) and 'publication_date' are used.

    Returns
    -------
    dict
        ``{year: total citations}`` for every year from the earliest
        publication year (among the records used) up to the current year, in
        chronological order. Years without citations are present with 0.
        Citations dated before that earliest year or after the current year
        are discarded. Empty when no record qualifies.
    """
    cur_year = datetime.date.today().year
    min_year = np.inf
    totals = {}
    for article in articles:
        if 'citation_graph' not in article or 'publication_date' not in article:
            continue
        pub_year = int(article['publication_date'].split('/')[0])
        min_year = min(min_year, pub_year)

        for year, citation_count in article['citation_graph'].items():
            year = int(year)
            if year <= cur_year:
                totals[year] = totals.get(year, 0) + citation_count

    # No usable article: range() cannot take np.inf, so return an empty
    # series instead of raising TypeError.
    if min_year == np.inf:
        return {}

    # Restricting to [min_year, cur_year] both zero-fills the gaps and drops
    # citations dated before the earliest publication, which cannot be real.
    return {year: totals.get(year, 0) for year in range(min_year, cur_year + 1)}

## Average Citations per Paper

In [47]:
def calc_avg_citation(pubs_yearly,citation_yearly,pub_min_year):
    """Cumulative average number of citations per paper, year by year.

    Parameters
    ----------
    pubs_yearly : dict
        ``{year: publication count}`` with integer year keys.
    citation_yearly : dict
        ``{year: citation count}`` with integer year keys.
    pub_min_year : int or float
        Earliest publication year; ``np.inf`` (or None) means there is no
        dated publication.

    Returns
    -------
    dict
        ``{year: average}`` for every year from ``pub_min_year`` up to the
        current year. Years before the first year of ``citation_yearly`` are
        0; from that year on the value is (citations to date) / (publications
        to date), both accumulated starting at that first citation year.
        Empty when there is no dated publication.
    """
    # No dated publication: there is no year range to report on.
    if pub_min_year is None or pub_min_year == np.inf:
        return {}

    cur_year = datetime.date.today().year
    # Without any citation data every year falls into the "before the first
    # citation year" bucket below and is reported as 0.
    citation_min_year = min(citation_yearly) if citation_yearly else cur_year + 1

    avg_citation = {year: 0 for year in range(pub_min_year, citation_min_year)}

    # NOTE(review): publications dated before citation_min_year are not part
    # of the denominator (same as the previous implementation) - confirm this
    # is the intended definition of "average citations per paper".
    pub_count_to_date = 0
    cite_count_to_date = 0
    for year in range(citation_min_year, cur_year + 1):
        # Running totals replace re-summing every earlier year for each year.
        pub_count_to_date += pubs_yearly.get(year, 0)
        cite_count_to_date += citation_yearly.get(year, 0)
        # Guard the division: no publication counted yet means an average of 0.
        avg_citation[year] = cite_count_to_date / pub_count_to_date if pub_count_to_date else 0

    return avg_citation

## Average Publication per Year

In [48]:
def calc_avg_pub(pubs_yearly,pub_min_year):
    """Cumulative average number of publications per year, year by year.

    Parameters
    ----------
    pubs_yearly : dict
        ``{year: publication count}`` with integer year keys.
    pub_min_year : int or float
        Earliest publication year; ``np.inf`` (or None) means there is no
        dated publication.

    Returns
    -------
    dict
        ``{year: publications to date / years elapsed since pub_min_year}``
        for every year from ``pub_min_year`` up to the current year, where
        the first year counts as one elapsed year. Empty when there is no
        dated publication.
    """
    # No dated publication: range() cannot take np.inf, so return early.
    if pub_min_year is None or pub_min_year == np.inf:
        return {}

    cur_year = datetime.date.today().year
    avg_pub = {}
    pub_count_to_date = 0
    for year in range(pub_min_year, cur_year + 1):
        # Running total replaces re-summing every earlier year for each year.
        pub_count_to_date += pubs_yearly.get(year, 0)
        avg_pub[year] = pub_count_to_date / (year + 1 - pub_min_year)

    return avg_pub

## h-Index

- Calculate All-Time h-Index over the years

In [49]:
def calc_h_index(articles):
    """Return the all-time h-index as it stood in each year.

    Only articles that have both 'citation_graph' and 'publication_date' are
    used, and articles dated after the current year are skipped. For every
    year from an article's publication year to the current year, the
    article's cumulative citation total is recorded; the h-index of a year is
    the number of articles whose total is at least their 1-based rank when
    the totals are sorted in descending order.

    Returns a ``{year: h-index}`` dict (empty when no article qualifies).
    """
    this_year = datetime.date.today().year

    # year -> cumulative citation total of every article already published by then
    totals_by_year = {}
    for paper in articles:
        if not ('citation_graph' in paper and 'publication_date' in paper):
            continue
        first_year = int(paper['publication_date'].split('/')[0])
        if first_year > this_year:
            continue
        running_total = 0
        for yr in range(first_year, this_year + 1):
            # citation_graph is keyed by the year as a string
            running_total += paper['citation_graph'].get(str(yr), 0)
            totals_by_year.setdefault(yr, []).append(running_total)

    h_index_yearly = {}
    for yr, totals in totals_by_year.items():
        ranked = sorted(totals, reverse=True)
        h_index_yearly[yr] = sum(
            1 for rank, total in enumerate(ranked, start=1) if total >= rank
        )
    return h_index_yearly

## i10-index


In [50]:
def calc_i10_index(articles):
    """Return the i10-index (articles with at least 10 citations) per year.

    Only articles that have both 'citation_graph' and 'publication_date' are
    used, and articles dated after the current year are skipped. Citations
    are accumulated from the publication year onwards, so the value for a
    year counts the articles whose citations up to and including that year
    reach 10.

    Returns a ``{year: i10-index}`` dict (empty when no article qualifies).
    """
    latest_year = datetime.date.today().year
    per_year_totals = {}

    usable = (
        entry for entry in articles
        if 'citation_graph' in entry and 'publication_date' in entry
    )
    for entry in usable:
        start_year = int(entry['publication_date'].split('/')[0])
        if start_year <= latest_year:
            accumulated = 0
            for yr in range(start_year, latest_year + 1):
                # citation_graph is keyed by the year as a string
                accumulated += entry['citation_graph'].get(str(yr), 0)
                per_year_totals.setdefault(yr, []).append(accumulated)

    return {
        yr: sum(total >= 10 for total in totals)
        for yr, totals in per_year_totals.items()
    }

# Merging all Data Sources

- gpt_data, prof_raw_data and collaboration_network data
- Save to processed_data directory

In [51]:
# Keys copied verbatim from each source into the merged profile. They are the
# same for every professor, so they are defined once, outside the loop.
goog_sch_keys = ['goog_sch_url','citation_table','citation_graph','co_authors_url']
dr_ntu_keys = ['full_name','image_path','email','name_card','designations','urls','biography','grants','patents','keywords']
# Yearly series derived from the Google Scholar article list.
graph_keys = ['pub_graph','citation_graph','h_index_graph','i10_index_graph','avg_citation_graph','avg_pub_graph']

# json.dump at the end of each iteration fails if the output directory is missing.
os.makedirs(processed_dir, exist_ok=True)

for name in scse_profile['name']:
    filename = name.lower().replace(' ','_')
    # read from raw data source and gpt data source
    with open(f"{raw_dir}{goog_prefix}{filename}.json",'r') as f:
        goog_sch_profile = json.load(f)

    with open(f"{raw_dir}{dr_ntu_prefix}{filename}.json",'r') as f:
        dr_ntu_profile = json.load(f)

    with open(f"{gpt_dir}{education_prefix}{filename}.json",'r') as f:
        education_info = json.load(f)

    with open(f"{gpt_dir}{interest_prefix}{filename}.json",'r') as f:
        research_interest = json.load(f)

    with open(f"{co_author_dir}{filename}.json",'r') as f:
        co_author_network = json.load(f)

    merged_profile = {key: dr_ntu_profile[key] for key in dr_ntu_keys}

    if goog_sch_profile:
        for key in goog_sch_keys:
            merged_profile[key] = goog_sch_profile[key]
    else:
        # No Google Scholar data: use empty placeholders. The dict is built
        # fresh for every professor so no mutable value is shared between profiles.
        merged_profile.update({
            'goog_sch_url': None,
            'citation_table': {},
            'citation_graph': {},
            'co_authors_url': [],
        })

    # The Google Scholar link lives under 'urls'; the top-level copy is dropped.
    urls = merged_profile['urls']
    if 'google_scholar' not in urls:
        urls['google_scholar'] = merged_profile['goog_sch_url']
    merged_profile.pop('goog_sch_url')
    urls.setdefault('scopus', None)
    urls.setdefault('web_of_science', None)

    # add education background information generated from chatgpt
    merged_profile['education'] = education_info

    # add recent research interest generated based on author's recent articles
    # dict.fromkeys de-duplicates while keeping first-seen order, so the output
    # is the same on every run (set() ordering of strings is not); a missing/None
    # list is treated as empty.
    raw_interests = research_interest['interests'] or []
    # remove None which is returned if gpt3.5 is unsure of field of research for a specific article and invalid outputs that are very long(sentences)
    merged_profile['interests'] = [
        interest for interest in dict.fromkeys(raw_interests)
        if interest!="None" and len(interest)<50
    ]

    # add co_author_network dictionary
    merged_profile['co_author_network'] = co_author_network

    # A missing profile (None or {}), a missing 'articles' key and an empty
    # article list all mean there is nothing to aggregate.
    articles = goog_sch_profile.get('articles') if goog_sch_profile else None
    if articles:
        # adding pre-processed information
        pub_yearly,min_year = calc_pubs(articles)
        citation_yearly = calc_citation(articles)

        merged_profile['pub_graph'] = pub_yearly
        # replaces the raw Google Scholar 'citation_graph' copied above
        merged_profile['citation_graph'] = citation_yearly
        merged_profile['h_index_graph'] = calc_h_index(articles)
        merged_profile['i10_index_graph'] = calc_i10_index(articles)
        merged_profile['avg_citation_graph'] = calc_avg_citation(pub_yearly,citation_yearly,min_year)
        merged_profile['avg_pub_graph'] = calc_avg_pub(pub_yearly,min_year)
    else:
        for key in graph_keys:
            merged_profile[key] = {}

    with open(f"{processed_dir}{filename}.json",'w') as f:
        json.dump(merged_profile,f)


# SCSE Page

- Processing the research interest labels returned by gpt
    - Merge duplicate labels, e.g. "Adversarial Attack" vs "Adversarial Attacks"
- Add recent research interests and the recent publication count, citation count and average citations per paper (the code below treats 2020 onwards as "recent") to the SCSE profile table to be displayed; h-index and i10-index are not added to this table by the cells below
- Read ./prof_raw_data/scse_profiles.csv and save the result as ./processed_data/faculty_member.csv to be displayed on the streamlit dashboard

In [52]:
# Re-read the raw faculty list so this section starts from the unmodified CSV.
scse_profile = pd.read_csv('./prof_raw_data/scse_profiles.csv')

In [53]:
# Keep only the columns shown on the dashboard. Selecting them directly makes
# the former explicit drop of 'Unnamed: 0' / 'dr_ntu_url' unnecessary and keeps
# this cell re-runnable (the drop raised KeyError once those columns were gone).
new_cols = ['name','email']
# .copy() gives an independent frame, so the column assignments made further
# down cannot be treated as writes into a slice of the loaded CSV.
scse_profile = scse_profile[new_cols].copy()

In [54]:
# Professor names in table order; the per-professor lists built below follow this order.
name_list = scse_profile['name'].tolist()

In [55]:
# Gather every distinct interest label across all professors so that
# near-duplicate labels can be spotted and merged manually.
unique_research_interest = set()
for name in name_list:
    with open(f"./gpt_data/interest_{name.lower().replace(' ','_')}.json",'r') as f:
        raw_labels = json.load(f)['interests']

    # Professors without any generated interests contribute nothing.
    if not raw_labels:
        continue

    # Skip the "None" placeholder gpt3.5 returns when it is unsure of the field
    # of research, and invalid outputs that are very long (whole sentences).
    unique_research_interest.update(
        label for label in raw_labels if label!="None" and len(label)<50
    )

## Labels to Merge
- All variations of 
    - Adversarial Attack
    - Autonomous Driving System
    - Brain Computer Interface
    - Explainable AI
    - Generative Adversarial Networks
    - Graph Convolution Networks
    - Medical Image Segmentation
    - Wireless Communication Systems

In [56]:
# Display the collected labels to decide by eye which variants should be merged.
unique_research_interest

{'Adversarial Attack',
 'Adversarial Attacks',
 'Adversarial Examples',
 'Artificial Intelligence',
 'Autonomous Driving System Testing',
 'Autonomous Driving Systems',
 'Autonomous Driving Systems (ADSs)',
 'Autonomous Vehicles',
 'Biomedical Engineering',
 'Blockchain Technology',
 'Brain Machine Interface',
 'Brain-Computer Interface',
 'Brain-Computer Interface (BCI)',
 'Cloud Computing',
 'Cloud Gaming',
 'Collaborative Filtering',
 'Computer Vision',
 'Cryptanalysis',
 'Cybersecurity',
 'Data-Mining',
 'Database Systems',
 'Deep Learning',
 'Distributed Computing',
 'Domain Confused Contrastive Learning',
 'Energy Harvesting',
 'Evolutionary Computation',
 'Explainable AI',
 'Explainable AI (XAI)',
 'Explainable AI Design',
 'Explainable Artificial Intelligence',
 'Explainable Artificial Intelligence (XAI)',
 'Fairness as Decision Rationale Alignment',
 'Fairness in Artificial Intelligence',
 'Federated Learning',
 'Game Theory',
 'Generative Adversarial Networks',
 'Generative A

In [57]:
def merging_interests(interest):
    """Map a GPT-generated research-interest label onto its canonical label.

    Parameters
    ----------
    interest : str
        Raw label, e.g. 'Explainable Artificial Intelligence (XAI)'.

    Returns
    -------
    str
        The canonical label of the first rule with a substring contained in
        ``interest``, or ``interest`` unchanged when no rule matches.
    """
    # Ordered (substrings, canonical label) rules - the first match wins.
    merge_rules = (
        # Must come before the generic 'Adversarial' rule; otherwise
        # 'Generative Adversarial Networks' is relabelled 'Adversarial Attack'.
        (('Generative Adversarial',), 'Generative Adversarial Networks'),
        (('Adversarial',), 'Adversarial Attack'),
        (('Autonomous Driving', 'Autonomous Vehicle'), 'Autonomous Driving System'),
        # NOTE(review): 'Brain Machine Interface' is folded in as a variant of
        # the same topic - confirm this merge is wanted.
        (('Brain-Computer Interface', 'Brain Computer Interface', 'Brain Machine Interface'),
         'Brain-Computer Interface'),
        (('Explainable AI', 'Explainable Artificial Intelligence'), 'Explainable AI'),
        # 'Graph Convolution' also matches 'Graph Convolutional ...'.
        (('Graph Convolution',), 'Graph Convolutional Network'),
        (('Medical Image Segmentation',), 'Medical Image Segmentation'),
        (('Wireless Communication',), 'Wireless Communication'),
    )
    for substrings, canonical in merge_rules:
        if any(substring in interest for substring in substrings):
            return canonical
    return interest

## Faculty Member Table 
- save it as faculty_member.csv in processed_data directory

In [58]:
# Build, per professor and in name_list order, the merged recent research
# interests (from chatgpt) and the recent publication / citation counts.
# Every iteration appends exactly one entry to each of the three lists so they
# stay aligned with name_list.

# First publication year that counts as "recent".
RECENT_YEAR_START = 2020

interest_list = []
pub_list = []
citation_list = []
for name in name_list:
    filename = name.lower().replace(' ','_')
    with open(f"./gpt_data/interest_{filename}.json",'r') as f:
        profile_interest = json.load(f)

    with open(f"./prof_raw_data/goog_sch_{filename}.json",'r') as f:
        goog_sch_profile = json.load(f)

    if profile_interest['interests']:
        # add recent research interest generated based on author's recent articles
        # dict.fromkeys de-duplicates while keeping first-seen order, so the
        # column content is the same on every run.
        filtered_interest = []
        for interest in dict.fromkeys(profile_interest['interests']):
            # remove None which is returned if gpt3.5 is unsure of field of research for a specific article and invalid outputs that are very long(sentences)
            if interest!="None" and len(interest)<50:
                merged_label = merging_interests(interest)
                # Several raw labels can merge into the same canonical label; list it once.
                if merged_label not in filtered_interest:
                    filtered_interest.append(merged_label)
        interest_list.append(','.join(filtered_interest))
    else:
        interest_list.append(' ')

    # A missing profile (None or {}), a missing 'articles' key and an empty
    # article list all yield NaN. Previously an empty list appended nothing,
    # which left pub_list / citation_list shorter than name_list.
    articles = goog_sch_profile.get('articles') if goog_sch_profile else None
    if articles:
        recent_pub_count = 0
        recent_citation_count = 0
        for article in articles:
            if 'publication_date' not in article:
                continue
            if int(article['publication_date'].split('/')[0])>=RECENT_YEAR_START:
                recent_pub_count+=1
                if 'total_citations' in article:
                    recent_citation_count+=article['total_citations']
        citation_list.append(recent_citation_count)
        pub_list.append(recent_pub_count)
    else:
        citation_list.append(np.nan)
        pub_list.append(np.nan)


In [59]:
# Attach the per-professor aggregates as new columns, in this order.
recent_columns = {
    'Recent Research Interest': interest_list,
    'Recent Publication Count': pub_list,
    'Recent Citation Count': citation_list,
}
for column_name, column_values in recent_columns.items():
    scse_profile[column_name] = column_values

# Average citations per recent paper, rounded to two decimals.
citations_per_paper = scse_profile['Recent Citation Count'] / scse_profile['Recent Publication Count']
scse_profile['Avg Citation Per Paper'] = citations_per_paper.round(2)

# Capitalised headers for display.
scse_profile = scse_profile.rename(columns={'name':'Name','email':'Email'})

In [60]:
# Save the faculty table. No index argument is passed, so pandas' default
# applies and the row index is written as the first, unnamed CSV column.
scse_profile.to_csv('./processed_data/faculty_member.csv')

In [61]:
# Display the final faculty table as a visual check of the new columns.
scse_profile

Unnamed: 0,Name,Email,Recent Research Interest,Recent Publication Count,Recent Citation Count,Avg Citation Per Paper
0,A S Madhukumar,asmadhukumar@ntu.edu.sg,"Cybersecurity,Reinforcement Learning,Deep Lear...",32.0,255.0,7.97
1,Alexei Sourin,assourin@ntu.edu.sg,"Machine Learning,Multimodal learning,Deep Lear...",22.0,140.0,6.36
2,Anupam Chattopadhyay,anupam@ntu.edu.sg,"Cybersecurity,Neuromorphic Computing,Natural L...",107.0,1011.0,9.45
3,Anwitaman Datta,anwitaman@ntu.edu.sg,"Cybersecurity,Natural Language Processing,Bloc...",24.0,81.0,3.38
4,Arvind Easwaran,arvinde@ntu.edu.sg,"Cybersecurity,Reinforcement Learning,Machine L...",47.0,204.0,4.34
...,...,...,...,...,...,...
81,Zhang Jie,zhangj@ntu.edu.sg,"Operations Research,Recommender Systems,Reinfo...",65.0,1400.0,21.54
82,Zhang Tianwei,tianwei.zhang@ntu.edu.sg,"Cybersecurity,Artificial Intelligence,Reinforc...",156.0,1612.0,10.33
83,Zhao Jun,junzhao@ntu.edu.sg,"Adversarial Attack,Wireless Communication,Cybe...",172.0,5110.0,29.71
84,Zheng Jianmin,asjmzheng@ntu.edu.sg,"Machine Learning,Multimodal learning,Computer ...",46.0,349.0,7.59


# Faculty Metrics

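- Total number of publications, citations, grants and patents across the faculty, plus the current-year publication and citation counts (stored as `delta`)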

In [62]:
# Faculty-wide totals. 'delta' holds the current-year count of the
# corresponding metric.
metric = {'pubs':{'Total Publications':0,'delta':0},'citations':{'Total Citations':0,'delta':0},'total_grants':0,'total_patents':0}

# Year keys in the processed profiles are strings (JSON object keys always
# are), so the current year is compared as a string. Computed once, not per professor.
cur_year = str(datetime.date.today().year)

for name in name_list:
    # Only the processed profile is needed: it already carries the grants and
    # patents copied from the raw DR-NTU profile, which was previously loaded
    # here without being used.
    with open(f"{processed_dir}{name.lower().replace(' ','_')}.json", 'r') as f:
        processed_profile = json.load(f)

    pub_graph = processed_profile['pub_graph']
    citation_graph = processed_profile['citation_graph']

    metric['pubs']['delta'] += pub_graph.get(cur_year, 0)
    metric['citations']['delta'] += citation_graph.get(cur_year, 0)
    metric['pubs']['Total Publications'] += sum(pub_graph.values())
    metric['citations']['Total Citations'] += sum(citation_graph.values())
    metric['total_grants'] += len(processed_profile['grants'])
    metric['total_patents'] += len(processed_profile['patents'])

with open('./processed_data/scse_metric.json','w') as f:
    json.dump(metric,f)

In [None]:
# (Stray scratch cell cleared: it held only the undefined name `dd`, which
# raised NameError when the notebook was run top to bottom.)