In [19]:
import os
import pandas as pd
import json
import numpy as np
import datetime

# Extra Pre-Processing

In [38]:
# Directories. Each value ends with '/' because file paths in this notebook
# are built by plain string concatenation (f"{directory}{prefix}{name}.json").
raw_dir = "./prof_raw_data/"        # raw per-professor JSON inputs
gpt_dir = "./gpt_data/"             # education info JSON inputs
processed_dir = "./processed_data/"  # merged per-professor JSON outputs

# File-name prefixes that identify the data source of each JSON file.
goog_prefix = "goog_sch_"            # files under raw_dir
dr_ntu_prefix = "dr_ntu_"            # files under raw_dir
education_prefix = "education_"      # files under gpt_dir

In [39]:
# Profile listing stored in the raw-data directory; its 'name' column drives
# the merge loop further down. The path is built from raw_dir (same location
# as before) so that changing the directory constant updates every reader.
scse_profile = pd.read_csv(f"{raw_dir}scse_profiles.csv")

## Publications per Year

In [89]:
def calc_pubs(articles):
    """Count publications per calendar year.

    Parameters
    ----------
    articles : iterable of dict
        Article records. Only records that contain a 'publication_date' key
        are considered; the year is the text before the first '/'.

    Returns
    -------
    tuple
        ``(pub_yearly, min_year)``. ``pub_yearly`` maps every year from the
        earliest counted publication year up to the current year to the
        number of papers published in it (0 for years without papers).
        ``min_year`` is that earliest year. If no article has a usable date
        the result is ``({}, np.inf)``.

    Raises
    ------
    ValueError
        If a 'publication_date' does not start with an integer year.
    """
    cur_year = datetime.date.today().year
    min_year = np.inf
    pub_yearly = {}
    for article in articles:
        if 'publication_date' not in article:
            continue
        pub_year = int(article['publication_date'].split('/')[0])
        # A year in the future is treated as an unknown publish date: it is
        # neither counted nor allowed to define the earliest year.
        if pub_year > cur_year:
            continue
        min_year = min(min_year, pub_year)
        pub_yearly[pub_year] = pub_yearly.get(pub_year, 0) + 1

    # Nothing was counted: min_year is still the np.inf sentinel, which
    # range() below cannot accept, so return early instead of raising.
    if not pub_yearly:
        return pub_yearly, min_year

    # Filling in 0 for years without papers
    for year in range(min_year, cur_year + 1):
        pub_yearly.setdefault(year, 0)

    return pub_yearly, min_year

## Citations Per Year

In [94]:
def calc_citation(articles):
    """Aggregate citations received per calendar year across all articles.

    Parameters
    ----------
    articles : iterable of dict
        Article records. Only records that contain both 'citation_graph'
        (a mapping of year -> citation count; keys may be strings) and
        'publication_date' (year is the text before the first '/') are used.

    Returns
    -------
    dict
        Maps each year from the earliest publication year (among the records
        used) up to the current year to the total citations received in that
        year, with 0 for years without citations. Returns ``{}`` when no
        record qualifies.

    Raises
    ------
    ValueError
        If a publication date or a citation-graph key is not an integer year.
    """
    cur_year = datetime.date.today().year
    min_year = np.inf
    citations_yearly = {}
    for article in articles:
        if 'citation_graph' not in article or 'publication_date' not in article:
            continue
        pub_year = int(article['publication_date'].split('/')[0])
        min_year = min(min_year, pub_year)

        for year, citation_count in article['citation_graph'].items():
            year = int(year)
            # Ignore citation counts attributed to future years.
            if year <= cur_year:
                citations_yearly[year] = citations_yearly.get(year, 0) + citation_count

    # No qualifying article: min_year is still the np.inf sentinel, which
    # range() below cannot accept, so return early instead of raising.
    if min_year == np.inf:
        return {}

    # Filling in 0 for years without citations
    for year in range(min_year, cur_year + 1):
        citations_yearly.setdefault(year, 0)

    # Drop years before the earliest publication date: a citation cannot
    # precede the first paper.
    return {
        year: citation_count
        for year, citation_count in citations_yearly.items()
        if year >= min_year
    }

## Average Citations per Paper per Year

In [100]:
def calc_avg_citation(pubs_yearly,citation_yearly,pub_min_year):
    """Cumulative average citations per paper, for every year to date.

    For each year from ``pub_min_year`` through the current year the value is
    (citations received up to and including that year) divided by
    (papers published up to and including that year).

    Parameters
    ----------
    pubs_yearly : dict
        year -> number of papers published in that year.
    citation_yearly : dict
        year -> number of citations received in that year. May start later
        than ``pub_min_year`` (or be empty); missing years count as 0.
    pub_min_year : int
        Earliest publication year. Ignored when ``pubs_yearly`` is empty.

    Returns
    -------
    dict
        year -> average citations per paper to date. The value is 0 while no
        paper has been published yet. ``{}`` when ``pubs_yearly`` is empty.
    """
    # Without publications there is nothing to average (and pub_min_year may
    # not be a usable integer), so return an empty graph instead of raising.
    if not pubs_yearly:
        return {}

    cur_year = datetime.date.today().year
    avg_citation = {}
    pub_count_to_date = 0
    cite_count_to_date = 0
    # Single pass with running totals. The current year is always included,
    # and every paper since pub_min_year counts towards the denominator even
    # when citation data only starts in a later year.
    for year in range(pub_min_year, cur_year + 1):
        pub_count_to_date += pubs_yearly.get(year, 0)
        cite_count_to_date += citation_yearly.get(year, 0)
        if pub_count_to_date:
            avg_citation[year] = cite_count_to_date / pub_count_to_date
        else:
            avg_citation[year] = 0

    return avg_citation

## h-Index

- Calculate the all-time (cumulative) h-index for each year

In [92]:
def calc_h_index(articles):
    """Compute the all-time h-index as it stood at the end of each year.

    Only articles carrying both 'citation_graph' (year string -> citations in
    that year) and 'publication_date' (year is the text before the first '/')
    are used; articles dated after the current year are skipped.

    Returns a dict mapping each year, from an article's publication year up
    to the current year, to the h-index computed from every such article's
    cumulative citation total in that year.
    """
    this_year = datetime.date.today().year

    # year -> one cumulative citation total per article published by then
    totals_by_year = {}
    for paper in articles:
        if not ('citation_graph' in paper and 'publication_date' in paper):
            continue
        first_year = int(paper['publication_date'].split('/')[0])
        if first_year > this_year:
            continue
        graph = paper['citation_graph']
        running_total = 0
        for yr in range(first_year, this_year + 1):
            running_total += graph.get(str(yr), 0)
            totals_by_year.setdefault(yr, []).append(running_total)

    # h-index: number of papers whose total is at least their 1-based rank
    # when the totals are sorted from most to least cited.
    h_index_yearly = {}
    for yr, totals in totals_by_year.items():
        ranked = sorted(totals, reverse=True)
        h_index_yearly[yr] = sum(
            1 for rank, cites in enumerate(ranked, start=1) if cites >= rank
        )

    return h_index_yearly

## Collaboration Network

- Process co-authors into a structure suitable for building the collaboration network graph on the dashboard

In [None]:
def process_co_authors(co_authors_url):
    """Placeholder for the collaboration-network processing step.

    According to the section note, this is meant to process co-authors into
    a form suitable for a network graph on the dashboard. It has no
    implementation yet.

    Parameters
    ----------
    co_authors_url
        Presumably a URL from which co-author data is obtained -- TODO confirm.

    Raises
    ------
    NotImplementedError
        Always, until the processing logic is written.
    """
    # The original definition had no body, which is a SyntaxError; fail
    # explicitly when called instead.
    raise NotImplementedError("process_co_authors is not implemented yet")

# Merging all Data Sources

- gpt_data, prof_raw_data and collaboration_network data

In [101]:
# Keys copied verbatim from each source into the merged profile.
# They do not depend on the professor, so they are defined once up front.
goog_sch_keys = ['goog_sch_url','interests','citation_table','citation_graph']
dr_ntu_keys = ['full_name','email','name_card','designations','urls','biography','grants','keywords']

# Make sure the output directory exists before the first write.
os.makedirs(processed_dir, exist_ok=True)

for name in scse_profile['name']:
    # File names use the lower-cased name with spaces replaced by underscores.
    filename = name.lower().replace(' ','_')

    # read from raw data source and gpt data source
    with open(f"{raw_dir}{goog_prefix}{filename}.json",'r') as f:
        goog_sch_profile = json.load(f)

    with open(f"{raw_dir}{dr_ntu_prefix}{filename}.json",'r') as f:
        dr_ntu_profile = json.load(f)

    with open(f"{gpt_dir}{education_prefix}{filename}.json",'r') as f:
        education_info = json.load(f)

    merged_profile = {}

    for key in dr_ntu_keys:
        merged_profile[key] = dr_ntu_profile[key]

    # Skip empty goog scholar profiles
    if goog_sch_profile:
        for key in goog_sch_keys:
            merged_profile[key] = goog_sch_profile[key]

    # add education background information generated from chatgpt
    merged_profile['education'] = education_info

    # Default to empty graphs when the profile has no article list.
    pub_yearly = {}
    citation_yearly = {}
    h_index_yearly = {}
    avg_citation_yearly = {}
    # The truthiness check keeps a null/empty profile from reaching the
    # membership test (which would raise on None).
    if goog_sch_profile and 'articles' in goog_sch_profile:
        # adding pre-processed information
        pub_yearly,min_year = calc_pubs(goog_sch_profile['articles'])
        citation_yearly = calc_citation(goog_sch_profile['articles'])
        h_index_yearly = calc_h_index(goog_sch_profile['articles'])
        avg_citation_yearly = calc_avg_citation(pub_yearly,citation_yearly,min_year)

    merged_profile['pub_graph'] = pub_yearly
    # NOTE(review): this replaces the 'citation_graph' value copied from the
    # Google Scholar profile above with the per-article aggregate (or {} when
    # there are no articles) -- confirm that the overwrite is intended.
    merged_profile['citation_graph'] = citation_yearly
    merged_profile['h_index_graph'] = h_index_yearly
    merged_profile['avg_citation_graph'] = avg_citation_yearly

    with open(f"{processed_dir}{filename}.json",'w') as f:
        json.dump(merged_profile,f)
