In [1]:
import pandas as pd
import ipywidgets as widgets
from ipywidgets import interact
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from wordcloud import WordCloud, STOPWORDS
import plotly.express as px
# Matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8*' and newer
# releases no longer accept the old names, so use the new name when it exists
# and fall back to the legacy one on older installations.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')

In [2]:
# Load the per-author publication table. Later cells filter it on the
# `name`, `dept`, `area` and `year` columns and read `count` and `all_pub`.
all_info = pd.read_csv('all-author-info.csv')

In [3]:
# Preview the loaded table (pandas elides the middle rows of a long frame).
all_info

Unnamed: 0,name,dept,area,count,adjustedcount,year,all_pub
0,Ming Zhang 0004,Peking University,acl,1.0,0.200000,2020,Learning to Customize Model Structures for Few...
1,Jong-Hyeok Lee,POSTECH,acl,1.0,0.333333,2010,Evaluating Multilanguage-Comparability of Subj...
2,Sam Wiseman,Duke University,acl,2.0,0.500000,2020,Discrete Latent Variable Representations for L...
3,Karl Stratos,Rutgers University,acl,1.0,0.250000,2020,Discrete Latent Variable Representations for L...
4,Karen Livescu,TTI Chicago,acl,2.0,0.500000,2020,Discrete Latent Variable Representations for L...
...,...,...,...,...,...,...,...
131040,Diodato Ferraioli,University of Salerno,wine,1.0,0.200000,2017,Information Retention in Heterogeneous Majorit...
131041,Clemente Galdi,University of Salerno,wine,1.0,0.200000,2017,Information Retention in Heterogeneous Majorit...
131042,Giuseppe Persiano,University of Salerno,wine,1.0,0.200000,2017,Information Retention in Heterogeneous Majorit...
131043,Avrim Blum,TTI Chicago,wine,1.0,0.333333,2015,Online Allocation and Pricing with Economies o...


# Research Focus Changes With Time

## School Level Streamgraph

In [4]:
def count_area_year_pub(df, year):
    """Total the ``count`` column per ``area`` for a single year.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``year``, ``area`` and ``count`` columns.
    year : int
        Only rows whose ``year`` equals this value are included.

    Returns
    -------
    dict
        Maps each area present in that year to the sum of its ``count``
        values; empty when no row matches.
    """
    in_year = df['year'] == year
    per_area = df.loc[in_year].groupby('area')['count'].sum()
    return per_area.to_dict()

def get_topk_count(all_year_info, k):
    """Build per-year series for the ``k`` areas with the largest overall total.

    Parameters
    ----------
    all_year_info : list of dict
        One mapping of area -> count per year, in chronological order.
    k : int
        Number of areas to keep, ranked by their total across all years.

    Returns
    -------
    dict
        Maps each kept area (highest total first) to a list with one value
        per entry of ``all_year_info``; years where the area is absent
        contribute 0.
    """
    # Accumulate the grand total of every area across all years.
    totals = {}
    for yearly in all_year_info:
        for area, n_pubs in yearly.items():
            totals[area] = totals.get(area, 0) + n_pubs

    # Stable descending sort: ties keep the order in which areas first appeared.
    ranked = sorted(totals, key=totals.get, reverse=True)

    return {
        area: [yearly.get(area, 0) for yearly in all_year_info]
        for area in ranked[:k]
    }

def plot_streamgraph(counts, years):
    """Draw a streamgraph (stacked areas around a symmetric baseline).

    Parameters
    ----------
    counts : dict
        Maps a legend label to a sequence of values, one per entry of
        ``years``.
    years : sequence
        X positions shared by every series.

    Returns
    -------
    None
        The chart is drawn on a new matplotlib figure. When ``counts`` is
        empty a short message is printed and no figure is created.
    """
    if not counts:
        # With no series there is nothing to stack and the colour cycle would
        # be empty, so report it instead of handing empty data to matplotlib.
        print("No publications to plot for this selection.")
        return

    labels = list(counts)
    y = [np.array(counts[label]) for label in labels]

    fig, ax = plt.subplots(figsize=(10, 7))
    # One distinct hue per series so neighbouring bands stay distinguishable.
    colors = sns.color_palette("hls", len(labels))
    ax.set_prop_cycle('color', colors)
    ax.stackplot(years, y, baseline="sym", labels=labels)
    ax.set_xlabel('Year')
    ax.set_ylabel('Publications')
    ax.legend(loc='upper left')

# Institutions offered in the `name` dropdown of the school-level widget.
# Each string is compared for equality against the `dept` column of `all_info`.
dept_list = [
    'Peking University',
    'University of Waterloo',
    'University of Toronto',
    'Stanford University',
    'Tsinghua University',
    'Massachusetts Institute of Technology',
    'Carnegie Mellon University'
]
@interact
def school(name=dept_list, topk=(1, 20, 1)):
    """Interactive streamgraph of one institution's top areas for 2010-2022.

    ``name`` selects the institution (matched against the ``dept`` column)
    and ``topk`` is the number of areas to display.
    """
    year_list = list(range(2010, 2023))
    school_df = all_info.loc[all_info['dept'] == name]
    # One area -> count mapping per year, in chronological order.
    all_year_info = [count_area_year_pub(school_df, year) for year in year_list]
    plot_streamgraph(get_topk_count(all_year_info, topk), year_list)

interactive(children=(Dropdown(description='name', options=('Peking University', 'University of Waterloo', 'Un…

In [5]:
# Re-declares the same institution list that the school-level streamgraph
# cell defines; the values feed the `name` dropdown of `school_area`.
dept_list = [
    'Peking University',
    'University of Waterloo',
    'University of Toronto',
    'Stanford University',
    'Tsinghua University',
    'Massachusetts Institute of Technology',
    'Carnegie Mellon University'
]
# Choices for the `area` dropdown of `school_area`; each string is compared
# for equality against the `area` column of `all_info`.
area_list = [
    'cvpr',
    'emnlp',
    'nips',
    'aaai'
]
@interact
def school_area(name=dept_list, area=area_list, year=(2010, 2022, 1)):
    """Interactive pie chart of per-author ``count`` for one school, area and year.

    The three widget values are matched against the ``dept``, ``area`` and
    ``year`` columns; each slice is labelled with the ``name`` column.
    """
    selected = (
        (all_info['dept'] == name)
        & (all_info['year'] == year)
        & (all_info['area'] == area)
    )
    fig = px.pie(all_info.loc[selected], values='count', names='name')
    fig.show()

interactive(children=(Dropdown(description='name', options=('Peking University', 'University of Waterloo', 'Un…

## Faculty Level Streamgraph

In [6]:
# Names offered in the `name` dropdown of the faculty-level widgets.
# Each string is compared for equality against the `name` column of
# `all_info`, so it must match exactly (including suffixes such as '0001').
faculty_list = [
    'Jimmy Lin',
    'Yaoliang Yu',
    'Graham Neubig',
    'Heng Ji',
    'Noah A. Smith',
    'Daniel Vogel 0001'
]
@interact
def faculty(name=faculty_list, topk=(1, 20, 1)):
    """Interactive streamgraph of one author's top areas for 2010-2022.

    ``name`` selects the author (matched against the ``name`` column) and
    ``topk`` is the number of areas to display.
    """
    year_list = list(range(2010, 2023))
    faculty_df = all_info.loc[all_info['name'] == name]
    # One area -> count mapping per year, in chronological order.
    all_year_info = [count_area_year_pub(faculty_df, year) for year in year_list]
    plot_streamgraph(get_topk_count(all_year_info, topk), year_list)

interactive(children=(Dropdown(description='name', options=('Jimmy Lin', 'Yaoliang Yu', 'Graham Neubig', 'Heng…

## Faculty Level Word Cloud

In [7]:
def word_cloud(df, year):
    """Show a word cloud built from the ``all_pub`` text of one year.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``year`` column and an ``all_pub`` text column.
    year : int
        Only rows whose ``year`` equals this value are used.

    Returns
    -------
    None
        The cloud is displayed with matplotlib and the list of source
        titles is printed below it. When the selected year has no usable
        text, a short message is printed and no figure is produced.
    """
    df_year = df.loc[df['year'] == year]
    # Read the column directly instead of iterating rows; missing entries are
    # dropped because str.join cannot concatenate NaN values.
    all_titles = df_year['all_pub'].dropna().astype(str).tolist()
    text = " ".join(all_titles).lower()
    if not text.strip():
        # WordCloud.generate raises ValueError when given no words, which
        # would surface as a traceback each time the slider hits such a year.
        print(f"No publication titles found for {year}.")
        return

    stopwords = set(STOPWORDS)
    wordcloud = WordCloud(width = 800, height = 800,
                background_color ='white',
                stopwords = stopwords,
                min_font_size = 10).generate(text)
    plt.figure(figsize = (5, 5), facecolor = None)
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.tight_layout(pad = 0)
    plt.show()
    # Kept from the original cell: list the titles the cloud was built from.
    print(all_titles)
    
@interact
def faculty_word_cloud(name=faculty_list, year=(2010, 2022, 1)):
    """Interactive word cloud of one author's publication text for one year.

    ``name`` is matched against the ``name`` column; the matching rows and
    ``year`` are passed on to ``word_cloud``, whose result is returned.
    """
    is_selected = all_info['name'] == name
    return word_cloud(all_info.loc[is_selected], year)

interactive(children=(Dropdown(description='name', options=('Jimmy Lin', 'Yaoliang Yu', 'Graham Neubig', 'Heng…