# Visualization Project

### Research Paper Impact (Citations) and Its Artifact Usage (Downloads)

The goal of this study is to find out whether the usage (downloads) of artifacts produced by research papers (e.g., Python packages) correlates with the impact of those papers, measured by their citations. In particular, this study aims to answer the following three questions:

1. *Does an increase in the download counts of a particular artifact indicate a rise in the impact (citations) of the corresponding paper in the research community?*

2. *Conversely, does a decrease in the download counts of a particular artifact indicate a decline in the impact (citations) of the corresponding paper in the research community?*

3. *Finally, is the opposite true? In other words, does a higher impact (citations) of a particular research paper correspond to a greater download count of its artifact?*


### Data Sources

1. [Google BigQuery PyPI Downloads Table](https://packaging.python.org/en/latest/guides/analyzing-pypi-package-downloads/) 

2. [Google Scholar (Scraping)](https://scholar.google.ca/)

3. [Semantic Scholar Paper Lookup API](https://www.semanticscholar.org/product/api)


### Artifacts Used

1. Python Package: pyserini 
    - Paper: *Pyserini: A Python toolkit for reproducible information retrieval research with sparse and dense representations*
2. Python Package: CoreNLP
    - Paper: *The Stanford CoreNLP natural language processing toolkit*
3. Python Package: PyTorch
    - Paper: *Pytorch: An imperative style, high-performance deep learning library*

In [144]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

In [182]:
# Package to visualize. main() accepts 'Pyserini', 'CoreNLP', 'PyTorch', or
# 'All' ('All' shows a totals table instead of the two line charts).
package = "Pyserini"
# Aggregation granularity of the line charts: 'Yearly' or 'Monthly'.
timeline = "Yearly"

In [213]:
def getDownloadCounts(filename, timeline):
    """Load a package's download counts, optionally summed per year.

    Args:
        filename: Path (or file-like object) of a CSV with ``month`` and
            ``num_downloads`` columns.
        timeline: ``'Yearly'`` to sum the counts per calendar year; any other
            value leaves the rows as they are in the CSV, one per row.

    Returns:
        A DataFrame sorted chronologically. For ``'Yearly'`` it has the
        columns ``Year`` and ``Download Counts``; otherwise the CSV columns
        are kept with ``month`` renamed to ``Month`` (parsed as datetime) and
        ``num_downloads`` renamed to ``Download Counts``.
    """
    display_names = {'num_downloads': 'Download Counts', 'month': 'Month'}
    downloads = pd.read_csv(filename)
    downloads["month"] = pd.to_datetime(downloads["month"])
    downloads = downloads.sort_values(by="month").rename(columns=display_names)

    if timeline != 'Yearly':
        return downloads

    # Grouping by the year of 'Month' yields an index that is still named
    # 'Month'; it is renamed to 'Year' once it has become a column.
    calendar_year = downloads['Month'].dt.year
    per_year = downloads.groupby(calendar_year)['Download Counts'].sum()
    return per_year.reset_index().rename(columns={'Month': 'Year'})

def getCitationCounts(filename, timeline):
    """Load a paper's citation counts, optionally summed per year.

    Args:
        filename: Path (or file-like object) of a CSV with ``Month`` and
            ``Citation Counts`` columns.
        timeline: ``'Yearly'`` to sum the counts per calendar year; any other
            value leaves the rows as they are in the CSV, one per row.

    Returns:
        A DataFrame sorted chronologically. For ``'Yearly'`` it has the
        columns ``Year`` and ``Citation Counts``; otherwise it is the CSV
        content with ``Month`` parsed as datetime.
    """
    citations = pd.read_csv(filename)
    citations["Month"] = pd.to_datetime(citations["Month"])
    citations = citations.sort_values(by="Month")

    if timeline != 'Yearly':
        return citations

    calendar_year = citations['Month'].dt.year
    per_year = citations.groupby(calendar_year)['Citation Counts'].sum()
    # Name the group index 'Year' so reset_index() emits it under that header.
    return per_year.rename_axis('Year').reset_index()

def getAll(package_key, fNCitationCounts, fNDownloadCounts):
    """Compute total downloads and total citations for every package.

    Args:
        package_key: Mapping of package name to its index in the two file
            lists.
        fNCitationCounts: Citation CSV files (with a ``Citation Counts``
            column), indexed through ``package_key``.
        fNDownloadCounts: Download CSV files (with a ``num_downloads``
            column), indexed through ``package_key``.

    Returns:
        A tuple ``(downloads, citations)``. Each element is a pair of
        parallel lists ``[package_names, totals]`` ordered from the largest
        total to the smallest.
    """
    citation_totals = []
    download_totals = []
    for name, file_index in package_key.items():
        citation_frame = pd.read_csv(fNCitationCounts[file_index])
        download_frame = pd.read_csv(fNDownloadCounts[file_index])
        citation_totals.append((name, citation_frame['Citation Counts'].sum()))
        download_totals.append((name, download_frame['num_downloads'].sum()))

    def ranked_columns(pairs):
        # Order (name, total) pairs by descending total, then split them into
        # a list of names and a list of totals.
        ranking = sorted(pairs, key=lambda pair: pair[1], reverse=True)
        return [[name for name, _ in ranking], [total for _, total in ranking]]

    return (ranked_columns(download_totals), ranked_columns(citation_totals))

def plotLineGraph(df, x_name, y_name, title, timeline):
    """Show an interactive line chart of one count series over time.

    Args:
        df: DataFrame holding the series to plot.
        x_name: Name of the column used for the x axis.
        y_name: Name of the column used for the y axis.
        title: Chart title.
        timeline: ``'Monthly'`` enables month-based tick spacing and labels
            on the x axis; any other value keeps plotly's default ticks.

    Returns:
        None. The figure is displayed with ``fig.show()``.
    """
    # px.line already returns a complete figure, so no separate
    # make_subplots() grid is created here (it would be discarded unused).
    fig = px.line(df, x=x_name, y=y_name, title=title)

    if timeline == 'Monthly':
        # One tick every three months, labelled with the abbreviated month
        # and the year on separate lines.
        fig.update_xaxes(dtick="M3", tickformat="%b\n%Y")

    fig.update_annotations(font_size=23)
    fig.update_xaxes(rangeslider_visible=True)
    fig.update_yaxes(title_font=dict(size=20), tickfont=dict(size=20))
    fig.update_xaxes(title_font=dict(size=20), tickfont=dict(size=15))
    fig.update_layout(title_x=0.5, title_font=dict(size=35))

    fig.show()


def plotTable(total_counts):
    """Show two stacked tables: total downloads and total citations.

    Args:
        total_counts: Pair ``(downloads, citations)``; each element is
            indexed as ``[package_names, totals]``. The first element fills
            the top table and the second fills the bottom table.

    Returns:
        None. The figure is displayed with ``fig.show()``.
    """
    fig = make_subplots(
        rows=2,
        cols=1,
        specs=[[{"type": "table"}], [{"type": "table"}]],
    )

    # Header of the second column for the top and bottom table respectively.
    count_titles = (' Total Download Counts', 'Total Citation Counts')

    for position, count_title in enumerate(count_titles):
        names_and_totals = total_counts[position]
        # NOTE(review): the header sets both font_size=24 and font size=15;
        # which one takes effect is decided by plotly -- confirm the intent.
        header_spec = dict(
            values=['Package Name', count_title],
            line_color='darkslategray',
            fill_color='royalblue',
            align='center',
            font_size=24,
            height=30,
            font=dict(color='white', size=15),
        )
        cells_spec = dict(
            values=[
                names_and_totals[0],  # 1st column
                names_and_totals[1],  # 2nd column
            ],
            line_color='darkslategray',
            fill_color='white',
            align='center',
            font_size=22,
            height=30,
        )
        fig.add_trace(
            go.Table(header=header_spec, cells=cells_spec),
            row=position + 1,
            col=1,
        )

    fig.update_layout(width=700)
    fig.show()


In [214]:
def main(package, timeline):
    """Plot the download and citation data for one package or for all.

    Args:
        package: ``'Pyserini'``, ``'CoreNLP'`` or ``'PyTorch'`` to draw that
            package's download and citation line charts, or ``'All'`` to
            show tables of total counts for every package.
        timeline: ``'Yearly'`` or ``'Monthly'``; only used when a single
            package is plotted.

    Raises:
        KeyError: If ``package`` (other than ``'All'``) or ``timeline`` is
            not one of the values listed above.
    """
    citation_files = ['coreNLP_citation_counts.csv', 'pyserini_citation_counts.csv', 'pyTorch_citation_counts.csv']
    download_files = ['coreNLP_download_counts.csv', 'pyserini_download_counts.csv', 'pyTorch_download_counts.csv']
    # Position of each package's files in the two lists above.
    package_key = {'Pyserini': 1, 'CoreNLP': 0, 'PyTorch': 2}
    # x-axis column produced by the loaders for each timeline.
    time_column = {'Yearly': 'Year', 'Monthly': 'Month'}

    if package == 'All':
        plotTable(getAll(package_key, citation_files, download_files))
        return

    file_index = package_key[package]
    downloads_csv = download_files[file_index]
    citations_csv = citation_files[file_index]
    x_column = time_column[timeline]

    downloads = getDownloadCounts(downloads_csv, timeline)
    citations = getCitationCounts(citations_csv, timeline)
    plotLineGraph(downloads, x_column, 'Download Counts', 'Download Counts Over Time', timeline)
    plotLineGraph(citations, x_column, 'Citation Counts', 'Citation Counts Over Time', timeline)

In [215]:
# Render the charts (or the totals table) for the package and timeline set above.
main(package,timeline)