In [None]:
import pickle
import pandas as pd
from utils.data_tools import flatten

In [None]:
# Read the pickled keyword -> papers mapping from ./data/keyword_paper_dict.pkl.
# NOTE(review): pickle.load can execute arbitrary code while loading; only use
# pickle files that come from a trusted source.
with open('./data/keyword_paper_dict.pkl', 'rb') as pickle_file:
    keyword_paper_dict = pickle.load(pickle_file)

In [None]:
# data cleaning

# Merge keywords that differ only by a trailing comma (e.g. "qos" and "qos,")
# into a single entry stored under the comma-free keyword.
cleaned_keyword_paper_dict = {}
for key, papers in keyword_paper_dict.items():
    # Normalise the key first so the merge does not depend on which variant is
    # iterated first; endswith() is also safe for an empty key, unlike key[-1].
    new_key = key[:-1] if key.endswith(",") else key
    if new_key in cleaned_keyword_paper_dict:
        # Both variants exist: append this variant's papers to the ones already stored.
        cleaned_keyword_paper_dict[new_key] = pd.concat([cleaned_keyword_paper_dict[new_key], papers])
    else:
        cleaned_keyword_paper_dict[new_key] = papers

## Number of Papers per Keyword

In [None]:
# Number of papers stored for each cleaned keyword.
keyword_paper_number = {
    keyword: len(papers)
    for keyword, papers in cleaned_keyword_paper_dict.items()
}

In [None]:
# Bar chart of the keywords with the most papers.
import plotly.express as px
import pandas as pd

show_top = 30  # number of keywords shown in the chart

# One row per keyword, ordered from the highest to the lowest paper count.
df = (
    pd.DataFrame.from_dict(keyword_paper_number, orient='index')
    .reset_index()
    .set_axis(['keyword', 'count'], axis=1)
    .sort_values(by='count', ascending=False)
)

chart_title = f"Number of Papers found per Keyword<br /><sub>Showing top {show_top} keywords of {len(df)}. Maximum is 10k papers (didn't get more...)</sub>"
fig = px.bar(df.head(show_top), x='keyword', y='count')
fig.update_layout(title=chart_title, xaxis_title="Keyword", yaxis_title="Number of Papers")

fig.show()


In [None]:
# Collect the papers of every keyword in one frame. Concatenate once rather than
# growing the frame inside a loop, which would re-copy all rows on every iteration.
all_paper_dict = pd.concat(list(keyword_paper_dict.values()))
len_before_removing_duplicates = len(all_paper_dict)
all_paper_dict = all_paper_dict.drop_duplicates(subset="DOI") # remove duplicates based on DOI
len_after_removing_duplicates = len(all_paper_dict)
# Highest "is-referenced-by-count" first, with a fresh 0..n-1 index.
all_paper_dict = all_paper_dict.sort_values(by="is-referenced-by-count", ascending=False).reset_index(drop=True)
print(f"Removed {len_before_removing_duplicates - len_after_removing_duplicates} duplicates. {len_after_removing_duplicates} papers left.")
all_paper_dict.head()

In [None]:
# Bar chart of the most popular keywords.
# NOTE(review): `keyword_counter` is not defined in any visible cell of this
# notebook - confirm where it is built, otherwise this cell fails on a fresh kernel.
import plotly.express as px
import pandas as pd

show_top = 30  # number of keywords shown in the chart

# One row per keyword, ordered from the highest to the lowest count.
df = (
    pd.DataFrame.from_dict(keyword_counter, orient='index')
    .reset_index()
    .set_axis(['keyword', 'count'], axis=1)
    .sort_values(by='count', ascending=False)
)

chart_title = f"Popularity of Keywords<br /><sub>Showing top {show_top} keywords of {len(df)}</sub>"
fig = px.bar(df.head(show_top), x='keyword', y='count')
# Kept as the final expression: the cell displays the figure that update_layout returns.
fig.update_layout(title=chart_title, xaxis_title="Keyword", yaxis_title="Number of Papers")

### Let's see how the number of papers per keyword is distributed

In [None]:
# Violin plot and box plot of the keyword counts, side by side for comparison.
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import pandas as pd

fig = make_subplots(rows=1, cols=2)

keyword_counts = df['count']

# Left panel: violin with an embedded box and mean line.
violin_trace = go.Violin(
    y=keyword_counts,
    box_visible=True,
    line_color='blue',
    meanline_visible=True,
    fillcolor='lightseagreen',
    opacity=0.6,
    name='Violin',
)
# Right panel: box plot that also draws every individual point next to the box.
box_trace = go.Box(
    y=keyword_counts,
    boxpoints='all',
    jitter=0.3,
    pointpos=-1.8,
    name='Boxplot (Logarithmic Scale)',
)

fig.add_trace(violin_trace, row=1, col=1)
fig.add_trace(box_trace, row=1, col=2)

fig.update_layout(title="Distribution of Number of Papers per Keyword", yaxis_title="Number of Papers")

# Only the box plot (second column) gets a logarithmic y axis.
fig.update_yaxes(type="log", row=1, col=2)

fig.show()