In [47]:
import plotly.express as px
import plotly.io as pio
import pandas as pd

# Make "seaborn" the default plotly template for this notebook.
pio.templates.default = "seaborn"

In [48]:
# Load the publication metadata and the list of core publications
# (only the publication id, the survey it belongs to and its topic are needed).
meta_data = pd.read_excel("./data/metadata.xlsx")
core_pubs = pd.read_excel("./data/core_publications.xlsx")[["Pub_id", "Survey", "Topic"]]

# Attach the metadata of each core publication. pd.merge defaults to an inner
# join, so core publications without a metadata row are dropped here.
df = pd.merge(core_pubs, meta_data, left_on="Pub_id", right_on="id")

# Metadata rows of the surveys themselves (a survey is any id referenced in the "Survey" column).
surveys = meta_data[meta_data["id"].isin(df["Survey"].unique())][["id", "title", "year", "times_cited", "field_citation_ratio"]]

# Explicit copy: the "Topic" column is rewritten below, and writing into a bare
# column selection of `df` triggers pandas' SettingWithCopyWarning.
core_pubs = df[["Survey", "Pub_id", "Topic", "abstract", "times_cited", "field_citation_ratio", "year"]].copy()

# add to each topic a (year of the survey)
# Build an id -> year lookup once instead of filtering `surveys` for every row.
# drop_duplicates keeps the first row per id, matching the previous `.values[0]` choice.
year_by_survey = surveys.drop_duplicates("id").set_index("id")["year"]

# Fail with a readable message (instead of an IndexError from an empty lookup)
# when a survey id has no row in the metadata.
unknown_surveys = core_pubs.loc[~core_pubs["Survey"].isin(year_by_survey.index), "Survey"].unique()
if len(unknown_surveys) > 0:
    raise ValueError(f"No metadata row found for survey id(s): {list(unknown_surveys)}")

survey_years = core_pubs["Survey"].map(year_by_survey).to_numpy()
core_pubs["Topic"] = core_pubs["Topic"] + " (" + survey_years.astype(str) + ")"

In [39]:
# Display the survey rows selected from the metadata (id, title, year, times_cited, field_citation_ratio).
surveys

Unnamed: 0,id,title,year,times_cited,field_citation_ratio
0,pub.1175782890,Current progress of perovskite solar cells sta...,2024,0,
4,pub.1163752603,Greening warehouses through energy efficiency ...,2023,9,
5,pub.1156021846,Sustainable biofuel economy: A mapping through...,2023,56,
7,pub.1147958699,Drones in agriculture: A review and bibliometr...,2022,249,127.77
18,pub.1130187939,Crop yield prediction using machine learning: ...,2020,867,279.84
20,pub.1129627241,SYSTEMATIC LITERATURE REVIEW OF TOURISM GROWTH...,2020,52,18.77
24,pub.1127137176,Big impact of nanoparticles: analysis of the m...,2020,69,64.64
58,pub.1103814815,The application of internet of things in healt...,2018,215,47.94
306,pub.1159788042,A Bibliometric Analysis of the Top 100 Most In...,2023,5,
316,pub.1146747333,A Systematic Literature Review on Distributed ...,2022,41,19.68


In [49]:
# Count of core publications per topic; topic labels carry the survey year in parentheses.
fig = px.histogram(
    core_pubs,
    x="Topic",
    title="Number of publications per topic (survey year)",
    labels={"Topic": "Topic (survey year)"},
)

# Axis titles are set explicitly here; x categories are ordered "total descending".
fig.update_layout(
    xaxis={"title": {"text": "Topic"}, "categoryorder": "total descending"},
    yaxis={"title": {"text": "Number of publications"}},
)
fig.show()


In [50]:
# Box plot of times_cited for each topic, with every publication drawn as a point.
fig = px.box(
    core_pubs,
    x="Topic",
    y="times_cited",
    points="all",
    height=450,
    title="Citation distribution per topic",
)
fig.update_layout(
    xaxis={"title": {"text": "Topic"}},
    yaxis={"title": {"text": "Number of citations"}},
)
fig.show()

In [66]:
# Plot-only copy of core_pubs in which publication years earlier than 1980 are
# raised to 1980; core_pubs itself is left untouched.
years_plot_df = core_pubs.assign(year=core_pubs["year"].clip(lower=1980))

# Box plot of publication year for each topic, with every publication drawn as a point.
fig = px.box(
    years_plot_df,
    x="Topic",
    y="year",
    points="all",
    height=450,
    title="Year distribution per topic",
)
fig.update_layout(
    xaxis={"title": {"text": "Topic"}},
    yaxis={"title": {"text": "Year"}},
)
fig.show()