In [None]:
import plotly.express as px
import plotly.io as pio
import pandas as pd

pio.templates.default = "seaborn"
PLOT_CONFIGS = dict(
    title_x=0.5, title_font_size=35, title_font_family="Modern Computer", font_family="Modern Computer",
    xaxis_title="", yaxis_title="", showlegend=True, legend_title="",
    xaxis_tickfont_size=15, yaxis_tickfont_size=15, legend_font_size=15,
    xaxis_title_font_size=20, yaxis_title_font_size=20,
    # legend_orientation="h", legend_yanchor="bottom", legend_y=-0.3, legend_xanchor="center", legend_x=0.5
    )
SLR_TOPICS = ["Software Process Line", "Data Stream Processing Latency", "Business Process Meta Models", "Multicore Performance Prediction", "Cloud Migration", "Software Fault Prediction Metrics", "Software Defect Prediction"]

In [84]:
meta_data = pd.read_excel("./data/metadata.xlsx")
core_pubs = pd.read_excel("./data/core_publications.xlsx")[["Pub_id", "Survey", "Topic"]]

df = pd.merge(core_pubs, meta_data, left_on="Pub_id", right_on="id")
surveys = meta_data[meta_data["id"].isin(df["Survey"].unique())][["id", "title", "year", "times_cited", "field_citation_ratio"]]
core_pubs = df[["Survey", "Pub_id", "Topic", "abstract", "times_cited", "field_citation_ratio", "year"]]
# add column that marks as SLR or BA
# mark as SLR if the topic is in the SLR_TOPICS
core_pubs["Type"] = "Bibliometric Analysis"
core_pubs.loc[core_pubs["Topic"].isin(SLR_TOPICS), "Type"] = "Systematic Literature Review"
def try_get_year(survey_id):
    try:
        return surveys[surveys["id"] == survey_id]["year"].values[0]
    except:
        return "2015"
survey_years = core_pubs.apply(lambda row: try_get_year(row["Survey"]), axis=1).values
core_pubs.loc[:, "Topic"] = core_pubs["Topic"] + " (" + survey_years.astype(str) + ")"



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [85]:
# histogram for number of publications per survey

fig = px.histogram(core_pubs, x="Topic", title="Number of publications per topic (survey year)",
                    labels={"Topic": "Topic (survey year)"}, color="Type")
fig.update_layout(xaxis_categoryorder="total descending", **PLOT_CONFIGS)
fig.update_xaxes(title_text="Topic")
fig.update_yaxes(title_text="Number of publications")
fig.show()


In [86]:
# explore citation distribution per topic
fig = px.box(core_pubs, x="Topic", y="times_cited", title="Citation distribution per topic",
              points="all", height=450, color="Type")
fig.update_layout(**PLOT_CONFIGS)
fig.update_xaxes(title_text="Topic")
fig.update_yaxes(title_text="Number of citations")
fig.show()

In [87]:
years_plot_df = core_pubs.copy()
years_plot_df.loc[years_plot_df["year"] < 1980, "year"] = 1980
fig = px.box(years_plot_df, x="Topic", y="year",
             title="Year distribution per topic", points="all", height=450,
                color="Type")
markers = [2012, 2015, 2018, 2022, 2023, 2018, 2020, 2023, 2020, 2024,
           2010, 2023, 2018, 2022, 2020, 2019, 2019, 2021, 2017, 2014, 2013, 2015]
# add X markers at the corresponding y each to the next x
showlegend = True
for i, marker in enumerate(markers):
    if i == 1:
        showlegend = False
    fig.add_shape(type="line", x0=i - 0.5, x1=i, showlegend=showlegend,
                   y0=marker, y1=marker,
                     line=dict(color="red", width=2), name="Survey year")
fig.update_layout(**PLOT_CONFIGS)
fig.update_layout(legend_orientation=None, legend_yanchor=None,
                   legend_y=None, legend_xanchor=None, legend_x=None,
                   legend_font_size=15)
fig.update_xaxes(title_text="Topic")
fig.update_yaxes(title_text="Year")
fig.show()