#### The goal of this notebook is to get a breif understanding of the data we are using and the differnece of quality between the Systematic Literature Review and Bibliometric Analysis topics.

In [1]:
import plotly.express as px
import plotly.io as pio
import pandas as pd

pio.templates.default = "seaborn"
PLOT_CONFIGS = dict(
    title_x=0.5, title_font_size=20, title_font_family="Modern Computer", font_family="Modern Computer",
    xaxis_title="", yaxis_title="", showlegend=True, legend_title="",
    xaxis_tickfont_size=12, yaxis_tickfont_size=12, legend_font_size=15,
    xaxis_title_font_size=17, yaxis_title_font_size=17, xaxis_tickangle=45,
    width=650, height=400
    # legend_orientation="h", legend_yanchor="bottom", legend_y=-0.3, legend_xanchor="center", legend_x=0.5
    )
SLR_TOPICS = ["Software Process Line", "Data Stream Processing Latency", "Business Process Meta Models", "Multicore Performance Prediction", "Cloud Migration", "Software Fault Prediction Metrics", "Software Defect Prediction"]

In [2]:
meta_data = pd.read_excel("./data/metadata.xlsx")
core_pubs = pd.read_excel("./data/core_publications.xlsx")[["Pub_id", "Survey", "Topic"]]

df = pd.merge(core_pubs, meta_data, left_on="Pub_id", right_on="id")
surveys = meta_data[meta_data["id"].isin(df["Survey"].unique())][["id", "title", "year", "times_cited", "field_citation_ratio"]]
core_pubs = df[["Survey", "Pub_id", "Topic", "abstract", "times_cited", "field_citation_ratio", "year"]]
# add column that marks as SLR or BA
# mark as SLR if the topic is in the SLR_TOPICS
core_pubs["Type"] = "Bibliometric Analysis"
core_pubs.loc[core_pubs["Topic"].isin(SLR_TOPICS), "Type"] = "Systematic Literature Review"
def try_get_year(survey_id):
    try:
        return surveys[surveys["id"] == survey_id]["year"].values[0]
    except:
        return "2015"
survey_years = core_pubs.apply(lambda row: try_get_year(row["Survey"]), axis=1).values
core_pubs.loc[:, "Topic"] = core_pubs["Topic"] + " (" + survey_years.astype(str) + ")"

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  core_pubs["Type"] = "Bibliometric Analysis"


In [3]:
# histogram for number of publications per survey

fig = px.histogram(core_pubs, x="Topic", title="Number of publications per topic", color="Type")
fig.update_layout(xaxis_categoryorder="total descending", **PLOT_CONFIGS)
fig.update_layout(width=1000, height=500)
fig.update_xaxes(title_text="Topic (survey year)")
fig.update_yaxes(title_text="Number of core publications")
fig.show()
# fig.write_image("./LitQEval-report/pics/dataset-overview.pdf")


In [None]:
# explore citation distribution per topic
fig = px.box(core_pubs, x="Topic", y="times_cited", title="CPs Citation Count",
              points="all", color="Type")
fig.update_layout(**PLOT_CONFIGS)
fig.update_traces(marker_size=5)
fig.update_layout(xaxis_title_text="Topic", yaxis_title_text="Number of citations", title_font_size=30,
                  legend_font_size=20, yaxis_tickfont_size=20, xaxis_tickfont_size=20, title_x=0.42,
                  xaxis_title_font_size=25, yaxis_title_font_size=25, width=1300, height=800)
fig.show()
# fig.write_image("./LitQEval-report/pics/citation-distribution.pdf")

In [21]:
years_plot_df = core_pubs.copy()
years_plot_df.loc[years_plot_df["year"] < 1980, "year"] = 1980
fig = px.box(years_plot_df, x="Topic", y="year",
             title="Publishing Date Distribution of the CPs", points="all", height=450,
                color="Type", hover_data=["Pub_id"])
markers = [2012, 2015, 2018, 2022, 2023, 2018, 2020, 2023, 2020, 2024,
           2023, 2018, 2022, 2020, 2019, 2019, 2021, 2017, 2014, 2013, 2015]
# add X markers at the corresponding y each to the next x
showlegend = True
for i, marker in enumerate(markers):
    if i == 1:
        showlegend = False
    fig.add_shape(type="line", x0=i - 0.5, x1=i, showlegend=showlegend,
                   y0=marker, y1=marker,
                     line=dict(color="red", width=1), name="Survey year")
fig.update_layout(**PLOT_CONFIGS)
fig.show()
fig.update_traces(marker_size=5)
fig.update_layout(xaxis_title_text="Topic", yaxis_title_text="Year", title_font_size=30,
                  legend_font_size=20, yaxis_tickfont_size=20, xaxis_tickfont_size=20, title_x=0.25,
                  xaxis_title_font_size=25, yaxis_title_font_size=25, width=1300, height=800)
# fig.write_image("./LitQEval-report/pics/year-distribution.pdf")