# Visualize results

In [1]:
import pandas as pd
import plotly.express as px

df = pd.read_json('aggregated_results_simplified.json')

In [2]:
df.head

<bound method NDFrame.head of     source     gender  count
0    20min   weiblich   1882
1    20min   männlich   4580
2    20min  unbekannt    978
3   Watson   weiblich   3745
4   Watson   männlich  13979
5   Watson  unbekannt   2655
6    Blick   weiblich  15805
7    Blick   männlich  46981
8    Blick  unbekannt  10499
9      SRF   weiblich   6888
10     SRF   männlich  21951
11     SRF  unbekannt   3500>

In [9]:
# color_discrete_sequence = ['navy','darkorange', 'indianred', 'teal']

# Prism (blue, green)
# ,color_discrete_sequence=px.colors.qualitative.Prism
# https://plotly.com/python/discrete-color/

#, color="source"


fig = px.histogram(df, y="source", x="count", text_auto=True, color_discrete_sequence=['#4b647d'])
fig.update_layout(
        title='Anzahl gefundene Zitate pro Nachrichtenportal',
        title_x=0.5,
        yaxis_title_text='Nachrichtenportal',
        xaxis_title_text='Anzahl Zitate',
        yaxis={'categoryorder':'total ascending'},
        showlegend=False,
        font=dict(
                size=22
        )
)

config = {
  'toImageButtonOptions': {
    'format': 'png', # one of png, svg, jpeg, webp
    'filename': 'custom_image',
    'height': 600,
    'width': 1400,
    'scale':2
   # Multiply title/legend/axis/canvas sizes by this factor
  }
}

fig.show(config=config)

In [23]:
# https://plotly.com/python/discrete-color/

# setting up collor plate
my_own_color_discrete_sequence=[
                 px.colors.qualitative.Bold[3],
                 px.colors.qualitative.Bold[2],
               px.colors.qualitative.Bold[0]]

# blau, gelb, violett
my_colors_2 = ["#41748c", "#fbc707", "#7d5bbe"] #v1
my_colors = ["#ffcb3e", "#4b647d", "#7b73a6"] #v2 # gelb, blau, violett

# v2 violett
# #a463bf oder #8e6bba
# #7b73a6 gut weil gräulich und geht ja um male, female und nicht undefined
# #a63f97

# v1 violett
# "#7f3c8d" Violett aus px.colors.qualitative.Bold
# helles grau/violett #b19cd9
# kräfties violett #6c2dc7
# 

# color_discrete_sequence=px.colors.qualitative.Bold

In [5]:
# Sortieren nach prozentualen Anteilen
sorted_df = df.sort_values(['source', 'count'], ascending=[True, False])

fig = px.histogram(sorted_df, x="source", y="count",
             color='gender', barmode='group', text_auto=True
             ,color_discrete_sequence=my_colors,
labels={'gender': 'Geschlecht'})
fig.update_layout(
        title='Zitate gruppiert nach Geschlecht und Nachrichtenportal',
        title_x=0.5,
        yaxis_title_text='Anzahl Zitate',
        xaxis_title_text='Nachrichtenportal',
        font=dict(
                size=17
        )
)
fig.show()

In [6]:
fig = px.histogram(sorted_df, x="source", y="count", color="gender", text_auto=True
,color_discrete_sequence=my_colors,
labels={'gender': 'Geschlecht'})
fig.update_layout(
        title='Kummulierte Anzahl gefundener Zitate pro Nachrichtenportal',
        title_x=0.5,
        yaxis_title_text='Summe von Zitaten',
        xaxis_title_text='Nachrichtenportal',
        bargap=0.3,
        font=dict(
                size=17
        )
)
fig.show()

In [7]:
# Prozentuall und nicht absolute Zahlen stacked

In [8]:
df_mean = pd.read_json('aggregated_results_simplified_with_mean.json')

# Summen pro Newsportal berechnen
summen = df_mean.groupby(['source', 'gender'])['count'].sum().reset_index()

# Prozentuale Anteile berechnen
summe = summen.groupby('source')['count'].transform('sum')
summen['prozent'] = summen['count'] / summe * 100

# Sortieren nach prozentualen Anteilen
summen = summen.sort_values(['source', 'prozent'], ascending=[True, False])

# Histogramm mit prozentualen Balken erstellen
fig = px.histogram(summen, x="source", y="prozent", color="gender", 
                   color_discrete_sequence=my_colors, 
                   title='Prozentuale Anzahl Zitate pro Nachrichtenportal',
                   labels={'prozent': 'Prozent', 'source': 'Nachrichtenportal', 'gender': 'Geschlecht'},
                   histfunc='sum')

# Prozentsätze in den Balken anzeigen
fig.update_traces(texttemplate='%{y:.1f}%', textposition='inside') # In Prozent anzeigen


# fig.update_yaxes(showticklabels=False, title=None, showgrid=False) # y achsen bezeichnung, Grid und zalen wegmachen
fig.update_layout(bargap=0.3, font=dict(size=22),
                  title={'text': 'Prozentuale Anzahl Zitate pro Nachrichtenportal', 'x': 0.5},
                  yaxis={'showticklabels': True, 'title': 'Prozentuale Anzahl der Zitate'},
                  legend_traceorder="reversed")

config = {
  'toImageButtonOptions': {
    'format': 'png', # one of png, svg, jpeg, webp
    'filename': 'custom_image',
    'height': 600,
    'width': 1400,
    'scale':2
   # Multiply title/legend/axis/canvas sizes by this factor
  }
}

fig.show(config=config)

In [9]:
data = [
    {"source": "", "gender": "weiblich", "count": 28320},
    {"source": "", "gender": "männlich", "count": 87491},
    {"source": "", "gender": "unbekannt", "count": 17632}
]

# DataFrame erstellen
df_just_mean = pd.DataFrame(data)

# Summen pro Newsportal berechnen
summen = df_just_mean.groupby(['source', 'gender'])['count'].sum().reset_index()

# Prozentuale Anteile berechnen
summe = summen.groupby('source')['count'].transform('sum')
summen['prozent'] = summen['count'] / summe * 100

# Sortieren nach prozentualen Anteilen
summen = summen.sort_values(['source', 'prozent'], ascending=[True, False])

# Histogramm mit prozentualen Balken erstellen
fig = px.histogram(summen, x="source", y="prozent", color="gender", 
                   color_discrete_sequence=my_colors, 
                   title='Anzahl Zitate prozentual pro Nachrichtenportal',
                   labels={'prozent': 'Prozent', 'source': '', 'gender': 'Geschlecht'},
                   histfunc='sum')

# Prozentsätze in den Balken anzeigen
fig.update_traces(texttemplate='%{y:.1f}%', textposition='inside') # In Prozent anzeigen

# fig.update_yaxes(showticklabels=False, title=None, showgrid=False) # y achsen bezeichnung, Grid und zalen wegmachen
fig.update_layout(bargap=0.3, font=dict(size=17),
                  title={'text': 'Anzahl Zitate prozentual pro Geschlecht', 'x': 0.5},
                  yaxis={'showticklabels': True, 'title': 'Prozetuale Anzahl aller Zitate'},
                  legend=dict(orientation="h", x=-0.05, traceorder="reversed"))

fig.show()

In [10]:
config = {
  'toImageButtonOptions': {
    'format': 'png', # one of png, svg, jpeg, webp
    'filename': 'custom_image',
    'height': 500,
    'width': 600,
    'scale':6 # Multiply title/legend/axis/canvas sizes by this factor
  }
}


fig.show(config=config)

# Gender Gap Diagram

In [10]:
data = [
    {"source": "20min", "count": 41.77},
    {"source": "Watson", "count": 57.70},
    {"source": "Blick", "count": 49.59},
    {"source": "SRF", "count": 52.24},
    {"source": "Mittelwert", "count": 50.34}
]

# DataFrame erstellen
df_gender_gap = pd.DataFrame(data)

fig = px.histogram(df_gender_gap, y="source", x="count", text_auto=True, color_discrete_sequence=['#4b647d'])
fig.update_layout(
        title='Gender Gap pro Nachrichtenportal',
        title_x=0.5,
        yaxis_title_text='Nachrichtenportal',
        xaxis_title_text='Gender Gap',
        yaxis={'categoryorder':'total ascending'},
        showlegend=False,
        font=dict(
                size=22
        )
)
fig.show(config=config)

In [12]:
data = [
    {"source": "20min", "count": 41.77},
    {"source": "Watson", "count": 57.70},
    {"source": "Blick", "count": 49.59},
    {"source": "SRF", "count": 52.24},
    {"source": "Mittelwert", "count": 50.34}
]

# DataFrame erstellen
df_gender_gap = pd.DataFrame(data)

df = pd.DataFrame(dict(product=['A', 'B', 'C'],
                       value=[2000, 1400, 2300]))

fig = px.bar(df_gender_gap, y='source', x='count', text='count', orientation='h', color_discrete_sequence=['#4b647d'])
fig.update_layout(
        title='Gender Gap pro Nachrichtenportal',
        title_x=0.5,
        yaxis_title_text='Nachrichtenportal',
        xaxis_title_text='Gender Gap',
        yaxis={'categoryorder':'total ascending'},
        showlegend=False,
        font=dict(
                size=22
        )
)
fig.update_traces(text=[f'{val}\u0025' for val in df_gender_gap['count']])

fig.show(config=config)


elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison



# Anzahl Artikel / Anzahl gefundene Zitate --> pro Portal mit Balkendiagramm darstellen

In [13]:
# Anzahl Artikel / Anzahl gefundene Zitate --> pro Portal mit Balkendiagramm darstellen

In [14]:
# citations/articles

df_2 = pd.read_json('aggregated_articles_citations.json')

In [15]:
fig = px.histogram(df_2, y="source", x="citations_per_article", text_auto=True, color_discrete_sequence=['#4b647d'])
fig.update_layout(
        title='Anzahl Zitate pro Artikel pro Nachrichtenportal',
        title_x=0.5,
        yaxis_title_text='Nachrichtenportal',
        xaxis_title_text='Anzahl Zitate pro Artikel',
        yaxis={'categoryorder':'total ascending'},
        showlegend=False,
        font=dict(
                size=22
        )
)
fig.show(config=config)

# Anzahl Clean Articles

In [17]:
df_3 = pd.read_json('sum_clean_articles_count.json')

In [18]:
fig = px.histogram(df_3, y="source", x="count", text_auto=True, color_discrete_sequence=['#4b647d'])
fig.update_layout(
        title='Anzahl gefundene Artikel pro Nachrichtenportal ohne Duplikate',
        title_x=0.5,
        yaxis_title_text='Nachrichtenportal',
        xaxis_title_text='Anzahl Artikel',
        yaxis={'categoryorder':'total ascending'},
        showlegend=False,
        font=dict(
                size=22
        )
)
fig.show(config=config)

# Artikel unbereinigt vs bereinigt

In [19]:
df_clean = pd.read_json('sum_clean_articles_count.json')
df_dirty = pd.read_json('sum_dirty_articles_count.json')
df_clean

Unnamed: 0,source,count
0,20min,20487
1,Watson,62381
2,Blick,194881
3,SRF,73272


In [20]:
df_merged = pd.merge(df_clean,df_dirty,on='source', how='inner')

In [21]:
df_merged.rename(columns = {"count_x": "cleaned", "count_y": "dirty"}, inplace=True)
df_merged

Unnamed: 0,source,cleaned,dirty
0,20min,20487,20838
1,Watson,62381,77276
2,Blick,194881,197311
3,SRF,73272,76228


In [30]:
import plotly.graph_objects as go

# Extract the group labels and values from the DataFrame
group_labels = df_merged['source'].tolist()
values_2 = df_merged['cleaned'].tolist()
values_1 = df_merged['dirty'].tolist()

# Create the bar chart traces
trace1 = go.Bar(
    x=group_labels,
    y=values_1,
    name='Mit Duplikaten',
    text=values_1,
    textposition='auto',
    marker=dict(color=my_colors[1])
)
trace2 = go.Bar(
    x=group_labels,
    y=values_2,
    name='Ohne Duplikate',
    text=values_2,
    textposition='auto',
    marker=dict(color=my_colors[0])
)

# Define the layout for the chart
layout = go.Layout(
    title='Anzahl Artikel mit und ohne Duplikate gruppiert nach Nachrichtenportal',
    title_x=0.5,
    xaxis_title_text='Nachrichtenportal',
    yaxis_title_text='Anzahl Artikel',
    yaxis={'categoryorder':'total ascending'},
    font_size=22,
    barmode='group',
    legend=dict(orientation="h", x=0.3, y = -0.2)
)

# Combine the traces and layout into a figure and display it
data = [trace1, trace2]
fig = go.Figure(data=data, layout=layout)

config2 = {
  'toImageButtonOptions': {
    'format': 'png', # one of png, svg, jpeg, webp
    'filename': 'custom_image',
    'height': 800,
    'width': 1400,
    'scale':2
   # Multiply title/legend/axis/canvas sizes by this factor
  }
}

fig.show(config=config2)

# Anzahl eliminierte Duplikate

In [31]:
df_merged['Unterschied'] = df_merged['dirty'] - df_merged['cleaned']
# DataFrame erstellen
df_gender_gap = pd.DataFrame(data)

fig = px.histogram(df_merged, y="source", x="Unterschied", text_auto=True, color_discrete_sequence=['#4b647d'])
fig.update_layout(
        title='Anzahl eliminierter Duplikate pro Nachrichtenportal',
        title_x=0.5,
        yaxis_title_text='Nachrichtenportal',
        xaxis_title_text='Anzahl eliminierter Duplikate',
        yaxis={'categoryorder':'total descending'},
        showlegend=False,
        font=dict(
                size=22
        )
)
fig.show(config=config)