In [None]:
import numpy as np
import os
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

In [None]:
# Alternative settings, kept commented out: the same year/encoding pair that the
# "Year comparison" section uses for the previous-year export.
# year = '2020'
# encoding='ISO-8859-1'

In [None]:
# Year under analysis (kept as a string because it is interpolated into file
# names and chart titles) and the text encoding of that year's CSV export.
year, encoding = '2021', 'UTF-8'

In [None]:
# Quick look at the raw export before any cleaning is applied.
raw_preview_path = os.path.join('..', 'mundus', 'data', f'mundus_article_{year}.csv')
pd.read_csv(raw_preview_path, sep=';', encoding=encoding).head()

In [None]:
# Load the articles of the analysed year and normalise them for the rest of the notebook.
articles_path = os.path.join('..', 'mundus', 'data', f'mundus_article_{year}.csv')
df = pd.read_csv(articles_path, sep=';', encoding=encoding)
df['date'] = pd.to_datetime(df['date'])

# Most viewed articles first (rows with missing views end up last).
df = df.sort_values('views', ascending=False)

# Missing view counts become 0 *before* the blanket fill below; otherwise the
# 'Divers' placeholder would be written into the numeric 'views' column and
# break every later sum/plot. This mirrors what the multi-year loading cells do.
df['views'] = df['views'].fillna(0)

# Any other missing value (game, family, ...) is grouped under 'Divers'.
df = df.fillna('Divers')

# Helper column: one unit per article, so that sums give article counts.
df['count'] = 1
df.head()

In [None]:
# Total number of articles loaded for the year.
df.shape[0]

# Jeu

In [None]:
# Number of distinct values in the 'game' column (missing values, if any, count as one).
df['game'].nunique(dropna=False)

In [None]:
# Articles and views aggregated per game family.
# Only the two plotted columns are summed: summing every column would try to
# add up dates and text, which raises on recent pandas versions.
df2 = df.groupby('game_type')[['count', 'views']].sum()

# Two side-by-side donuts: number of articles (left) and number of views (right).
fig = make_subplots(rows=1, cols=2, specs=[[{'type': 'domain'}, {'type': 'domain'}]])
for col_idx, measure in enumerate(('count', 'views'), start=1):
    fig.add_trace(go.Pie(labels=df2.index, values=df2[measure], textinfo='value+percent'), 1, col_idx)

fig.update_traces(hole=.4, hoverinfo="label+percent+name")

# The annotations put a caption in the hole of each donut.
fig.update_layout(
    title_text=f'Statistiques {year} par famille de jeu', separators = ', .',
    annotations=[dict(text='Articles', x=0.18, y=0.5, font_size=20, showarrow=False),
                 dict(text='Vues', x=0.81, y=0.5, font_size=20, showarrow=False)])
fig.show()

In [None]:
# Minimum number of articles a game needs to get its own slice.
min_post_nb = 15

# Games reaching the threshold keep their name in 'game2'; all the others are
# merged into a single 'Autres jeux' bucket.
post_nb_by_game = df.groupby('game').count()
popular_games = post_nb_by_game.index[(post_nb_by_game >= min_post_nb).any(axis=1)]
df['game2'] = df['game'].where(df['game'].isin(popular_games), 'Autres jeux')

# Only the two plotted columns are summed: summing every column would try to
# add up dates and text, which raises on recent pandas versions.
df2 = df.groupby('game2')[['count', 'views']].sum()

# Two side-by-side donuts: number of articles (left) and number of views (right).
fig = make_subplots(rows=1, cols=2, specs=[[{'type': 'domain'}, {'type': 'domain'}]])
for col_idx, measure in enumerate(('count', 'views'), start=1):
    fig.add_trace(go.Pie(labels=df2.index, values=df2[measure], textinfo='value+percent'), 1, col_idx)

fig.update_traces(hole=.4, hoverinfo="label+percent+name")

# The annotations put a caption in the hole of each donut.
fig.update_layout(
    title_text=f'Statistiques {year} des jeux à {min_post_nb} articles ou plus', separators = ', .',
    annotations=[dict(text='Articles', x=0.18, y=0.5, font_size=20, showarrow=False),
                 dict(text='Vues', x=0.81, y=0.5, font_size=20, showarrow=False)])
fig.show()

# Views by game

In [None]:
# Distribution of article views per game, restricted to the 'Grande Stratégie' family.
grande_strategie_articles = df.loc[df['game_type'] == 'Grande Stratégie']
fig = px.box(
    grande_strategie_articles,
    x='game',
    y='views',
    title=f'Vues des articles {year} de Grande Stratégie',
)
fig.show()

In [None]:
# Distribution of article views per game, restricted to the 'Total War' family.
total_war_articles = df.loc[df['game_type'] == 'Total War']
fig = px.box(
    total_war_articles,
    x='game',
    y='views',
    title=f'Vues des articles {year} de Total War',
)
fig.show()

In [None]:
# Keep only the games with more than two articles, then show the view
# distribution per game for the 'Gestion' family.
article_by_game = df.groupby('game').count()
over_two_articles = (article_by_game > 2).any(axis=1)
games_kept = article_by_game.index[over_two_articles]
df_2_or_more_games = df[df['game'].isin(games_kept)]

gestion_articles = df_2_or_more_games.loc[df_2_or_more_games['game_type'] == 'Gestion']
fig = px.box(
    gestion_articles,
    x='game',
    y='views',
    title=f'Vues des articles {year} de Gestion (jeux avec au moins 3 articles)',
)
fig.show()

# Rédacteurs

In [None]:
# Number of distinct values in the 'author' column (missing values, if any, count as one).
df['author'].nunique(dropna=False)

In [None]:
# Articles and views aggregated per author.
# Only the two plotted columns are summed: summing every column would try to
# add up dates and text, which raises on recent pandas versions.
df3 = df.groupby('author')[['count', 'views']].sum()

# Two side-by-side donuts: number of articles (left) and number of views (right).
fig = make_subplots(rows=1, cols=2, specs=[[{'type': 'domain'}, {'type': 'domain'}]])
for col_idx, measure in enumerate(('count', 'views'), start=1):
    fig.add_trace(go.Pie(labels=df3.index, values=df3[measure], textinfo='value+percent'), 1, col_idx)

fig.update_traces(hole=.4, hoverinfo="label+percent+name")

# The annotations put a caption in the hole of each donut.
fig.update_layout(
    title_text=f'Statistiques {year} par rédacteur', separators = ', .',
    annotations=[dict(text='Articles', x=0.18, y=0.5, font_size=20, showarrow=False),
                 dict(text='Vues', x=0.81, y=0.5, font_size=20, showarrow=False)])
fig.show()

In [None]:
# Distribution of article views for each author.
fig = px.box(
    df,
    x='author',
    y='views',
    title=f'Vues des articles {year} par rédacteur',
)
fig.show()

# Jour

In [None]:
# Histogram of how many articles were published on each day of the year.
df['day_of_year'] = df['date'].dt.dayofyear
articles_per_day = df.groupby('day_of_year').count()['count']

# dayofyear is 1-based (1..365, or 1..366 on leap years), so the calendar must
# start at 1 and include the last day; a 0-based range would add a phantom
# "day 0" and silently drop 31 December.
n_days = pd.Timestamp(year=int(year), month=12, day=31).dayofyear
all_days = np.arange(1, n_days + 1)

# Column 0 holds the day number, 'count' the number of articles (0 when none).
df2 = pd.DataFrame(all_days, index=all_days).join(articles_per_day).fillna(0)

fig = px.histogram(df2, x='count')
fig.update_xaxes(title_text='Nombre d\'articles dans la journée')
# The second label belongs to the y axis (it used to overwrite the x-axis title).
fig.update_yaxes(title_text='Nombre de jours')
fig.update_layout(title=dict(text='Histogramme du nombre d\'articles par jours'))

In [None]:
# Articles published on the busiest day (first one in case of a tie).
# Column 0 of df2 carries the day-of-year number.
busiest_rows = df2[df2['count'] == df2['count'].max()]
busiest_day = busiest_rows[0].values[0]
df[df['day_of_year'] == busiest_day]

# Year comparison

In [None]:
# Reference year for the comparison (string, interpolated into file names and
# chart titles) and the text encoding of its CSV export.
prev_year, prev_encoding = '2020', 'ISO-8859-1'

In [None]:
# Load the articles of the previous year with the same normalisation as the current year.
prev_articles_path = os.path.join('..', 'mundus', 'data', f'mundus_article_{prev_year}.csv')
prev_df = pd.read_csv(prev_articles_path, sep=';', encoding=prev_encoding)
prev_df['date'] = pd.to_datetime(prev_df['date'])

# Most viewed articles first (rows with missing views end up last).
prev_df = prev_df.sort_values('views', ascending=False)

# Missing view counts become 0 *before* the blanket fill below; otherwise the
# 'Divers' placeholder would be written into the numeric 'views' column and
# break every later sum/plot. This mirrors what the multi-year loading cells do.
prev_df['views'] = prev_df['views'].fillna(0)

# Any other missing value (game, family, ...) is grouped under 'Divers'.
prev_df = prev_df.fillna('Divers')

# Helper column: one unit per article, so that sums give article counts.
prev_df['count'] = 1
prev_df.head()

In [None]:
# Views and article count per game for the previous year.
# The two measures are selected explicitly: summing every column would try to
# add up dates and text, which raises on recent pandas versions. The column
# order (views, count) is the one the comparison cells rely on.
prev_sum = prev_df.groupby('game')[['views', 'count']].sum()
prev_sum['year'] = prev_year
prev_sum.sort_values('views', ascending=False).head()

In [None]:
# Views and article count per game for the current year.
# The two measures are selected explicitly: summing every column would try to
# add up dates and text (raising on recent pandas versions) and would drag
# helper columns such as 'day_of_year' along, which previously had to be
# deleted by hand and made this cell depend on the "Jour" section having run.
current_sum = df.groupby('game')[['views', 'count']].sum()
current_sum['year'] = year
current_sum.sort_values('views', ascending=False).head(10)

In [None]:
# Stack both yearly summaries (previous year first) and turn the 'game' index
# back into a regular column so rows can be filtered by game name.
yearly_summaries = (prev_sum, current_sum)
all_sum = pd.concat(yearly_summaries).reset_index()
all_sum

In [None]:
# Year-over-year views for the 10 most viewed games of the current year, plus a grand total.
current_most_views_game = current_sum.sort_values('views', ascending=False).head(10).index.values

# One small frame per game, concatenated once at the end (instead of growing a
# frame from an empty DataFrame inside the loop).
frames = []
for game in current_most_views_game:
    game_stats = all_sum[all_sum['game'] == game]
    if len(game_stats) == 1:
        # The game only exists in the current year: put a zero row for the
        # previous year in front so both bars are drawn.
        # Tuples follow all_sum's column order -- presumably
        # (game, views, count, year); verify if the summaries change.
        missing_prev = pd.DataFrame(data=[(game, 0, 0, prev_year)], columns=all_sum.columns)
        game_stats = pd.concat((missing_prev, game_stats), axis=0, ignore_index=True)
    frames.append(game_stats)

# Add total
frames.append(pd.DataFrame(data=[('TOTAL', prev_df['views'].sum(), 0, prev_year),
                                 ('TOTAL', df['views'].sum(), 0, year)],
                           columns=all_sum.columns))
df_to_plot = pd.concat(frames, axis=0, ignore_index=True)

# Plot
fig = px.histogram(df_to_plot, x="game", y="views", color='year', barmode='group',
                   title=f'Evolution du nombre de vues par jeu (top 10 de {year} et total)')
fig.update_xaxes(title_text='Jeu ou Divers')
fig.update_yaxes(title_text='Nombre de vues')
fig.show()

In [None]:
# Year-over-year views for the 10 most viewed games of the previous year, plus a grand total.
# The ranking gets its own name so it no longer overwrites the current-year ranking.
prev_most_views_game = prev_sum.sort_values('views', ascending=False).head(10).index.values

# One small frame per game, concatenated once at the end (instead of growing a
# frame from an empty DataFrame inside the loop).
frames = []
for game in prev_most_views_game:
    game_stats = all_sum[all_sum['game'] == game]
    if len(game_stats) == 1:
        # The game only exists in the previous year: append a zero row for the
        # current year so both bars are drawn.
        # Tuples follow all_sum's column order -- presumably
        # (game, views, count, year); verify if the summaries change.
        missing_current = pd.DataFrame(data=[(game, 0, 0, year)], columns=all_sum.columns)
        game_stats = pd.concat((game_stats, missing_current), axis=0, ignore_index=True)
    frames.append(game_stats)

# Add total
frames.append(pd.DataFrame(data=[('TOTAL', prev_df['views'].sum(), 0, prev_year),
                                 ('TOTAL', df['views'].sum(), 0, year)],
                           columns=all_sum.columns))
df_to_plot = pd.concat(frames, axis=0, ignore_index=True)

# Plot
fig = px.histogram(df_to_plot, x="game", y="views", color='year', barmode='group',
                   title=f'Evolution du nombre de vues par jeu (top 10 de {prev_year} et total)')
fig.update_xaxes(title_text='Jeu ou Divers')
fig.update_yaxes(title_text='Nombre de vues')
fig.show()

In [None]:
# Year-over-year article counts for the 10 most covered games of the current year, plus a grand total.
current_most_posts_game = current_sum.sort_values('count', ascending=False).head(10).index.values

# One small frame per game, concatenated once at the end (instead of growing a
# frame from an empty DataFrame inside the loop).
frames = []
for game in current_most_posts_game:
    game_stats = all_sum[all_sum['game'] == game]
    if len(game_stats) == 1:
        # The game only exists in the current year: put a zero row for the
        # previous year in front so both bars are drawn.
        # Tuples follow all_sum's column order -- presumably
        # (game, views, count, year); verify if the summaries change.
        missing_prev = pd.DataFrame(data=[(game, 0, 0, prev_year)], columns=all_sum.columns)
        game_stats = pd.concat((missing_prev, game_stats), axis=0, ignore_index=True)
    frames.append(game_stats)

# Add total
frames.append(pd.DataFrame(data=[('TOTAL', 0, prev_df['count'].sum(), prev_year),
                                 ('TOTAL', 0, df['count'].sum(), year)],
                           columns=all_sum.columns))
df_to_plot = pd.concat(frames, axis=0, ignore_index=True)

# Plot
fig = px.histogram(df_to_plot, x="game", y="count", color='year', barmode='group',
                   title=f'Evolution du nombre d\'articles par jeu (top 10 de {year} et total)')
fig.update_xaxes(title_text='Jeu ou Divers')
fig.update_yaxes(title_text='Nombre d\'articles')
fig.show()

In [None]:
# Year-over-year article counts for the 10 most covered games of the previous year, plus a grand total.
# The ranking gets its own name so it no longer overwrites the current-year ranking.
prev_most_posts_game = prev_sum.sort_values('count', ascending=False).head(10).index.values

# One small frame per game, concatenated once at the end (instead of growing a
# frame from an empty DataFrame inside the loop).
frames = []
for game in prev_most_posts_game:
    game_stats = all_sum[all_sum['game'] == game]
    if len(game_stats) == 1:
        # The game only exists in the previous year: append a zero row for the
        # current year so both bars are drawn.
        # Tuples follow all_sum's column order -- presumably
        # (game, views, count, year); verify if the summaries change.
        missing_current = pd.DataFrame(data=[(game, 0, 0, year)], columns=all_sum.columns)
        game_stats = pd.concat((game_stats, missing_current), axis=0, ignore_index=True)
    frames.append(game_stats)

# Add total
frames.append(pd.DataFrame(data=[('TOTAL', 0, prev_df['count'].sum(), prev_year),
                                 ('TOTAL', 0, df['count'].sum(), year)],
                           columns=all_sum.columns))
df_to_plot = pd.concat(frames, axis=0, ignore_index=True)

# Plot
fig = px.histogram(df_to_plot, x="game", y="count", color='year', barmode='group',
                   title=f'Evolution du nombre d\'articles par jeu (top 10 de {prev_year} et total)')
fig.update_xaxes(title_text='Jeu ou Divers')
fig.update_yaxes(title_text='Nombre d\'articles')
fig.show()

In [None]:
# Games present in both years, with their views/count side by side (columns
# are suffixed with the year) and the change in views between the two years.
measures = ['views', 'count']
current_measures = current_sum[measures]
prev_measures = prev_sum[measures]
merge_sum = current_measures.join(prev_measures, how='inner', lsuffix=year, rsuffix=prev_year)

views_now = merge_sum[f'views{year}']
views_before = merge_sum[f'views{prev_year}']
merge_sum['delta_views'] = views_now - views_before

# Ten largest gains in views.
merge_sum.sort_values('delta_views', ascending=False).head(10)

# 2017- X

In [None]:
# Cumulative export (2017 up to the previous year).
# NOTE(review): unlike the other reads, no sep=';' is passed here -- confirm
# this file really is comma-separated.
prev_all_path = os.path.join('..', 'mundus', 'data', f'mundus_article_2017_{prev_year}.csv')
prev_all_df = pd.read_csv(prev_all_path, encoding='ISO-8859-1')
prev_all_df['date'] = pd.to_datetime(prev_all_df['date'])

# Most viewed articles first (rows with missing views end up last).
prev_all_df = prev_all_df.sort_values('views', ascending=False)

# Assign the filled column back: an in-place fillna on a column selection
# operates on a temporary and is not guaranteed to modify the frame
# (deprecated in pandas 2.x, a no-op under copy-on-write).
prev_all_df['views'] = prev_all_df['views'].fillna(0)

# Any other missing value is grouped under 'Divers'.
prev_all_df = prev_all_df.fillna('Divers')
prev_all_df.head()[['author', 'date', 'title', 'views']]

In [None]:
# Cumulative export (2017 up to the current year).
all_path = os.path.join('..', 'mundus', 'data', f'mundus_article_2017_{year}.csv')
all_df = pd.read_csv(all_path, encoding=encoding, sep=';')
all_df['date'] = pd.to_datetime(all_df['date'])

# Most viewed articles first (rows with missing views end up last).
all_df = all_df.sort_values('views', ascending=False)

# Assign the filled column back: an in-place fillna on a column selection
# operates on a temporary and is not guaranteed to modify the frame
# (deprecated in pandas 2.x, a no-op under copy-on-write).
all_df['views'] = all_df['views'].fillna(0)

# Any other missing value is grouped under 'Divers'.
all_df = all_df.fillna('Divers')
all_df.head()[['author', 'date', 'title', 'views']]

In [None]:
# Match every article across the two cumulative exports (same author, date and
# title) and measure how many views it gained between them.
# After the merge, 'views_x' comes from all_df and 'views_y' from prev_all_df.
article_key = ['author', 'date', 'title']
tracked_columns = article_key + ['views']
all_merge = pd.merge(all_df[tracked_columns], prev_all_df[tracked_columns], on=article_key)
all_merge['delta'] = all_merge['views_x'] - all_merge['views_y']
all_merge.head()

In [None]:
# Ten articles with the largest gain in views between the two exports.
ranked_by_gain = all_merge.sort_values(by='delta', ascending=False)
ranked_by_gain.iloc[:10]