In [None]:
import h5py
import pandas as pd
import matplotlib.patches as mpatches
import matplotlib.pyplot as plt

from datetime import datetime
from pandas.tseries.offsets import MonthEnd

# Données

In [None]:
# Load the full quotes table from the local HDF5 store.
# Later cells of this notebook read the columns 'date', 'titre', 'alignement_politique',
# 'journal', 'quotes', 'name_surname', 'list_authors' and 'cue_lemma'.
# NOTE(review): 'date' is presumably already a datetime64 column (it is used with
# pd.Grouper and the .dt accessor further down) — confirm.
quotes = pd.read_hdf("data/quotes.h5")

In [None]:
# Subset of rows whose 'alignement_politique' equals 'extremegauche_gaucheradicale'.
# An explicit copy is taken because later cells add columns to this frame.
# NOTE(review): figure titles abbreviate this subset as "ED" — confirm that this
# alignment value is the intended one.
_is_ed_alignment = quotes['alignement_politique'].eq('extremegauche_gaucheradicale')
ed_quotes = quotes.loc[_is_ed_alignment].copy()

In [None]:
def _election_months(dates):
    """Return the distinct calendar months (monthly PeriodIndex) containing `dates`.

    `dates` is a list of day-first date strings ("dd/mm/yyyy"). Two rounds held in
    the same month collapse into a single period.
    """
    return pd.to_datetime(dates, dayfirst=True).to_period('M').drop_duplicates()


# Presidential elections (first and second round).
# Fix: the 2017 first round took place on 23/04/2017 (it was listed as 21/04/2017).
presi_dates = [
    "21/04/2002", "5/05/2002",
    "22/04/2007", "6/05/2007",
    "22/04/2012", "6/05/2012",
    "23/04/2017", "7/05/2017",
    "10/04/2022", "24/04/2022"]
presi_months = _election_months(presi_dates)

# Legislative elections.
legi_dates = [
    "09/06/2002", "16/06/2002",
    "10/06/2007", "17/06/2007",
    "10/06/2012", "17/06/2012",
    "11/06/2017", "18/06/2017",
    "12/06/2022", "19/06/2022"]
legi_months = _election_months(legi_dates)

# Regional elections.
regio_dates = [
    "15/03/1998",
    "21/03/2004", "28/03/2004",
    "14/03/2010", "21/03/2010",
    "06/12/2015", "13/12/2015",
    "20/06/2021", "27/06/2021"]
regio_months = _election_months(regio_dates)

# Cantonal elections.
canto_dates = [
    "15/03/1998", "22/03/1998",
    "11/03/2001", "18/03/2001",
    "21/03/2004", "28/03/2004",
    "9/03/2008", "16/03/2008",
    "20/03/2011", "27/03/2011"]
canto_months = _election_months(canto_dates)

# Departmental elections.
dept_dates = [
    "22/03/2015", "29/03/2015",
    "20/06/2021", "27/06/2021"]
dept_months = _election_months(dept_dates)

# Municipal elections.
muni_dates = [
    "11/03/2001", "18/03/2001",
    "09/03/2008", "16/03/2008",
    "23/03/2014", "30/03/2014",
    "15/03/2020", "28/06/2020"]
muni_months = _election_months(muni_dates)

# European elections.
europ_dates = [
    "13/06/1999",
    "13/06/2004",
    "7/06/2009",
    "25/05/2014",
    "26/05/2019"]
europ_months = _election_months(europ_dates)

# Nombre et proportion d'articles citant l'ED par mois

## ED seule + élections

In [None]:
# Number of distinct article titles per month-end bin, for the ED subset and for the whole corpus.
ed_monthly = ed_quotes.groupby(pd.Grouper(key='date', freq='ME'))['titre'].nunique().reset_index()
ed_monthly.columns = ['Mois', 'Articles']
all_monthly = quotes.groupby(pd.Grouper(key='date', freq='ME'))['titre'].nunique().reset_index()
all_monthly.columns = ['Mois', 'Articles']

# Left merge keeps every month of the whole corpus; months absent from the ED series become 0.
# After the merge, 'Articles_x' is the whole-corpus count and 'Articles_y' the ED count
# (default pandas suffixes for the clashing 'Articles' column).
plot_data = pd.merge(all_monthly, ed_monthly, on='Mois', how='left').fillna(0)
plot_data['prop_ed'] = plot_data['Articles_y'] / plot_data['Articles_x']

fig, ax1 = plt.subplots(figsize=(16, 6))

# Left axis: absolute counts.
ax1.plot(plot_data['Mois'], plot_data['Articles_y'], color="steelblue")
ax1.set_ylabel("Nombre d'articles citant l'ED", color="steelblue")
ax1.tick_params(axis='y', labelcolor="steelblue")

# Right axis: share of the month's articles.
ax2 = ax1.twinx()
ax2.plot(plot_data['Mois'], plot_data['prop_ed'], color="deepskyblue")
ax2.set_ylabel("Proportion d'articles citant l'ED", color="deepskyblue")
ax2.tick_params(axis='y', labelcolor="deepskyblue")
ax2.set_ylim(0, 1.1)

# Election months, shaded one layer per election type: (legend label, months, colour),
# listed in drawing order. This replaces seven copy-pasted loops.
# NOTE(review): 'Cantonales' and 'Départementales' share the colour 'coral' — confirm this is intended.
election_layers = [
    ('Présidentielles', presi_months, 'limegreen'),
    ('Législatives', legi_months, 'salmon'),
    ('Régionales', regio_months, 'turquoise'),
    ('Cantonales', canto_months, 'coral'),
    ('Départementales', dept_months, 'coral'),
    ('Municipales', muni_months, 'cornflowerblue'),
    ('Européennes', europ_months, 'orchid'),
]
election_patches = {}
for label, months, color in election_layers:
    for period in months:
        start = period.to_timestamp()   # first day of the month
        end = start + MonthEnd(1)       # last day of the same month
        ax2.axvspan(start, end, color=color, alpha=0.3)
    # Fix: every legend patch now uses the same alpha (the 'Présidentielles' patch
    # used 0.3 while the six others used 0.6).
    election_patches[label] = mpatches.Patch(color=color, alpha=0.6, label=label)

# The legend keeps its original order, which differs from the drawing order.
legend_order = [
    'Législatives', 'Présidentielles', 'Régionales', 'Cantonales',
    'Départementales', 'Municipales', 'Européennes']
ax2.legend(handles=[election_patches[label] for label in legend_order])

ax2.set_title("Nombre et proportion d'articles citant l'ED par mois")
fig.tight_layout()
plt.show()

## ED en comparaison des autres orientations politiques

In [None]:
# Values of 'alignement_politique' compared on this figure.
political_alignments = [
    'extremegauche_gaucheradicale',
    'independent',
    'regionaliste',
    'extremedroite_droiteradicale']
plt.figure(figsize=(16, 6))

# One curve per alignment: number of distinct article titles per month-end bin.
for alignment in political_alignments:
    alignment_rows = quotes.loc[quotes['alignement_politique'] == alignment]
    articles_per_month = (
        alignment_rows
        .groupby(pd.Grouper(key='date', freq='ME'))['titre']
        .nunique()
    )
    plt.plot(articles_per_month.index, articles_per_month.to_numpy(), label=f"{alignment}")

plt.title("Nombre d'articles par mois pour différentes orientations politiques")
plt.xlabel("Mois")
plt.ylabel("Nombre d'articles")
plt.legend()
plt.tight_layout()
plt.show()

In [None]:
# Values of 'alignement_politique' compared on this figure.
political_alignments = [
    'centredroite_droite',
    'centregauche_gauche',
    'centre']
plt.figure(figsize=(16, 6))

# One curve per alignment: number of distinct article titles per month-end bin.
for alignment in political_alignments:
    alignment_rows = quotes.loc[quotes['alignement_politique'] == alignment]
    articles_per_month = (
        alignment_rows
        .groupby(pd.Grouper(key='date', freq='ME'))['titre']
        .nunique()
    )
    plt.plot(articles_per_month.index, articles_per_month.to_numpy(), label=f"{alignment}")

plt.title("Nombre d'articles par mois pour différentes orientations politiques")
plt.xlabel("Mois")
plt.ylabel("Nombre d'articles")
plt.legend()
plt.tight_layout()
plt.show()

## ED en fonction des journaux

In [None]:
# Every newspaper present in the whole corpus.
journals = quotes['journal'].unique()
plt.figure(figsize=(16, 6))

for journal in journals:
    # Fix: the figure is titled "articles citant des personnalités d'ED", so the counts
    # must come from the ED subset; they were previously computed on the whole corpus.
    subset_quotes = ed_quotes[ed_quotes['journal'] == journal]
    if subset_quotes.empty:
        # This newspaper has no row in the ED subset: nothing to draw, no legend entry.
        continue
    # Number of distinct article titles per month-end bin for this newspaper.
    monthly_counts = subset_quotes.groupby(pd.Grouper(key='date', freq='ME'))['titre'].nunique().reset_index()
    monthly_counts.columns = ['Mois', 'Articles']
    plt.plot(monthly_counts['Mois'], monthly_counts['Articles'], label=f"{journal}")

plt.legend()
plt.title("Nombre mensuel d'articles citant des personnalités d'ED pour différents journaux")
plt.xlabel("")
plt.ylabel("")
plt.tight_layout()
plt.show()

# Nombre de citations par article

In [None]:
# Average number of quotes per article, per month-end bin.
# Fix: the previous version plotted `titre.nunique()`, i.e. the number of articles,
# not the number of quotes per article announced by the title.
# NOTE(review): assumes each row of `ed_quotes` is one quote and that 'titre'
# identifies an article — confirm.
_titles_by_month = ed_quotes.groupby(pd.Grouper(key='date', freq='ME'))['titre']
quotes_per_article_monthly = (
    _titles_by_month.count() / _titles_by_month.nunique()   # quote rows / distinct articles
).reset_index()
quotes_per_article_monthly.columns = ['Mois', 'Nb cit']
# Months without any article give 0/0 = NaN. min_periods=1 keeps the moving average
# defined around such gaps, as the quote-style cell already does.
quotes_per_article_monthly['12_month_MA'] = (
    quotes_per_article_monthly['Nb cit'].rolling(window=12, min_periods=1).mean()
)

plt.figure(figsize=(16, 6))
plt.plot(quotes_per_article_monthly['Mois'], quotes_per_article_monthly['Nb cit'], marker='', linestyle='-', color='steelblue', label='Moyenne mensuelle')
plt.plot(quotes_per_article_monthly['Mois'], quotes_per_article_monthly['12_month_MA'], marker='', linestyle='-', color='deepskyblue', label='Moyenne mobile annuelle')
plt.xlabel('')
plt.ylabel('')
plt.title("Nombre de citations par article citant une personne d'ED")
plt.legend()
plt.tight_layout()
plt.show()

# Longueur des citations

In [None]:
# Characters removed from both ends of a quote before measuring it: guillemets,
# regular space, no-break space (U+00A0) and narrow no-break space (U+202F).
_QUOTE_EDGE_CHARS = '«»\u00a0\u202f '

# Length in characters of each quote; the text is read from x['Quote'][0].
ed_quotes.loc[:, 'quote_length'] = ed_quotes['quotes'].apply(
    lambda x: len(x['Quote'][0].strip(_QUOTE_EDGE_CHARS))
)

# Fix: monthly *mean* length. The previous `.nunique()` counted how many distinct
# lengths occurred in the month, which is not what the title and labels announce.
quote_length_monthly = ed_quotes.groupby(pd.Grouper(key='date', freq='ME'))['quote_length'].mean().reset_index()
quote_length_monthly.columns = ['Mois', 'Longueur des citations']
# Months without any quote have a NaN mean. min_periods=1 keeps the moving average
# defined around such gaps, as the quote-style cell already does.
quote_length_monthly['12_month_MA'] = (
    quote_length_monthly['Longueur des citations'].rolling(window=12, min_periods=1).mean()
)

plt.figure(figsize=(16, 6))
plt.plot(quote_length_monthly['Mois'], quote_length_monthly['Longueur des citations'], marker='', linestyle='-', color='steelblue', label='Moyenne mensuelle')
plt.plot(quote_length_monthly['Mois'], quote_length_monthly['12_month_MA'], marker='', linestyle='-', color='deepskyblue', label='Moyenne mobile annuelle')
plt.xlabel('Mois')
plt.ylabel('Longueur des citations')
plt.title("Longueur moyenne des citations des personnalités d'ED, par mois")
plt.legend()
plt.tight_layout()
plt.show()

# Style des citations

In [None]:
# Style of each quote, read from x['Quote'][3] and stored as a string.
ed_quotes['quote_style'] = ed_quotes['quotes'].map(lambda record: str(record['Quote'][3]))

# Number of quotes per (month-end bin, style).
_style_counts = ed_quotes.groupby([pd.Grouper(key='date', freq='ME'), 'quote_style']).size()
ed_style_monthly = _style_counts.reset_index(name='count')

# Wide table (one row per month, one column per style), then each row is divided
# by its total so that every cell becomes a share of the month's quotes.
_counts_wide = _style_counts.unstack('quote_style', fill_value=0)
ed_style_monthly_pivot = _counts_wide.div(_counts_wide.sum(axis=1), axis=0)
# Moving average over 12 rows; partial windows are allowed at the start.
ed_style_monthly_pivot_ma = ed_style_monthly_pivot.rolling(window=12, min_periods=1).mean()


plt.figure(figsize=(16, 6))

# Two curves per style: the raw monthly share and its moving average.
for style, monthly_share in ed_style_monthly_pivot.items():
    smoothed_share = ed_style_monthly_pivot_ma[style]
    plt.plot(monthly_share.index, monthly_share, label=f'{style} (raw)', alpha=0.6)
    plt.plot(smoothed_share.index, smoothed_share, label=f'{style} (12-month MA)', linestyle='--')

plt.legend(title="Style de citation")
plt.title("Part des types de citation de personnalités de l'ED par mois avec moyenne mobile sur 12 mois")
plt.tight_layout()
plt.show()

# Journalistes et personnalités

In [None]:
# Number of distinct quoted people ('name_surname') per month-end bin.
ed_pers_monthly = ed_quotes.groupby(pd.Grouper(key='date', freq='ME'))['name_surname'].nunique().reset_index()
ed_pers_monthly.columns = ['Mois', 'Personnalités']

# Number of distinct authors per month-end bin. explode() gives one row per
# element of 'list_authors' so that each author is counted individually.
ed_journ_monthly = (
    ed_quotes.explode('list_authors')
    .groupby(pd.Grouper(key='date', freq='ME'))['list_authors']
    .nunique()
    .reset_index()
)
ed_journ_monthly.columns = ['Mois', 'Journalistes']
# NOTE: this rebinds `plot_data`, which an earlier cell also assigns.
plot_data = pd.merge(ed_pers_monthly, ed_journ_monthly, on='Mois', how='left').fillna(0)

plt.figure(figsize=(16, 6))
plt.plot(plot_data['Mois'], plot_data['Personnalités'], color="steelblue", label="Nombre de personnalités de l'ED citées par mois")
plt.plot(plot_data['Mois'], plot_data['Journalistes'], color="deepskyblue", label="Nombre de journalistes citant des personnalités d'ED par mois")
plt.legend()
# Fix: spelling of "apparaissant" in the displayed title.
plt.title("Nombre de journalistes et de personnalités de l'ED apparaissant par mois")
plt.tight_layout()
plt.show()

# Cues
Beaucoup de valeurs manquantes ; à première vue, pas d'évolution notable au cours du temps (à confirmer).

In [None]:
# TODO: compare cues across political alignments, without the time dimension.

In [None]:
# Keep rows with a usable cue lemma: neither missing nor the literal string 'nan'.
_has_cue = ed_quotes['cue_lemma'].notna() & ed_quotes['cue_lemma'].ne('nan')
ed_clean = ed_quotes.loc[_has_cue].copy()
ed_clean['year'] = ed_clean['date'].dt.year

# Occurrences of each lemma within each year.
lemma_counts = ed_clean.groupby(['year', 'cue_lemma']).size().reset_index(name='count')

# Order by year, then by decreasing count, and keep the first five rows of each year.
_ranked_lemmas = lemma_counts.sort_values(by=['year', 'count'], ascending=[True, False])
top_lemmas_per_year = _ranked_lemmas.groupby('year').head(5).reset_index(drop=True)

# Displayed as the cell output, with the row index hidden.
top_lemmas_per_year.style.hide(axis="index")