In [1]:
import itertools
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import seaborn as sns

from functools import reduce

## 1. Outcomes

In [2]:
mentions_quotes = pd.read_parquet("articles_quotes_mentions_med_1981-2025.parquet")

In [3]:
# Translate political nuances
def map_political_alignment(df, col='political_alignment'):
    mapping = {
        'centredroite_droite': 'Right',
        'centregauche_gauche': 'Left',
        'extremegauche_gaucheradicale': 'Far left',
        'centre': 'Center',
        'extremedroite_droiteradicale': 'Far right',
        'autre': 'Other'
    }
    df[col] = df[col].replace(mapping)
    return df

# Select alignments
valid_alignments = ['Far right', 'Right', 'Center', 'Left', 'Far left', 'Other']

# Rename journals
mentions_quotes['journal'] = mentions_quotes['journal'].replace('Mediapart', 'Médiapart')

# Convert date to datetime if it isn't already
if not pd.api.types.is_datetime64_any_dtype(mentions_quotes['date']):
    mentions_quotes['date'] = pd.to_datetime(mentions_quotes['date'])

# Extract month from date
mentions_quotes['month'] = mentions_quotes['date'].dt.to_period('M')

Les données ne portent pas sur les mentions mais sur les personnes mentionnées ou citées.
1. Comment les personnes mentionnées ou citées se répartissent-elles entre les nuances politiques ?
2. Quelle est la probabilité, pour les personnes d'une nuance politique, d'être citées dans un article = proportion des articles citant des personnes de chaque nuance (plus de compositionnalité car toutes les nuances peuvent être citées / mentionnées dans chaque article)
3. Proportion des articles citant au moins 1 fois chaque nuance politique

Pour 1., il faut construire 3 bases :
- Des personnes mentionnées chaque mois
- Des personnes citées chaque mois
- Des personnes mentionnées ou citées chaque mois => avec deux indicatrices, pour les mentions et les citations

## Citations

In [4]:
# Build a dataset where rows correspond to quoted persons
## Create a df with one row for each item in the 'quoted_people' dictionnary
single_quotes = mentions_quotes.explode('quoted_people')

## Create another df with one column for each key in the 'quoted_people' dictionnary
quotes_data = pd.json_normalize(single_quotes['quoted_people'])
quotes_data = quotes_data.reset_index(drop=True)
quotes_data = quotes_data[['Alignement', 'FirstLastName']]
quotes_data.rename(columns={'Alignement': 'political_alignment'}, inplace=True)
quotes_data = map_political_alignment(quotes_data)

## Merge the datasets
single_quotes = single_quotes[['journal', 'month']].reset_index(drop=True)
quotes = pd.concat([single_quotes, quotes_data], axis=1)

## Select relevant columns and rows
quotes = quotes[quotes['political_alignment'].isin(valid_alignments)]
quotes = quotes.dropna(subset=['FirstLastName'])  # Remove article without quotes

## Count the number of quotations for each person and remove duplicated rows
quotes_counts = (
    quotes.groupby(['month', 'journal', 'FirstLastName'])
    .size()
    .reset_index(name='quotes'))
quotes = quotes.drop_duplicates()
quotes = quotes.merge(quotes_counts, on=['journal', 'month', 'FirstLastName'])

## Mentions

In [5]:
# Build a dataset where rows correspond to mentionned persons
## Create a df with one row for each item in the 'mentionned_people' dictionnary
single_mentions = mentions_quotes.explode('mentioned_people')

## Create another df with one column for each key in the 'quoted_people' dictionnary
mentions_data = pd.json_normalize(single_mentions['mentioned_people'])
mentions_data = mentions_data.reset_index(drop=True)
mentions_data = mentions_data[['Alignement', 'FirstLastName']]
mentions_data.rename(columns={'Alignement': 'political_alignment'}, inplace=True)
mentions_data = map_political_alignment(mentions_data)

## Merge the datasets
single_mentions = single_mentions[['journal', 'month']].reset_index(drop=True)
mentions = pd.concat([single_mentions, mentions_data], axis=1)

## Select relevant columns and rows
mentions = mentions[mentions['political_alignment'].isin(valid_alignments)]
mentions = mentions.dropna(subset=['FirstLastName'])  # Remove article without mentions

## Count the number of quotations for each person and remove duplicated rows
mentions_counts = (
    mentions.groupby(['month', 'journal', 'FirstLastName'])
    .size()
    .reset_index(name='mentions'))
mentions = mentions.drop_duplicates()
mentions = mentions.merge(mentions_counts, on=['journal', 'month', 'FirstLastName'])

## Quotes and mentions

In [6]:
evocations = pd.merge(quotes, mentions, on=['month', 'journal', 'political_alignment', 'FirstLastName'])

In [7]:
stats = evocations.groupby(['political_alignment'])[['quotes', 'mentions']].mean()
stats['ratio'] = stats['mentions'] / stats['quotes']
stats.style.format({col: "{:.3f}" for col in stats.columns})

Unnamed: 0_level_0,quotes,mentions,ratio
political_alignment,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Center,6.114,13.612,2.227
Far left,3.248,6.762,2.082
Far right,4.22,11.948,2.831
Left,3.356,8.227,2.451
Other,3.632,10.156,2.796
Right,3.57,9.269,2.596
