In [1]:
import h5py
import pandas as pd

from functools import reduce

In [2]:
# Quote-level table; the French alignment column is given its English name on load.
quotes = (
    pd.read_hdf("data/quotes/quotes_pol_allpol1.h5")
    .rename(columns={'alignement_politique': 'political_alignment'})
)

# Legislative and presidential share tables, plus the seats table, that are
# reshaped and merged onto the monthly outcomes in sections 2 and 3.
elec_leg = pd.read_parquet("data/elec/leg_shares.parquet")
elec_pres = pd.read_parquet("data/elec/pres_shares.parquet")
na_seats = pd.read_csv("data/sieges_AN.csv")

## 1. Outcomes

In [3]:
# Keep only the quotes whose alignment also appears in the elec_leg dataframe.
valid_alignments = elec_leg['political_alignment'].unique()
has_valid_alignment = quotes['political_alignment'].isin(valid_alignments)
filtered_quotes = quotes.loc[has_valid_alignment].copy()

# Parse 'date' only when the column is not already datetime-typed.
date_is_datetime = pd.api.types.is_datetime64_any_dtype(filtered_quotes['date'])
if not date_is_datetime:
    filtered_quotes['date'] = pd.to_datetime(filtered_quotes['date'])

# Monthly period: the time key shared by every aggregation below.
filtered_quotes['month'] = filtered_quotes['date'].dt.to_period('M')

In [4]:
# Number and share of quotes
# Quotes per (month, alignment); reset_index() labels the unnamed size column 0.
quotes_counts = (
    filtered_quotes
    .groupby(['month', 'political_alignment'])
    .size()
    .reset_index()
)
# Quotes per month, all alignments together.
quotes_total_counts = (
    filtered_quotes
    .groupby(['month'])
    .size()
    .reset_index(name='quotes_nb')
)
# Share = alignment count / monthly total; the raw count column (0) is then discarded.
quotes_data = (
    pd.merge(quotes_counts, quotes_total_counts, on=['month'])
    .assign(quotes_share=lambda frame: frame[0] / frame['quotes_nb'])
    .drop(columns=0)
)

In [5]:
# Number and share of articles with quotes
# Distinct text_id values per (month, alignment).
art_counts = (
    filtered_quotes
    .groupby(['month', 'political_alignment'])['text_id']
    .nunique()
    .reset_index()
)
# Distinct text_id values per month, all alignments together.
art_total_counts = (
    filtered_quotes
    .groupby(['month'])['text_id']
    .nunique()
    .reset_index(name='art_nb')
)
# Share = alignment article count / monthly article count; the raw count is then discarded.
art_data = (
    pd.merge(art_counts, art_total_counts, on=['month'])
    .assign(art_share=lambda frame: frame['text_id'] / frame['art_nb'])
    .drop(columns='text_id')
)

In [6]:
# Quote length
# Character count of each quote once leading/trailing guillemets and spaces are
# stripped. The vectorised .str accessor yields NaN for a missing or non-string
# quote_text (ignored by the mean below) instead of raising AttributeError as a
# per-row call to x.strip() would.
filtered_quotes['quote_length'] = (
    filtered_quotes['quote_text'].str.strip('« » ').str.len()
)

# Average stripped length per (month, alignment).
length_data = (
    filtered_quotes
    .groupby(['month', 'political_alignment'])['quote_length']
    .mean()
    .reset_index(name='mean_quote_length')
)

In [7]:
# Quote style
# Quotes per (month, alignment, quote_type); reset_index() labels the size column 0.
type_counts = (
    filtered_quotes
    .groupby(['month', 'political_alignment', 'quote_type'])
    .size()
    .reset_index()
)
# Quotes per (month, alignment), all quote types together.
type_total_counts = (
    filtered_quotes
    .groupby(['month', 'political_alignment'])
    .size()
    .reset_index(name='quotes_type_nb')
)
type_data = pd.merge(type_counts, type_total_counts, on=['month', 'political_alignment'])
type_data['type_quotes_share'] = type_data[0] / type_data['quotes_type_nb']


def _share_for_quote_type(quote_type, share_column):
    """Return the rows of ``type_data`` for one quote type.

    The result keeps 'month', 'political_alignment' and the share, with the
    share column renamed to ``share_column``.
    """
    selected = type_data.loc[type_data['quote_type'] == quote_type]
    return (
        selected
        .drop(columns=['quote_type', 'quotes_type_nb', 0])
        .rename(columns={'type_quotes_share': share_column})
    )


direct_data = _share_for_quote_type('Direct', 'type_direct_share')
undirect_data = _share_for_quote_type('Indirect', 'type_undirect_share')
mixed_data = _share_for_quote_type('Mixed', 'type_mixed_share')

In [8]:
# Personalities
# Number of distinct standardized person names quoted per (month, alignment).
person_data = (
    filtered_quotes
    .groupby(['month', 'political_alignment'])['person_name_standardized']
    .nunique()
    .reset_index(name='personalities_nb')
)

In [9]:
# Final dataframe
# quotes_data holds every (month, alignment) pair present in filtered_quotes, so
# it anchors a chain of LEFT joins. The default inner join would silently drop any
# pair that has no Direct, no Indirect or no Mixed quote in a given month.
dfs = [quotes_data, art_data, length_data, direct_data, undirect_data, mixed_data, person_data]
model_data = reduce(
    lambda left, right: pd.merge(left, right, on=['month', 'political_alignment'], how='left'),
    dfs,
)

# A quote type that never occurs for a pair has a share of 0, not a missing value.
type_share_columns = ['type_direct_share', 'type_undirect_share', 'type_mixed_share']
model_data[type_share_columns] = model_data[type_share_columns].fillna(0)

## 2. Identification des cycles électoraux et fusion avec les résultats pour les législatives

In [10]:
# Legislative election dates as DD/MM/YYYY strings, oldest first.
# They are later converted to monthly periods, so only month and year matter.
legi_dates = [
    "21/06/1981", "16/03/1986", "11/06/1988", "28/03/1993",
    "01/06/1997", "16/06/2002", "17/06/2007", "17/06/2012",
    "18/06/2017", "19/06/2022", "06/07/2024",
]

In [11]:
# Remove entries older than the earliest election in the dataset
# The date strings become a sorted index of monthly periods.
legi_dates = (
    pd.to_datetime(legi_dates, format="%d/%m/%Y")
    .to_period('M')
    .sort_values()
)
on_or_after_first_election = model_data['month'] >= legi_dates.min()
# reset_index() without drop=True keeps the former row labels in an 'index' column.
model_data = model_data.loc[on_or_after_first_election].copy().reset_index()

In [12]:
# Find the year of the latest election
def find_last_legi_year(current_month, elections=None):
    """Return the year of the most recent election on or before ``current_month``.

    Parameters
    ----------
    current_month : pd.Period
        Month to look up.
    elections : pd.PeriodIndex, optional
        Election months to search. When omitted, the module-level ``legi_dates``
        is used, so existing single-argument calls (e.g. through
        ``Series.apply``) behave as before.

    Returns
    -------
    int or pd.NA
        Year of the latest election month that is <= ``current_month``, or
        ``pd.NA`` when no election precedes it.
    """
    if elections is None:
        elections = legi_dates

    past_elections = elections[elections <= current_month]
    if past_elections.empty:
        return pd.NA
    return past_elections.max().year

# Year of the latest legislative election for each row, parsed with "%Y" into a
# timestamp so it can serve as a join key with the results tables.
model_data['last_election'] = pd.to_datetime(
    model_data['month'].apply(find_last_legi_year),
    format="%Y",
)

In [13]:
# Drop the candidate label BEFORE aggregating, so the group sum never has to add
# up a text column only for it to be discarded afterwards.
# Assumes the remaining columns are one numeric column per election year — TODO confirm.
elec_leg = (
    elec_leg
    .drop(columns='candidat')
    .groupby('political_alignment', as_index=False)
    .sum()
    # Wide (one column per year) -> long (one row per alignment and year).
    .melt(id_vars=['political_alignment'], var_name='last_election', value_name='leg_votes_share')
)
# Year labels become timestamps so they can be matched with model_data['last_election'].
elec_leg['last_election'] = pd.to_datetime(elec_leg['last_election'], format="%Y")

In [14]:
# Drop the party label BEFORE aggregating, so the group sum never has to add up a
# text column only for it to be discarded afterwards.
# Assumes the remaining columns are one numeric column per election year — TODO confirm.
na_seats = (
    na_seats
    .drop(columns='parti')
    .groupby('nuance', as_index=False)
    .sum()
)

# Turn counts into shares: each year column is divided by its own total.
na_shares = na_seats.copy()
year_columns = na_shares.columns.drop('nuance')
na_shares[year_columns] = na_shares[year_columns].div(na_shares[year_columns].sum(axis=0), axis=1)

# Wide (one column per year) -> long, with the same key names as model_data.
na_shares = (
    na_shares
    .melt(id_vars=['nuance'], var_name='last_election', value_name='na_share')
    .rename(columns={'nuance': 'political_alignment'})
)
na_shares['last_election'] = pd.to_datetime(na_shares['last_election'], format="%Y")

In [15]:
# Attach the vote share and the seat share of the latest legislative election,
# then discard the join key and the 'index' column.
model_data = (
    model_data
    .merge(elec_leg, on=['political_alignment', 'last_election'], how='left')
    .merge(na_shares, on=['political_alignment', 'last_election'], how='left')
    .drop(columns=['last_election', 'index'])
)

## 3. Identification des cycles électoraux et fusion avec les résultats pour les présidentielles

In [16]:
# Presidential election dates as DD/MM/YYYY strings, oldest first.
# They are later converted to monthly periods, so only month and year matter.
presi_dates = [
    "26/04/1981", "24/04/1988", "23/04/1995", "21/04/2002",
    "22/04/2007", "22/04/2012", "21/04/2017", "10/04/2022",
]

In [17]:
# Day-first date strings -> sorted index of monthly periods.
presi_dates = pd.to_datetime(presi_dates, dayfirst=True).to_period('M').sort_values()

# Each election month together with the three months preceding it.
target_months = {
    election_month - months_back
    for election_month in presi_dates
    for months_back in range(4)
}

# 1 when the row's month falls in one of those windows, 0 otherwise.
model_data['pres_dummy'] = model_data['month'].isin(target_months).astype(int)

In [18]:
def find_last_presi_year(current_month, elections=None):
    """Return the year of the most recent election on or before ``current_month``.

    Parameters
    ----------
    current_month : pd.Period
        Month to look up.
    elections : pd.PeriodIndex, optional
        Election months to search. When omitted, the module-level
        ``presi_dates`` is used, so existing single-argument calls (e.g.
        through ``Series.apply``) behave as before.

    Returns
    -------
    int or pd.NA
        Year of the latest election month that is <= ``current_month``, or
        ``pd.NA`` when no election precedes it.
    """
    if elections is None:
        elections = presi_dates

    past_elections = elections[elections <= current_month]
    if past_elections.empty:
        return pd.NA
    return past_elections.max().year

# Year of the latest presidential election for each row, parsed with "%Y" into a
# timestamp so it can serve as a join key with the results table.
model_data['last_election'] = pd.to_datetime(
    model_data['month'].apply(find_last_presi_year),
    format="%Y",
)

In [19]:
# Built as one non-inplace chain: calling drop(..., inplace=True) on the result of
# a boolean filter mutates a slice of the original frame and can raise
# SettingWithCopyWarning.
elec_pres = (
    elec_pres
    .loc[elec_pres['tour'] == 1]  # rows with tour == 1 only (presumably the first round — confirm)
    .drop(columns=['candidat', 'tour'])
    .groupby('political_alignment', as_index=False)
    .sum()
    # Wide (one column per year) -> long (one row per alignment and year).
    .melt(id_vars=['political_alignment'], var_name='last_election', value_name='pres_votes_share')
)
# Year labels become timestamps so they can be matched with model_data['last_election'].
elec_pres['last_election'] = pd.to_datetime(elec_pres['last_election'], format="%Y")

In [20]:
# Attach the vote share of the latest presidential election to each row.
model_data = model_data.merge(
    elec_pres,
    how='left',
    on=['political_alignment', 'last_election'],
)
# NOTE(review): the drop below is disabled, so 'last_election' stays in model_data — confirm this is intended.
# model_data.drop(columns='last_election', inplace=True)

## 4. Indicatrice de représentation au gouvernement
Comment traiter les cohabitations ? En l'état le bonus est seulement accordé à la nuance au gouvernement.

- **centre**
  - 19/06/2017 - 05/09/2024
- **centredroite_droite**
  - 20/03/1986 - 10/05/1988
  - 29/03/1993 - 02/06/1997
  - 06/05/2002 - 10/05/2012
  - 05/09/2024 - 31/12/2024
- **centregauche_gauche**
  - 22/06/1981 - 20/03/1986
  - 10/05/1988 - 29/03/1993
  - 02/06/1997 - 06/05/2002
  - 15/05/2012 - 19/06/2017

In [21]:
# Government spells per political alignment, as (start, end) ISO date strings.
# An empty list means no spell is recorded for that alignment.
gov_periods = {
    "centre": [("2017-06-19", "2024-09-05")],
    "centredroite_droite": [
        ("1986-03-20", "1988-05-10"),
        ("1993-03-29", "1997-06-02"),
        ("2002-05-06", "2012-05-10"),
        ("2024-09-05", "2024-12-31"),
    ],
    "centregauche_gauche": [
        ("1981-06-22", "1986-03-20"),
        ("1988-05-10", "1993-03-29"),
        ("1997-06-02", "2002-05-06"),
        ("2012-05-15", "2017-06-19"),
    ],
    "extremedroite_droiteradicale": [],
    "extremegauche_gaucheradicale": [],
    "autre": [],
}

In [22]:
# Month-start bounds of every government spell, across all alignments.
spell_bounds = [
    (pd.to_datetime(start).replace(day=1), pd.to_datetime(end).replace(day=1))
    for periods in gov_periods.values()
    for start, end in periods
]
min_date = min(first_month for first_month, _ in spell_bounds)
max_date = max(last_month for _, last_month in spell_bounds)

# Every month start from the earliest spell start to the latest spell end.
all_months = pd.date_range(start=min_date, end=max_date, freq="MS")


def _government_flags(alignment, periods):
    """Build the monthly 0/1 government indicator for one alignment.

    Every month of ``all_months`` gets a row; the flag is 1 for each month from
    a spell's start month to its end month (both included), 0 otherwise.
    """
    flags = pd.Series(0, index=all_months)
    for start, end in periods:
        spell_months = pd.date_range(
            start=pd.to_datetime(start).replace(day=1),
            end=pd.to_datetime(end).replace(day=1),
            freq="MS",
        )
        flags.loc[spell_months] = 1

    return pd.DataFrame({
        "month": flags.index.to_period('M'),
        "political_alignment": alignment,
        "government": flags.values,
    })


# One frame per alignment, stacked, then ordered by month and alignment.
gov_dummies = (
    pd.concat([
        _government_flags(alignment, periods)
        for alignment, periods in gov_periods.items()
    ])
    .sort_values(by=["month", "political_alignment"])
    .reset_index(drop=True)
)

In [23]:
# Attach the government indicator; a left join keeps every model_data row, so
# rows without a matching (alignment, month) in gov_dummies get a missing value.
model_data = pd.merge(model_data, gov_dummies, on=['political_alignment', 'month'], how='left')

# English display labels; values absent from the mapping are left unchanged.
model_data['political_alignment'] = model_data['political_alignment'].replace({
    'centredroite_droite': 'Right',
    'centregauche_gauche': 'Left',
    'extremegauche_gaucheradicale': 'Far left',
    'centre': 'Center',
    'extremedroite_droiteradicale': 'Far right'})

# Plain string literal: the path has no placeholders, so no f-string is needed.
model_data.to_parquet("data/model_data_no_journal.parquet", index=False)