In [None]:
import h5py
import pandas as pd
import statsmodels.api as sm

In [None]:
# Load the quotes table; later cells read its `alignement_politique`, `date`
# and `text_id` columns.
quotes = pd.read_hdf("data/quotes/quotes_pol_allpol1.h5")

In [None]:
# Load the electoral table; later cells use its `alignement_politique` and
# `candidat` columns and melt the remaining columns, whose labels are parsed as years.
elec = pd.read_parquet("data/elec/leg_shares.parquet")

## Step 1: compute monthly share of articles with quotes per political nuance

In [None]:
# Keep only the quotes whose alignment also appears in the electoral data
valid_alignments = elec['alignement_politique'].unique()
in_scope = quotes['alignement_politique'].isin(valid_alignments)
filtered_quotes = quotes.loc[in_scope].copy()

# Parse the dates only when the column is not already datetime-typed
if not pd.api.types.is_datetime64_any_dtype(filtered_quotes['date']):
    filtered_quotes['date'] = pd.to_datetime(filtered_quotes['date'])

# Monthly period each quote belongs to
filtered_quotes['month'] = filtered_quotes['date'].dt.to_period('M')

# Numerator: distinct text_id values per (month, alignment)
counts = (
    filtered_quotes
    .groupby(['month', 'alignement_politique'])['text_id']
    .nunique()
    .reset_index()
)

# Denominator: distinct text_id values per month, all retained alignments together
total_counts = (
    filtered_quotes
    .groupby('month')['text_id']
    .nunique()
    .reset_index()
    .rename(columns={'text_id': 'total_text_id'})
)

# Attach the monthly total to every (month, alignment) row and take the ratio
merged = counts.merge(total_counts, on='month')
merged['art_share'] = merged['text_id'] / merged['total_text_id']
model_data = merged[['month', 'alignement_politique', 'art_share']].copy()

## Step 2: identify electoral cycles

In [None]:
# Second-round dates of the legislative elections, as DD/MM/YYYY strings.
# Every entry is a Sunday; the 1988 and 2024 entries previously pointed to the
# Saturday before. Only the month and year of each date are used downstream.
legi_dates = [
    "21/06/1981",
    "16/03/1986",
    "12/06/1988",
    "28/03/1993",
    "01/06/1997",
    "16/06/2002",
    "17/06/2007",
    "17/06/2012",
    "18/06/2017",
    "19/06/2022",
    "07/07/2024"]

In [None]:
# Turn the election dates into sorted monthly periods, then discard every row
# that predates the first election in the list.
election_months = pd.to_datetime(legi_dates, format="%d/%m/%Y").to_period('M')
legi_dates = election_months.sort_values()
first_election = min(legi_dates)
on_or_after_first = model_data['month'] >= first_election
# reset_index() keeps the old index as an `index` column, which is dropped after the electoral merge
model_data = model_data[on_or_after_first].copy().reset_index()

In [None]:
# Find the year of the latest election
def find_last_election_year(current_month, elections=None):
    """Return the year of the most recent election held on or before a month.

    Parameters
    ----------
    current_month : pandas.Period
        Month to look up, comparable with the entries of `elections`.
    elections : pandas.PeriodIndex, optional
        Election months to search. Defaults to the notebook-level
        `legi_dates`, which keeps existing one-argument calls working.

    Returns
    -------
    int or pandas.NA
        Calendar year of the latest election month that is <= `current_month`,
        or `pd.NA` when no election falls on or before it.
    """
    if elections is None:
        elections = legi_dates

    past_elections = elections[elections <= current_month]
    if past_elections.empty:
        return pd.NA
    return past_elections.max().year

# Tag each row with the year of the latest election, then store that year as a
# timestamp so it can be matched against the electoral table's key.
election_years = model_data['month'].apply(find_last_election_year)
model_data['last_election'] = pd.to_datetime(election_years, format="%Y")

## Step 3: merge with electoral data

In [None]:
# Aggregate the electoral table per alignment and reshape it to long format:
# one row per (alignment, election), with the vote share as value.
# `candidat` is dropped *before* summing: it is discarded anyway, and summing a
# text column concatenates names (or fails on missing or mixed values).
elec = (
    elec
    .drop(columns='candidat')
    .groupby('alignement_politique', as_index=False)
    .sum()
    .melt(id_vars=['alignement_politique'], var_name='last_election', value_name='votes_share')
)
# The former column labels are parsed as years to match model_data['last_election']
elec['last_election'] = pd.to_datetime(elec['last_election'], format="%Y")

In [None]:
# Attach the vote share of the latest election to every (month, alignment) row.
# Rows without a match keep NaN in `votes_share` (left join); the merge key and
# the leftover `index` column are then removed.
model_data = (
    model_data
    .merge(elec, how='left', on=['alignement_politique', 'last_election'])
    .drop(columns=['last_election', 'index'])
)

## Step 4: government dummy
**TODO:** how should cohabitation periods be handled? As it stands, the bonus (dummy = 1) is granted only to the political nuance that holds the government.

- **centre**
  - 19/06/2017 - 31/12/2024
- **centredroite_droite**
  - 20/03/1986 - 10/05/1988
  - 29/03/1993 - 02/06/1997
  - 06/05/2002 - 10/05/2012
- **centregauche_gauche**
  - 22/06/1981 - 20/03/1986
  - 10/05/1988 - 29/03/1993
  - 02/06/1997 - 06/05/2002
  - 15/05/2012 - 19/06/2017

In [None]:
# Display the alignment labels taken from the electoral data in Step 1
# (presumably to check them against the keys of gov_periods defined next).
valid_alignments

In [None]:
# Government spells per political alignment, as (start, end) ISO date strings.
# Alignments that never held the government map to an empty list.
gov_periods = dict(
    centre=[
        ("2017-06-19", "2024-12-31"),
    ],
    centredroite_droite=[
        ("1986-03-20", "1988-05-10"),
        ("1993-03-29", "1997-06-02"),
        ("2002-05-06", "2012-05-10"),
    ],
    centregauche_gauche=[
        ("1981-06-22", "1986-03-20"),
        ("1988-05-10", "1993-03-29"),
        ("1997-06-02", "2002-05-06"),
        ("2012-05-15", "2017-06-19"),
    ],
    extremedroite_droiteradicale=[],
    extremegauche_gaucheradicale=[],
    autre=[],
)

In [None]:
# Month-start calendar running from the first month of the earliest spell to
# the first month of the latest spell end, all alignments together.
all_spells = [spell for spells in gov_periods.values() for spell in spells]
all_start_dates = [pd.to_datetime(spell_start) for spell_start, _ in all_spells]
all_end_dates = [pd.to_datetime(spell_end) for _, spell_end in all_spells]
min_date = min(all_start_dates).replace(day=1)
max_date = max(all_end_dates).replace(day=1)
all_months = pd.date_range(start=min_date, end=max_date, freq="MS")

In [None]:
# Build one 0/1 "in government" flag per (month, alignment).
# A spell covers every calendar month it touches, so a month in which two
# spells meet is flagged for both alignments.
month_periods = all_months.to_period('M')
data = []

for alignment, spells in gov_periods.items():
    # Start from "never in government" over the whole calendar
    active_months = pd.Series(0, index=all_months)
    for spell_start, spell_end in spells:
        first_month = pd.to_datetime(spell_start).replace(day=1)
        last_month = pd.to_datetime(spell_end).replace(day=1)
        active_months.loc[pd.date_range(start=first_month, end=last_month, freq="MS")] = 1

    data.append(
        pd.DataFrame({
            "month": month_periods,
            "alignement_politique": alignment,
            "government": active_months.values,
        })
    )

# Stack all alignments and order the rows by month, then alignment
gov_dummies = (
    pd.concat(data)
    .sort_values(by=["month", "alignement_politique"])
    .reset_index(drop=True)
)

In [None]:
# Add the government flag; rows with no matching (alignment, month) in
# gov_dummies keep NaN in `government` because this is a left join.
model_data = model_data.merge(
    gov_dummies,
    how='left',
    on=['alignement_politique', 'month'],
)

## Step 5: regression

In [None]:
# One OLS per alignment: art_share ~ const + votes_share + government.
# Only the R² of each fit is kept.
results = []
regressors = ["votes_share", "government"]

for alignement in model_data["alignement_politique"].unique():
    subset = model_data[model_data["alignement_politique"] == alignement]

    # The left merges upstream can leave NaN in the regressors, and OLS does no
    # NaN handling by default, so the fit uses complete rows only.
    complete = subset.dropna(subset=regressors + ["art_share"])

    if complete.empty:
        # Nothing to fit for this alignment: report a missing R² instead of crashing
        r_squared = float("nan")
    else:
        X = sm.add_constant(complete[regressors])
        y = complete["art_share"]
        model = sm.OLS(y, X).fit()
        r_squared = model.rsquared

    results.append({
        "alignement_politique": alignement,
        "R_squared": r_squared
    })

# Best-explained alignments first; missing R² values sort last
r2_table = pd.DataFrame(results).sort_values("R_squared", ascending=False)

In [None]:
# Render the R² ranking without the DataFrame's row index.
r2_table.style.hide(axis="index")