In [1]:
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf

from sklearn.metrics import r2_score

In [2]:
model_data_no_journal = pd.read_parquet("data/model_data_no_journal.parquet")
model_data_no_journal = model_data_no_journal[model_data_no_journal['political_alignment'] != 'autre']
model_data_no_journal['next_pres_votes_share'] = model_data_no_journal['pres_dummy'] * model_data_no_journal['pres_votes_share']
model_data_no_journal["election_year"] = model_data_no_journal["last_pres"].dt.year
model_data_no_journal["decade"] = (model_data_no_journal["month"].dt.year // 10) * 10
model_data_no_journal['europ_votes_share_dum'] = (model_data_no_journal['europ_votes_share'] * model_data_no_journal['europ_dummy']).fillna(0)

model_data = pd.read_parquet("data/model_data.parquet")
model_data = model_data[model_data['political_alignment'] != 'autre']
model_data['next_pres_votes_share'] = model_data['pres_dummy'] * model_data['pres_votes_share']
model_data["election_year"] = model_data["last_election"].dt.year
model_data["decade"] = (model_data["month"].dt.year // 10) * 10
model_data['europ_votes_share_dum'] = (model_data['europ_votes_share'] * model_data_no_journal['europ_dummy']).fillna(0)

In [3]:
outcome = "quotes_nb"

# Tentative de modélisation directe du nombre de citations

In [4]:
model = smf.ols(f"{outcome} ~ C(election_year) + journal + political_alignment + na_share + leg_votes_share + pres_votes_share + government + europ_votes_share_dum", 
                data=model_data).fit(cov_type='HC3')
y = model_data[outcome]
y_pred = model.predict(model_data)

r2 = r2_score(y, y_pred)
print(f"R2: {100*r2:.2f}%")
print("")

params = model.params.rename("coef").to_frame()
pvalues = model.pvalues.rename("pval").to_frame()
results = pd.merge(params, pvalues, left_index=True, right_index=True)
results = results[results['pval'] <= 0.05]
results.style

R2: 58.32%



Unnamed: 0,coef,pval
Intercept,-477.079332,0.0
C(election_year)[T.1988],368.949823,0.0
C(election_year)[T.1995],811.079678,0.0
C(election_year)[T.2002],914.501359,0.0
C(election_year)[T.2007],1176.823893,0.0
C(election_year)[T.2012],1161.823409,0.0
C(election_year)[T.2017],779.721478,0.0
C(election_year)[T.2022],1062.477937,0.0
journal[T.Le Figaro],1112.743415,0.0
journal[T.Le Monde],1424.119905,0.0


On se heurte à la difficulté prévue : la tendance étant forte et indépendante des régresseurs, les effets fixes capturent l'essentiel de la dynamique. Les coefficients des régresseurs sont à la limite de la significativité, et pas forcément interprétables...

In [5]:
model = smf.ols(f"{outcome} ~ C(election_year) * political_alignment + na_share + journal + leg_votes_share + pres_votes_share + government - 1", 
                data=model_data).fit(cov_type='HC3')
y = model_data[outcome]
y_pred = model.predict(model_data)

r2 = r2_score(y, y_pred)
print(f"R2: {100*r2:.2f}%")
print("")

params = model.params.rename("coef").to_frame()
pvalues = model.pvalues.rename("pval").to_frame()
results = pd.merge(params, pvalues, left_index=True, right_index=True)
results = results[results['pval'] <= 0.05]
results.style

R2: 58.58%



Unnamed: 0,coef,pval
C(election_year)[2002],405.03227,0.007677
C(election_year)[2007],948.3349,0.0
C(election_year)[2012],583.289292,0.0
journal[T.Le Figaro],1113.130305,0.0
journal[T.Le Monde],1422.36205,0.0
journal[T.Libération],919.115584,0.0
journal[T.Médiapart],-235.213406,0.0
leg_votes_share,-1179.657423,0.002709
government,337.241193,0.014448


L'ajout de termes d'interaction, qui permettraient de refléter une éventuelle normalisation de l'extrême droite, ne modifie pas les résultats.