In [None]:
import numpy as np
import os
import pandas as pd
import re
import requests
import shutil
import zipfile
import glob

from tqdm import tqdm

# Présidentielles et législatives
D'après les données collationnées par Piketty et Cagé pour leur ouvrage *[Une histoire du conflit politique](https://unehistoireduconflitpolitique.fr/)*.

In [None]:
def extract_strings_from_webpage(url, timeout=30):
    """Return every double-quoted string found in the body of the page at ``url``.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The contents of all ``"..."`` spans of the response text, in order of
        appearance. An empty list is returned when the request fails or the
        server answers with a status code other than 200.
    """
    try:
        # Without a timeout, requests waits indefinitely on a silent server.
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as error:
        print(f'Failed to fetch the webpage. Error: {error}')
        return []

    if response.status_code != 200:
        print(f'Failed to fetch the webpage. Status code: {response.status_code}')
        return []

    return re.findall(r'"([^"]*)"', response.text)

# Page whose source is scanned for quoted strings, some of which are archive links.
webpage_url = 'https://unehistoireduconflitpolitique.fr/telecharger.html'
extracted_strings = extract_strings_from_webpage(webpage_url)

# Keep only the strings that point at zipped Stata (.dta) archives.
download_links = [item for item in extracted_strings if item.endswith('dta.zip')]
print(f'Identified {len(download_links)} files to download.')

In [None]:
# Keep the presidential ("pres") and legislative ("leg") archives whose name
# carries a four-digit year of 1988 or later right after the election tag.
filtered_links = []
for link in download_links:
    lowered = link.lower()
    if 'pres' not in lowered and 'leg' not in lowered:
        continue
    tagged_years = re.findall(r'(?:pres|leg)(\d{4})', lowered)
    if any(int(year) >= 1988 for year in tagged_years):
        filtered_links.append(link)

print(f'Extracted {len(filtered_links)} relevant files.')

In [None]:
# Directory that receives the downloaded archives.
os.makedirs('data/zip', exist_ok=True)

# The context manager closes the bar even if the loop is interrupted.
with tqdm(total=len(filtered_links), desc='Downloading', unit='file') as progress_bar:
    for link in filtered_links:
        file_name = os.path.join('data/zip', os.path.basename(link))
        try:
            response = requests.get(link, timeout=60)
            # Raise on HTTP errors so an error page is never saved as a .zip file.
            response.raise_for_status()
            with open(file_name, 'wb') as file:
                file.write(response.content)
        except Exception as e:
            # Best effort: report the failure and carry on with the next archive.
            print(f'Error downloading {link}: {e}')
        finally:
            # Count every processed link so the bar reaches 100% even with failures.
            progress_bar.update(1)

In [None]:
# Directory that receives the extracted Stata files.
os.makedirs('data/dta', exist_ok=True)

# Sorted so that the processing order does not depend on the file system.
zip_files = sorted(file for file in os.listdir('data/zip') if file.endswith('.zip'))

# The context manager closes the bar even if the loop is interrupted.
with tqdm(total=len(zip_files), desc="Extracting", unit="file") as progress_bar:
    for file in zip_files:
        zip_file_path = os.path.join('data/zip', file)
        try:
            with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
                for member in zip_ref.infolist():
                    basename = os.path.basename(member.filename)

                    # Skip macOS metadata files (those starting with ._)
                    if basename.startswith('._'):
                        continue
                    if not member.filename.lower().endswith('.dta'):
                        continue

                    # Only the base name is kept: any folder structure inside
                    # the archive is flattened into data/dta.
                    target_path = os.path.join('data/dta', basename)
                    with zip_ref.open(member) as source, open(target_path, 'wb') as dest:
                        shutil.copyfileobj(source, dest)
        except Exception as e:
            # Best effort: report the failure and carry on with the next archive.
            print(f'Error extracting {file}: {e}')
        finally:
            # Count every processed archive so the bar reaches 100% even with failures.
            progress_bar.update(1)

In [None]:
# Columns holding vote counts start with this prefix; the rest is the candidate name.
VOTE_PREFIX = 'voix'


def summarise_votes(frame):
    """Total the vote columns of one election file.

    Args:
        frame: DataFrame with an ``inscrits`` column and any number of columns
            whose name starts with ``voix``.

    Returns:
        A ``(counts, shares)`` pair of Series indexed by candidate name (the
        column name without its ``voix`` prefix). ``counts`` holds the column
        sums (NaN when a column has no valid value); ``shares`` holds the
        counts divided by the sum of ``inscrits``, or 0 when that sum is 0.
    """
    vote_columns = [col for col in frame.columns if col.startswith(VOTE_PREFIX)]
    counts = frame[vote_columns].sum(min_count=1)
    # Strip the leading prefix only, so a name that happens to contain "voix"
    # further along is left intact.
    counts.index = [col[len(VOTE_PREFIX):] for col in vote_columns]

    registered = frame['inscrits'].sum(min_count=1)
    if registered != 0:
        shares = counts / registered
    else:
        shares = pd.Series(0, index=counts.index)
    return counts, shares


def assemble_table(series_by_year, split_rounds=False):
    """Build a wide table with one row per candidate and one column per year.

    Args:
        series_by_year: Mapping of year to a Series indexed by candidate name.
        split_rounds: When true, add a ``tour`` column (2 if the name contains
            ``T2``, otherwise 1) and remove ``T2`` from the candidate names.

    Returns:
        DataFrame with a ``candidat`` column, one column per year (named with
        the year as a string, in ascending order) and, if requested, ``tour``.
        Rows are ordered by the original candidate name; a candidate absent
        from a given year gets NaN for that year.
    """
    table = (
        pd.DataFrame({str(year): series for year, series in sorted(series_by_year.items())})
        .sort_index()
        .rename_axis('candidat')
        .reset_index()
    )
    if split_rounds:
        second_round = table['candidat'].str.contains('T2')
        table = table.assign(
            candidat=table['candidat'].str.replace('T2', '', regex=False),
            tour=np.where(second_round, 2, 1),
        )
    return table


os.makedirs('data/elec', exist_ok=True)

for prefix in ["leg", "pres"]:
    counts_by_year = {}
    shares_by_year = {}

    # Each file is read once; only its per-candidate totals are kept in memory.
    for path in sorted(glob.glob(f"data/dta/{prefix}????comm.dta")):
        match = re.search(rf'{prefix}(\d{{4}})comm\.dta', os.path.basename(path))
        if match is None:
            # The glob wildcard accepts any four characters, not only digits.
            print(f'Skipping {path}: no four-digit year in the file name.')
            continue
        year = int(match.group(1))
        counts_by_year[year], shares_by_year[year] = summarise_votes(pd.read_stata(path))

    if not counts_by_year:
        print(f'No {prefix} files found in data/dta; nothing to aggregate.')
        continue

    # Only presidential files get the round ("tour") split.
    df_counts = assemble_table(counts_by_year, split_rounds=(prefix == "pres"))
    df_shares = assemble_table(shares_by_year, split_rounds=(prefix == "pres"))

    df_counts.to_parquet(f"data/elec/{prefix}_counts.parquet", index=False)
    df_shares.to_parquet(f"data/elec/{prefix}_shares.parquet", index=False)

# Européennes
Résultats téléchargés depuis [data.gouv.fr](https://www.data.gouv.fr/).

In [None]:
# Source spreadsheets for the European election results, keyed by election year.
european_results_urls = {
    2019: 'https://static.data.gouv.fr/resources/resultats-des-elections-europeennes-2019/20190531-144431/resultats-definitifs-par-region.xls',
    2014: 'https://www.data.gouv.fr/storage/f/2014-05-30T10-34-25/euro-2014-resultats-c.xlsx',
    2009: 'https://static.data.gouv.fr/e0/88c770f067e9bc4cfd3dea656aa69b15add243c1c4c367a85d7314e74fde63.xls',
    2004: 'https://static.data.gouv.fr/82/b8b9cde91b95802dc0092d4a76c10dbf5b0b0fae8e4ccafc6ffb33397e8053.xls',
    1999: 'https://static.data.gouv.fr/fa/d907ec8071c5f153de8235efa65df560bb269425132521c079657794a0c62a.xls',
}