In [None]:
import glob
import os
import re
import shutil
import zipfile

import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import requests
import s3fs
from tqdm import tqdm

In [None]:
def extract_strings_from_webpage(url, timeout=30):
    """Fetch a web page and return every double-quoted substring in its body.

    Parameters
    ----------
    url : str
        Address of the page to fetch.
    timeout : float or tuple, optional
        Forwarded to ``requests.get``. Without a timeout a stalled server
        would block the call indefinitely. Defaults to 30 seconds.

    Returns
    -------
    list of str
        The text found between each pair of double quotes in the response
        body (quotes excluded). An empty list is returned, and a message is
        printed, when the response status code is not 200.

    Notes
    -----
    Exceptions raised by ``requests.get`` (connection errors, timeouts) are
    not caught here and propagate to the caller.
    """
    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        print(f'Failed to fetch the webpage. Status code: {response.status_code}')
        return []
    # Capture whatever sits between two double quotes, e.g. href="..." values.
    return re.findall(r'"([^"]*)"', response.text)

# Scrape the download page and keep the quoted strings that end in 'dta.zip'.
webpage_url = 'https://unehistoireduconflitpolitique.fr/telecharger.html'
extracted_strings = extract_strings_from_webpage(webpage_url)
# NOTE(review): this filter used to test the same 'dta.zip' suffix twice, which
# is equivalent to a single check. If a second suffix was intended, add it here.
download_links = [item for item in extracted_strings if item.endswith('dta.zip')]
print(f'Identified {len(download_links)} files to download.')

In [None]:
# Keep a link when it mentions 'pres' or 'leg' and at least one four-digit
# number directly following 'pres'/'leg' in the link is 1988 or later.
filtered_links = [
    url for url in download_links
    if any(tag in url.lower() for tag in ('pres', 'leg'))
    and any(
        int(year) >= 1988
        for year in re.findall(r'(?:pres|leg)(\d{4})', url.lower())
    )
]
print(f'Extracted {len(filtered_links)} relevant files.')

In [None]:
# Download every selected archive into data/zip, one file per link.
os.makedirs('data/zip', exist_ok=True)

progress_bar = tqdm(total=len(filtered_links), desc='Downloading', unit='file')

for link in filtered_links:
    file_name = os.path.join('data/zip', os.path.basename(link))
    try:
        # A timeout keeps one stalled connection from hanging the whole cell.
        response = requests.get(link, timeout=60)
        # Raise on 4xx/5xx so an HTML error page is never saved as a .zip file.
        response.raise_for_status()
        with open(file_name, 'wb') as file:
            file.write(response.content)
        # The bar only advances for files that were actually written.
        progress_bar.update(1)
    except Exception as e:
        # Best effort: report the failure and move on to the next link.
        print(f'Error downloading {link}: {e}')

progress_bar.close()

In [None]:
# Unpack every .dta member of the downloaded archives into a flat data/dta folder.
os.makedirs('data/dta', exist_ok=True)

zip_files = [name for name in os.listdir('data/zip') if name.endswith('.zip')]

progress_bar = tqdm(total=len(zip_files), desc="Extracting", unit="file")

for file in zip_files:
    try:
        with zipfile.ZipFile(os.path.join('data/zip', file), 'r') as archive:
            for entry in archive.infolist():
                leaf = os.path.basename(entry.filename)
                is_dta = entry.filename.lower().endswith('.dta')
                # Ignore macOS metadata files (names starting with ._) and
                # anything that is not a .dta file.
                if leaf.startswith('._') or not is_dta:
                    continue

                # Only the base name is kept, so any folder structure inside
                # the archive is dropped.
                destination = os.path.join('data/dta', leaf)
                with archive.open(entry) as src, open(destination, 'wb') as dst:
                    shutil.copyfileobj(src, dst)

        progress_bar.update(1)
    except Exception as e:
        print(f'Error extracting {file}: {e}')

progress_bar.close()

In [None]:
# Build one summary table per file prefix ("leg", "pres").
# Rows: every `voix*` column (summed over all rows of the file) plus a
# `p_`-prefixed row holding that sum divided by the summed `inscrits`.
# Columns: `variable` followed by one column per year taken from the file name.
# Each table is written to data/<prefix>_all.parquet.
for prefix in ["leg", "pres"]:
    # Sorted so the processing order does not depend on the file system.
    files = sorted(glob.glob(f"data/dta/{prefix}????comm.dta"))
    year_pattern = re.compile(rf'{prefix}(\d{{4}})comm\.dta')

    # Read each file once and keep only its column totals, keyed by year.
    sums_by_year = {}
    for file in files:
        match = year_pattern.search(os.path.basename(file))
        if match is None:
            # The glob's "????" accepts any four characters, not only digits.
            print(f'Skipping {file}: no four-digit year in the file name.')
            continue
        df = pd.read_stata(file)
        voix_cols = [col for col in df.columns if col.startswith('voix')]
        sums_by_year[int(match.group(1))] = df[voix_cols + ['inscrits']].sum()

    if not sums_by_year:
        # Nothing to aggregate (e.g. the download or extraction step failed).
        print(f'No usable {prefix} files found in data/dta; nothing written.')
        continue

    # Union of the voix columns seen in any year, in a deterministic order.
    all_voix_cols = sorted(
        {col for sums in sums_by_year.values() for col in sums.index if col != 'inscrits'}
    )
    year_columns = sorted(sums_by_year)

    rows = []
    for col in all_voix_cols:
        totals = {'variable': col}
        shares = {'variable': f"p_{col}"}
        for year in year_columns:
            sums = sums_by_year[year]
            tot_inscrits = sums['inscrits']
            # A column absent from a given year's file counts as 0.
            total = sums.get(col, 0)
            totals[str(year)] = total
            shares[str(year)] = total / tot_inscrits if tot_inscrits != 0 else 0
        rows.extend([totals, shares])

    # NOTE: the name `leg_all` is kept for compatibility with any later cell;
    # it holds the table of the *current* prefix, including "pres".
    year_names = [str(year) for year in year_columns]
    leg_all = (
        pd.DataFrame(rows, columns=['variable'] + year_names)
        .astype({name: 'float64' for name in year_names})
        .sort_values('variable')
        .reset_index(drop=True)
    )

    # Save to parquet
    leg_all.to_parquet(f"data/{prefix}_all.parquet")