In [10]:
!pip install pandas pyreadr
import pandas as pd
import pyreadr
import re
import requests
import os
import shutil
import zipfile



In [3]:
# Identification des fichiers csv
def extract_strings_from_webpage(url):
    response = requests.get(url) 
    if response.status_code == 200:
        strings = re.findall(r'"([^"]*)"', response.text)
        return strings
    else:
        print(f"Failed to fetch the webpage. Status code: {response.status_code}")
        return []

webpage_url = "https://unehistoireduconflitpolitique.fr/telecharger.html"
extracted_strings = extract_strings_from_webpage(webpage_url)

download_links = [item for item in extracted_strings if item.endswith("csv.zip") or item.endswith("csp.zip")]

In [4]:
# Téléchargement des fichiers
os.makedirs('data_zip', exist_ok=True)

for link in download_links:
    try:
        file_name = os.path.join('data_zip', os.path.basename(link))
        response = requests.get(link)
        with open(file_name, 'wb') as file:
            file.write(response.content)
        print(f"Downloaded: {file_name}")
    except Exception as e:
        print(f"Error downloading {link}: {e}")
        
print("Download completed")

Downloaded: data_zip/pres1848_csv.zip
Downloaded: data_zip/pres1965_csv.zip
Downloaded: data_zip/pres1969_csv.zip
Downloaded: data_zip/pres1974_csv.zip
Downloaded: data_zip/pres1981_csv.zip
Downloaded: data_zip/pres1988_csv.zip
Downloaded: data_zip/pres1995_csv.zip
Downloaded: data_zip/pres2002_csv.zip
Downloaded: data_zip/pres2007_csv.zip
Downloaded: data_zip/pres2012_csv.zip
Downloaded: data_zip/pres2017_csv.zip
Downloaded: data_zip/pres2022_csv.zip
Downloaded: data_zip/ref1793_csv.zip
Downloaded: data_zip/ref1795_csv.zip
Downloaded: data_zip/ref1946_csv.zip
Downloaded: data_zip/ref1992_csv.zip
Downloaded: data_zip/ref2005_csv.zip
Downloaded: data_zip/leg1848_csv.zip
Downloaded: data_zip/leg1849_csv.zip
Downloaded: data_zip/leg1871fev_csv.zip
Downloaded: data_zip/leg1871juil_csv.zip
Downloaded: data_zip/leg1876_csv.zip
Downloaded: data_zip/leg1881_csv.zip
Downloaded: data_zip/leg1885_csv.zip
Downloaded: data_zip/leg1889_csv.zip
Downloaded: data_zip/leg1893_csv.zip
Downloaded: data_zi

In [5]:
# Extraction des résultats électoraux
os.makedirs('data_csv/elections/pres', exist_ok=True)
os.makedirs('data_csv/elections/leg', exist_ok=True)
os.makedirs('data_csv/elections/ref', exist_ok=True)

files = os.listdir('data_zip')
for prefix in ['pres', 'leg', 'ref']:
    for file in files:
        if file.startswith(prefix) and file.endswith('.zip'):
            zip_file_path = os.path.join('data_zip', file)
            prefix_dir = os.path.join('data_csv/elections', prefix)
            with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
                for member in zip_ref.infolist():
                    if member.filename.lower().endswith('.csv'):
                        target_path = os.path.join(prefix_dir, os.path.basename(member.filename))
                        with zip_ref.open(member) as source, open(target_path, 'wb') as dest:
                            shutil.copyfileobj(source, dest)
            print(f"Extracted CSV files from {file} to {prefix_dir}")
            
print("Extraction completed.")

Extracted CSV files from pres2017_csv.zip to data_csv/elections/pres
Extracted CSV files from pres2002_csv.zip to data_csv/elections/pres
Extracted CSV files from pres2022_csv.zip to data_csv/elections/pres
Extracted CSV files from pres1969_csv.zip to data_csv/elections/pres
Extracted CSV files from pres2012_csv.zip to data_csv/elections/pres
Extracted CSV files from pres1848_csv.zip to data_csv/elections/pres
Extracted CSV files from pres2007_csv.zip to data_csv/elections/pres
Extracted CSV files from pres1995_csv.zip to data_csv/elections/pres
Extracted CSV files from pres1965_csv.zip to data_csv/elections/pres
Extracted CSV files from pres1974_csv.zip to data_csv/elections/pres
Extracted CSV files from pres1981_csv.zip to data_csv/elections/pres
Extracted CSV files from pres1988_csv.zip to data_csv/elections/pres
Extracted CSV files from leg1981_csv.zip to data_csv/elections/leg
Extracted CSV files from leg1849_csv.zip to data_csv/elections/leg
Extracted CSV files from leg1936_csv.z

In [6]:
# Nettoyage des résultats électoraux
for root, dirs, files in os.walk('data_csv'):
    for file_name in files:
        if file_name.startswith("._"):
            file_path = os.path.join(root, file_name)
            try:
                os.remove(file_path)
                print(f"Deleted file: {file_path}")
            except Exception as e:
                print(f"Error deleting file {file_path}: {e}")

print("Deletion of extra files completed")

os.rename("data_csv/elections/pres", "data_csv/elections/presidentielles")
os.rename("data_csv/elections/leg", "data_csv/elections/legislatives")
os.rename("data_csv/elections/ref", "data_csv/elections/referendums")

print("Folder renaming completed")

Deleted file: data_csv/elections/leg/._leg1967comm.csv
Deleted file: data_csv/elections/leg/._leg1885comm.csv
Deleted file: data_csv/elections/leg/._leg1849comm.csv
Deleted file: data_csv/elections/leg/._leg1910comm.csv
Deleted file: data_csv/elections/leg/._leg1997comm.csv
Deleted file: data_csv/elections/leg/._leg1962comm.csv
Deleted file: data_csv/elections/leg/._leg2017comm.csv
Deleted file: data_csv/elections/leg/._leg1978comm.csv
Deleted file: data_csv/elections/leg/._leg1876comm.csv
Deleted file: data_csv/elections/leg/._leg1928comm.csv
Deleted file: data_csv/elections/leg/._leg1914comm.csv
Deleted file: data_csv/elections/leg/._leg1945comm.csv
Deleted file: data_csv/elections/leg/._leg2002comm.csv
Deleted file: data_csv/elections/leg/._leg1973comm.csv
Deleted file: data_csv/elections/leg/._leg1947comm.csv
Deleted file: data_csv/elections/leg/._leg1951comm.csv
Deleted file: data_csv/elections/leg/._leg1889comm.csv
Deleted file: data_csv/elections/leg/._leg1986comm.csv
Deleted fi

In [7]:
# Extraction des contrôles
os.makedirs('data_csv/controles', exist_ok=True)

zip_files = [f for f in os.listdir('data_zip') if f.endswith('.zip')]
for zip_file in zip_files:
    if zip_file.startswith(('pres', 'leg', 'ref')):
        continue
    zip_path = os.path.join('data_zip', zip_file)
    try:
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            file_list = [file for file in zip_ref.namelist() if not file.startswith('__MACOSX')]
            zip_ref.extractall('data_csv/controles', members=file_list)    
            print(f"Files extracted from {zip_file}:")
            for extracted_file in file_list:
                print(extracted_file)
    except Exception as e:
        print(f"Error extracting {zip_file}: {e}")

print("Extraction complete.")

Files extracted from Nationalites_csv.zip:
Nationalites_csv/
Nationalites_csv/naticommunes.csv
Nationalites_csv/natidepartements.csv
Nationalites_csv/etrangerscommunes.csv
Files extracted from Age_csp.zip:
Age_csp/
Age_csp/agesexdepartements.csv
Age_csp/agesexcommunes.csv
Age_csp/menagescommunes.csv
Age_csp/menagesdepartements.csv
Files extracted from Alphabetisation_csv.zip:
alphabetisationcommunes.csv
Files extracted from Capital_immobilier_csv.zip:
Capital_immobilier_csv/
Capital_immobilier_csv/isfcommunes.csv
Capital_immobilier_csv/capitalimmobilierdepartements.csv
Capital_immobilier_csv/capitalimmobilier.csv
Capital_immobilier_csv/basesfiscalescommunes.csv
Capital_immobilier_csv/terrescommunes.csv
Capital_immobilier_csv/basesfiscalesdepartements.csv
Capital_immobilier_csv/capitalimmobiliercommunes.csv
Files extracted from Enseignement_prive_csv.zip:
Enseignement_prive_csv/
Enseignement_prive_csv/religiositecantons1791.csv
Enseignement_prive_csv/religiositecommunes1791.csv
Enseigne

In [8]:
# Nettoyage des contrôles
folders = [f for f in os.listdir('data_csv/controles') if os.path.isdir(os.path.join('data_csv/controles', f))]
for folder in folders:
    if folder.endswith('_csv'):
        old_path = os.path.join('data_csv/controles', folder)
        new_folder_name = folder[:-4]
        new_path = os.path.join('data_csv/controles', new_folder_name)
        os.rename(old_path, new_path)
        print(f"Renamed: {folder} -> {new_folder_name}")
        
print("Folder renaming complete.")

Renamed: Capital_immobilier_csv -> Capital_immobilier
Renamed: CSP_csv -> CSP
Renamed: Enseignement_prive_csv -> Enseignement_prive
Renamed: Proprietaires_csv -> Proprietaires
Renamed: Diplomes_csv -> Diplomes
Renamed: Revenus_csv -> Revenus
Renamed: Nationalites_csv -> Nationalites
Renamed: Taille_agglo_commune_csv -> Taille_agglo_commune
Folder renaming complete.


In [None]:
shutil.rmtree('data_zip')
print('Downloaded data removed.')

In [None]:
# Conversion au format R
def csv_to_rda(input_csv_path, output_rda_path):
    # Read CSV file
    data = pd.read_csv(input_csv_path, low_memory=False)

    # Write to RDA file
    pyreadr.write_rdata(output_rda_path, data, compress='gzip')

def convert_csv_files(input_folder, output_folder):
    # Walk through the directory tree
    for root, dirs, files in os.walk(input_folder):
        for file in files:
            if file.endswith(".csv"):
                # Build input and output paths
                input_csv_path = os.path.join(root, file)
                relative_path = os.path.relpath(input_csv_path, input_folder)
                output_rda_path = os.path.join(output_folder, os.path.splitext(relative_path)[0] + ".rda")

                # Create output directory if it doesn't exist
                os.makedirs(os.path.dirname(output_rda_path), exist_ok=True)

                # Convert CSV to RDA
                csv_to_rda(input_csv_path, output_rda_path)

if __name__ == "__main__":
    input_folder = "data_csv"
    output_folder = "data_rda"

    convert_csv_files(input_folder, output_folder)

In [None]:
shutil.rmtree('data_csv')
print('Extracted data removed.')