In [1]:
!pip install chardet
!pip install pandas pyreadr
!pip install pyarrow
import chardet
import os
import pandas as pd
import pyarrow.feather as feather
import pyreadr
import re
import requests
import s3fs
import shutil
from tqdm import tqdm
import zipfile

Collecting chardet
  Downloading chardet-5.2.0-py3-none-any.whl.metadata (3.4 kB)
Downloading chardet-5.2.0-py3-none-any.whl (199 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.4/199.4 kB[0m [31m15.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: chardet
Successfully installed chardet-5.2.0
Collecting pyreadr
  Downloading pyreadr-0.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading pyreadr-0.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (440 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m440.9/440.9 kB[0m [31m11.3 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hInstalling collected packages: pyreadr
Successfully installed pyreadr-0.5.0


In [2]:
# Identify the CSV archives to download
def extract_strings_from_webpage(url, timeout=30):
    """Fetch a web page and return every double-quoted string found in it.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, `requests.get` can block forever on a stalled connection.

    Returns:
        A list with the content of each `"..."` pair in the page body (quotes
        excluded, empty strings included), in order of appearance. An empty
        list is returned when the server answers with a status other than 200.

    Raises:
        requests.RequestException: If the request itself fails (connection
            error, timeout, ...). These are not caught here.
    """
    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        print(f"Failed to fetch the webpage. Status code: {response.status_code}")
        return []
    return re.findall(r'"([^"]*)"', response.text)

webpage_url = "https://unehistoireduconflitpolitique.fr/telecharger.html"
extracted_strings = extract_strings_from_webpage(webpage_url)
# Keep only the quoted strings that end with one of the archive suffixes
archive_suffixes = ("csv.zip", "csp.zip")
download_links = [candidate for candidate in extracted_strings if candidate.endswith(archive_suffixes)]
print(f"Identified {len(download_links)} files to download.")

Identified 67 files to download.


In [3]:
# Download the archives into data_zip/
os.makedirs('data_zip', exist_ok=True)

# The context manager closes the progress bar even if the loop is interrupted.
with tqdm(total=len(download_links), desc="Downloading", unit="file") as progress_bar:
    for link in download_links:
        file_name = os.path.join('data_zip', os.path.basename(link))
        try:
            # stream=True avoids holding a whole archive in memory;
            # timeout keeps a stalled server from blocking the notebook forever.
            with requests.get(link, stream=True, timeout=60) as response:
                # Fail on 4xx/5xx instead of saving an error page as a .zip file
                response.raise_for_status()
                with open(file_name, 'wb') as file:
                    for chunk in response.iter_content(chunk_size=1024 * 1024):
                        file.write(chunk)
            progress_bar.update(1)
        except Exception as e:
            # Do not leave a truncated archive behind for the extraction step
            if os.path.exists(file_name):
                os.remove(file_name)
            print(f"Error downloading {link}: {e}")

Downloading: 100%|██████████| 67/67 [00:38<00:00,  1.74file/s]


In [4]:
# Compute the total size of the downloaded files
try:
    # Sum the size of every file found anywhere under data_zip/
    total_size = sum(
        os.path.getsize(os.path.join(dirpath, name))
        for dirpath, _dirnames, names in os.walk('data_zip')
        for name in names
    )
    total_size_mb = total_size / (1024 * 1024)
    print(f'Total size of downloaded files: {total_size_mb:.2f} MB.')
except Exception as e:
    print(f'An error occurred: {e}')

Total size of downloaded data: 2063.75 MB.


In [5]:
# Extract the election results (archives whose name starts with pres / leg / ref)
election_prefixes = ['pres', 'leg', 'ref']

# One output folder per archive-name prefix
for prefix in election_prefixes:
    prefix_dir = os.path.join('data_csv', 'Elections_' + prefix)
    os.makedirs(prefix_dir, exist_ok=True)

zip_files = [name for name in os.listdir('data_zip') if name.endswith('.zip')]
total_zip_files = sum(1 for prefix in election_prefixes for name in zip_files if name.startswith(prefix))
progress_bar = tqdm(total=total_zip_files, desc="Extracting", unit="file")

for prefix in election_prefixes:
    for file in zip_files:
        # zip_files only holds '.zip' names, so the prefix is the only filter left
        if not file.startswith(prefix):
            continue
        try:
            zip_file_path = os.path.join('data_zip', file)
            prefix_dir = os.path.join('data_csv', 'Elections_' + prefix)
            with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
                csv_members = [m for m in zip_ref.infolist() if m.filename.lower().endswith('.csv')]
                for member in csv_members:
                    # Flatten the archive layout: only the member's base name is kept
                    target_path = os.path.join(prefix_dir, os.path.basename(member.filename))
                    with zip_ref.open(member) as source, open(target_path, 'wb') as dest:
                        shutil.copyfileobj(source, dest)
            # The archive is deleted only once all its CSV members were written
            os.remove(zip_file_path)
            progress_bar.update(1)
        except Exception as e:
            print(f"Error converting {file}: {e}")

progress_bar.close()

Extracting: 100%|██████████| 57/57 [00:10<00:00,  5.31file/s]


In [6]:
# Clean up the extracted election results: delete every file whose name starts with "._"
for root, dirs, files in os.walk('data_csv'):
    extra_paths = [os.path.join(root, name) for name in files if name.startswith("._")]
    for file_path in extra_paths:
        try:
            os.remove(file_path)
        except Exception as e:
            print(f"Error deleting file {file_path}: {e}")
print("Deletion of extra files completed.")

Deletion of extra files completed.


In [7]:
# Extract the controls (every archive that is not an election result)
election_prefixes = ('pres', 'leg', 'ref')
zip_files = [f for f in os.listdir('data_zip') if f.endswith('.zip')]
control_zip_files = [f for f in zip_files if not f.startswith(election_prefixes)]
total_zip_files = len(control_zip_files)
progress_bar = tqdm(total=total_zip_files, desc="Extracting", unit="file")

for zip_file in control_zip_files:
    zip_path = os.path.join('data_zip', zip_file)
    try:
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            # Skip the __MACOSX metadata entries
            file_list = [file for file in zip_ref.namelist() if not file.startswith('__MACOSX')]
            zip_ref.extractall('data_csv', members=file_list)
        # Delete the archive only after its handle is closed: removing a file
        # that is still open fails on platforms that lock open files.
        os.remove(zip_path)
        progress_bar.update(1)
    except Exception as e:
        print(f"Error extracting {zip_file}: {e}")

progress_bar.close()

Extracting: 100%|██████████| 10/10 [00:30<00:00,  3.00s/file]


In [8]:
# Tidy up the extracted controls: strip the trailing "_csv" from folder names
folders = [entry for entry in os.listdir('data_csv') if os.path.isdir(os.path.join('data_csv', entry))]
for folder in (name for name in folders if name.endswith('_csv')):
    old_path = os.path.join('data_csv', folder)
    new_folder_name = folder[:-len('_csv')]
    new_path = os.path.join('data_csv', new_folder_name)
    os.rename(old_path, new_path)

# NOTE(review): assumes data_csv/Diplomes/ exists once the renames are done — confirm
shutil.move('data_csv/alphabetisationcommunes.csv', 'data_csv/Diplomes/')

print("Folder cleaning complete.")

Folder cleaning complete.


In [9]:
# Compute the total size of the extracted data
try:
    # Sum the size of every file found anywhere under data_csv/
    total_size = sum(
        os.path.getsize(os.path.join(dirpath, name))
        for dirpath, _dirnames, names in os.walk('data_csv')
        for name in names
    )
    total_size_mb = total_size / (1024 * 1024)
    print(f'Total size of extracted data: {total_size_mb:.2f} MB.')
except Exception as e:
    print(f'An error occurred: {e}')

Total size of extracted data: 6580.59 MB.


In [10]:
# Remove the download directory.
# NOTE: the extraction cells delete each archive only after it was extracted
# successfully, so whatever is still in data_zip at this point (e.g. archives
# whose extraction raised an error) is discarded here as well.
shutil.rmtree('data_zip')
print('Downloaded data removed.')

Downloaded data removed.


In [11]:
# Convert to Feather format (with automatic encoding detection)
# NOTE(review): the original heading also said "without compression", but
# write_feather is called below with its default `compression` argument —
# confirm whether compression='uncompressed' was intended.
def _detect_encoding(path, chunk_size=1024 * 1024):
    """Return chardet's guess of the text encoding of the file at `path`.

    The file is fed to the detector chunk by chunk instead of being read in
    one go, so memory use is bounded by `chunk_size`, and reading stops as
    soon as the detector reports that it is done.

    Args:
        path: Path of the file to inspect.
        chunk_size: Number of bytes read per iteration.

    Returns:
        The value of the detector result's 'encoding' entry (may be None).
    """
    detector = chardet.UniversalDetector()
    with open(path, 'rb') as f:
        for chunk in iter(lambda: f.read(chunk_size), b''):
            detector.feed(chunk)
            if detector.done:
                break
    detector.close()
    return detector.result['encoding']


total_csv_files = sum(
    file.endswith(".csv")
    for root, dirs, files in os.walk('data_csv')
    for file in files
)
progress_bar = tqdm(total=total_csv_files, desc="Converting", unit="file")

for root, dirs, files in os.walk('data_csv'):
    for file in files:
        if not file.endswith(".csv"):
            continue
        input_csv_path = os.path.join(root, file)
        try:
            # Mirror the data_csv/ layout under data_feather/
            relative_path = os.path.relpath(input_csv_path, 'data_csv')
            output_feather_path = os.path.join('data_feather', os.path.splitext(relative_path)[0] + ".feather")
            os.makedirs(os.path.dirname(output_feather_path), exist_ok=True)
            # Detect the encoding
            encoding = _detect_encoding(input_csv_path)
            # Read the CSV file
            data = pd.read_csv(input_csv_path, low_memory=False, encoding=encoding)
            # Write the Feather file
            feather.write_feather(data, output_feather_path)
            # Report progress
            progress_bar.update(1)
            # Delete the source CSV only once its Feather copy has been written
            os.remove(input_csv_path)
        except Exception as e:
            print(f"Error converting {input_csv_path}: {e}")

progress_bar.close()

Converting: 100%|██████████| 110/110 [2:11:57<00:00, 71.97s/file]   


In [12]:
# Compute the total size of the converted data
try:
    # Sum the size of every file found anywhere under data_feather/
    total_size = sum(
        os.path.getsize(os.path.join(dirpath, name))
        for dirpath, _dirnames, names in os.walk('data_feather')
        for name in names
    )
    total_size_mb = total_size / (1024 * 1024)
    print(f'Total size of converted data: {total_size_mb:.2f} MB.')
except Exception as e:
    print(f'An error occurred: {e}')

Total size of converted data: 4716.75 MB.


In [13]:
# Remove the extraction directory.
# NOTE: the conversion cell deletes each CSV only after its Feather file was
# written, so any CSV whose conversion raised an error is still in data_csv
# at this point and is discarded here.
shutil.rmtree('data_csv')
print('Extracted data removed.')

Extracted data removed.


In [15]:
# Datalab setup: the S3 endpoint host is read from the AWS_S3_ENDPOINT environment variable
S3_ENDPOINT_URL = f"https://{os.environ['AWS_S3_ENDPOINT']}"
fs = s3fs.S3FileSystem(client_kwargs=dict(endpoint_url=S3_ENDPOINT_URL))

In [16]:
# Upload the Feather files to the datalab bucket
source_directory = 'data_feather'
bucket_name = 'maeldieudonne'
destination_directory = bucket_name + '/diffusion/'

local_files = [
    os.path.join(root, file)
    for root, dirs, files in os.walk(source_directory)
    for file in files
]
total_files = len(local_files)

# The context manager closes the progress bar even if an upload raises.
with tqdm(total=total_files, desc="Uploading", unit="file") as progress_bar:
    for source_path in local_files:
        relative_path = os.path.relpath(source_path, source_directory)
        # Object keys use '/' whatever the local path separator is
        destination_path = destination_directory + relative_path.replace(os.sep, '/')
        # The files are binary Feather, so they must not be described as
        # UTF-8 CSV text (the former content_type / encoding arguments).
        fs.put(source_path, destination_path)
        progress_bar.update(1)

Uploading: 100%|██████████| 110/110 [02:11<00:00,  1.19s/file]


In [None]:
# Remove the conversion directory (the local Feather files written under data_feather/)
shutil.rmtree('data_feather')
print('Converted data removed.')