In [1]:
%pip install pandas

import gc
from enum import Enum
import pandas as pd

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m24.1.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


Note: you may need to restart the kernel to use updated packages.


In [2]:
# Input files read by this notebook: the tax data (parquet) and the table
# describing its columns (CSV).
input_parquet_file_path = "../data/interm/2.2_taxdata.parquet"
input_columns_file_path = "../data/interm/3.0_columns.csv"

# Common prefix of every output file; a lower-cased collector name and a file
# extension are appended to it when the results are saved.
output_file_path = "../data/interm/4.0_"

In [3]:
class RevenueCollector(Enum):
    """Kinds of entity a tax column's revenue can be assigned to.

    Each member's value is the plain string that is compared against the
    ``REVENUE_COLLECTOR`` column of the columns table when selecting the
    tax columns to export.
    """

    COMMUNE = "COMMUNE"
    SYNDICAT = "SYNDICAT"
    INTERCOMMUNE = "INTERCOMMUNE"
    DEPARTMENT = "DEPARTMENT"
    REGION = "REGION"
    STATE = "STATE"
    OTHER = "OTHER"
    MGP = "MGP"
    UNAVAILABLE = "UNAVAILABLE"

# Load the tax data and the table describing its columns.
data_df = pd.read_parquet(input_parquet_file_path)
columns_df = pd.read_csv(input_columns_file_path)

# Normalise ANNEE to an integer year: render the values as text, parse them
# as dates, and keep only the year component.
data_df['ANNEE'] = pd.to_datetime(data_df['ANNEE'].astype(str)).dt.year

In [4]:
# Name of the cleaned INSEE code column; it is exported next to the identifier columns.
code_insee = 'Clean code INSEE'

# Identifier columns that are copied into every output file.
identifiers = [
    'code INSEE',
    'LABEL',
    'DEPARTEMENT_LABEL',
    'DEPARTEMENT',
    'COMMUNE',
    'ANNEE',
    'POPULATION TOTALE (MUNICIPALE ET COMPTEE A PART)  - Source saisies application FDL',
]

# Franc-to-euro conversion factor (amounts in francs are divided by it).
# Before 2002 France was using the French franc.
# https://www.ecb.europa.eu/euro/exchange/fr/html/index.en.html
conversion_factor = 6.55957

# The franc-to-euro conversion is applied, and the results are saved, in the next cell.

In [5]:
# Revenue collectors that each get their own output file.
# OTHER and UNAVAILABLE are intentionally not in this list, so they are not exported.
to_save_data_types = [RevenueCollector.COMMUNE.value,
                      RevenueCollector.SYNDICAT.value,
                      RevenueCollector.INTERCOMMUNE.value,
                      RevenueCollector.DEPARTMENT.value,
                      RevenueCollector.REGION.value,
                      RevenueCollector.STATE.value,
                      RevenueCollector.MGP.value]

for data_type in to_save_data_types:
    # Titles of the columns whose revenue goes to the current collector
    tax_mask = columns_df['REVENUE_COLLECTOR'] == data_type
    tax_columns = columns_df.loc[tax_mask, 'TITRE'].values

    # Make sure the data in the tax columns is numeric.
    # Values that cannot be parsed (and missing values) become 0.
    data_df[tax_columns] = data_df[tax_columns].apply(pd.to_numeric, errors='coerce').fillna(0)

    # Identifier columns, the cleaned INSEE code and the collector's tax columns
    columns = identifiers + [code_insee] + list(tax_columns)

    # Work on an explicit copy: the frame is modified below, and writing through
    # `.loc` on a bare column selection triggers pandas' SettingWithCopyWarning.
    tax_revenue_df = data_df[columns].copy()

    # Identify rows where the year is before 2002
    # Apply the euro-franc conversion to the tax columns for those rows: https://en.wikipedia.org/wiki/French_franc#:~:text=of%20the%20Euro.-,Economic%20and%20Monetary%20Union,January%20and%201%20March%202002.
    year_mask = tax_revenue_df['ANNEE'] < 2002
    for col in tax_columns:
        # Only non-zero amounts of pre-2002 rows are rewritten
        rows_to_convert = year_mask & (tax_revenue_df[col] != 0)
        tax_revenue_df.loc[rows_to_convert, col] = tax_revenue_df.loc[rows_to_convert, col] / conversion_factor

    # Save the selection of columns to a new parquet file
    tax_revenue_df.to_parquet(output_file_path + data_type.lower() + '.parquet')

    # Also save a reproducible sample of each file as CSV for quick inspection.
    # The sample size is capped at the number of rows, because `sample(n=...)`
    # raises a ValueError when asked for more rows than the frame holds.
    sample = tax_revenue_df.sample(n=min(1000, len(tax_revenue_df)), random_state=42)
    sample.to_csv(output_file_path + data_type.lower() + '_sample.csv')

In [6]:
# Drop the references to both input frames, then run a garbage-collection pass.
del data_df, columns_df

gc.collect()

0