# Data Processing and Metadata Enrichment Pipeline

This notebook guides you through a step-by-step process for loading, processing, and analyzing a dataset using a combination of custom scripts. The workflow includes loading data, creating metadata, filtering data, and performing fuzzy matching.

## Steps:
1. Set up environment and import necessary modules.
2. Define and check dataset directory.
3. Load and cache dataframes.
4. Create and display metadata.
5. Fetch, compare, and configure data fields.
6. Filter and process dataframes.
7. Perform fuzzy matching.
8. Save processed data.

## Step 1: Setup Environment

We begin by importing the necessary libraries and functions.


In [2]:

# Import necessary libraries and custom modules
import os
import json
import pandas as pd
import gc
import numpy as np



# Now, import the necessary custom functions from the scripts
from src.scripts.df_generator import get_dataset_directory, check_directory_exists, load_or_cache_dataframes, show_loaded_dfs
from src.scripts.df_metadata import display_metadata_dfs, create_metadata_dfs, enrich_metadata_df
from src.scripts.fetch_data_fields import fetch_and_compare_data_fields
from src.scripts.build_data_fields_config import build_data_fields_config


# Change the current working directory to 'src'
os.chdir(os.path.join(os.getcwd(), 'src'))

FileNotFoundError: [WinError 2] Le fichier spécifié est introuvable: 'c:\\Git\\Mission3\\src\\src'

## Step 2: Define and Check Dataset Directory

Define the dataset directory and ensure it exists. This step is crucial as it sets the working directory for subsequent operations.


In [None]:
# Define the dataset directory
notebook_directory =os.getcwd()  # This points to the root where the notebook is
dataset_directory = os.path.join(notebook_directory, 'dataset')

# Check if the dataset directory exists
if not check_directory_exists(dataset_directory):
    print(f"Error: Directory '{dataset_directory}' does not exist.")
else:
    print(f"Dataset directory found: {dataset_directory}")


## Step 3: Load and Cache DataFrames

Load the data from the dataset directory into pandas DataFrames. The data can be loaded from cache or directly from the source files if the cache is not available.


In [None]:
# Directory to store cached DataFrames
CACHE_DIR = os.path.join(notebook_directory, 'data', 'cache') 

# Optionally, you can define a list of specific files to process
specific_files = ['fr.openfoodfacts.org.products.csv']  # Set to None to process all files

# Load DataFrames from cache or source files
dfs = load_or_cache_dataframes(dataset_directory, CACHE_DIR, file_list=specific_files, separator='\t')

# Check if DataFrames are loaded
if not dfs:
    print("No DataFrames were loaded. Exiting.")
else:
    print(f"Loaded DataFrames: {list(dfs.keys())}")
    show_loaded_dfs(dfs, df_names=None)


## Step 4: Create and Display Metadata

Generate metadata for the loaded DataFrames and display it to understand the structure and content of the data.


In [None]:
# Create metadata DataFrames
metadata_dfs = create_metadata_dfs(dfs)

# Check if metadata DataFrames were created
if not metadata_dfs:
    print("No metadata DataFrames were created. Exiting.")
else:
    print(f"Created Metadata DataFrames: {list(metadata_dfs.keys())}")
    display_metadata_dfs(metadata_dfs)


## Step 5: Fetch, Compare, and Configure Data Fields

Fetch and compare data fields from the dataset, and build the necessary configuration files.


In [None]:
DATA_DIR = os.path.join(notebook_directory,'data')

# Run the fetch and compare data fields script
fetch_and_compare_data_fields(DATA_DIR)

# Build the config file
build_data_fields_config()


## Step 6: Filter and Process DataFrames

Filter the metadata and corresponding DataFrames, and save the filtered data.


In [None]:
# Load the config.json
script_dir = os.path.join(notebook_directory,'scritps')
config_path = os.path.join(notebook_directory, 'config', 'data_fields_config.json')

with open(config_path, 'r') as file:
    config = json.load(file)

# Enrich the metadata DataFrame
combined_metadata = pd.concat(metadata_dfs.values(), keys=metadata_dfs.keys()).reset_index(level=0).rename(columns={'level_0': 'DataFrame'})
combined_metadata = enrich_metadata_df(combined_metadata, config)


# Save the combined metadata DataFrame to a CSV file
output_dir = os.path.join(notebook_directory, 'data')
os.makedirs(output_dir, exist_ok=True)

combined_metadata_path = os.path.join(output_dir, 'combined_metadata.csv')
combined_metadata.to_csv(combined_metadata_path, index=False)
print(f"Combined metadata {combined_metadata.shape} has been saved or updated.")


## Step 7: Identify columns cluster

Checking cluster of columns based on Duplicate(%) and Fill(%)



In [None]:
from src.scripts.plot_metadata_clusters import run_dash_app

# Run the Dash app  
run_dash_app(combined_metadata)


In [None]:
from src.scripts.df_filtering import filter_metadata_and_dataframes, process_dataframe
from src.scripts.df_fuzzywuzzy import fuzzy_dataframe


# Specify your datetime checks as a list of tuples
datetime_checks = [
    # ('created_t', 'created_datetime'),
    # ('last_modified_t', 'last_modified_datetime')
]

# Specify your field frequency checks as a list of tuples
field_checks = [
    (['countries', 'countries_tags', 'countries_fr'], 'countries'),
    (['ingredients_from_palm_oil_n', 'ingredients_that_may_be_from_palm_oil_n'], 'ingredients_palm_oil'),
    (['nutrition_grade_fr', 'nutrition-score-fr_100g', 'nutrition-score-uk_100g'], 'nutrition'),
    #(['brands_tags', 'brands'], 'brands'),
    #(['additives_n', 'additives', 'additives_tags', 'additives_fr'], 'additives'),
    (['states', 'states_tags', 'states_fr'], 'states')
]

# Columns to check for at least one non-null value
columns_to_check = [
    'nutrition_grade_fr', 'energy_100g', 'fat_100g', 'saturated-fat_100g',
    'trans-fat_100g', 'cholesterol_100g', 'carbohydrates_100g', 'sugars_100g',
    'fiber_100g', 'proteins_100g', 'salt_100g', 'sodium_100g', 'vitamin-a_100g',
    'vitamin-c_100g', 'calcium_100g', 'iron_100g', 'nutrition-score-fr_100g',
    'nutrition-score-uk_100g'
]

# Fields to be deleted after anaylysis
fields_to_delete = ['url', 'created_t', 'last_modified_t', 'states', 'states_tags', 'states_fr', 'countries', 'countries_fr', 'brands', 'additives_n', 'additives_tags', 'additives_fr', 'creator','ingredients_from_palm_oil_n', 'ingredients_that_may_be_from_palm_oil_n']

# Now process your specific DataFrame
df_name = 'fr.openfoodfacts.org.products'

# Filter and process DataFrame in one step
if df_name in dfs:
    combined_metadata, filtered_dfs = filter_metadata_and_dataframes(combined_metadata, dfs)
    process_dataframe(filtered_dfs[df_name], log_dir='logs', temp_dir='temp', datetime_checks=datetime_checks, field_checks=field_checks)
    fuzzy_dataframe(temp_dir='temp', config_dir='config', checks=field_checks, threshold=90)

    # Drop rows where all specified columns are null
    filtered_dfs[df_name].dropna(subset=columns_to_check, how='all', inplace=True)

    # Drop duplicates after filtering rows
    filtered_dfs[df_name].drop_duplicates(inplace=True)

    # Delete the specified columns from combined_metadata and update the related DataFrame
    combined_metadata = combined_metadata[~combined_metadata['Column Name'].isin(fields_to_delete)]
    filtered_dfs[df_name] = filtered_dfs[df_name][combined_metadata['Column Name']]

    # Save the processed DataFrame to the dataset directory
    dataset_path = os.path.join('dataset', f'processed_{df_name}.csv')
    filtered_dfs[df_name].to_csv(dataset_path, index=False)
    
    # Save the updated metadata to the data directory
    metadata_path = os.path.join('data', f'processed_metadata.csv')
    combined_metadata.to_csv(metadata_path, index=False)
    
    print(f"Processed DataFrame '{df_name}' and metadata have been saved.")
else:
    print(f"DataFrame '{df_name}' not found in the loaded DataFrames.")


In [1]:
from src.scripts.plot_nutriscore import run_dash_app_nutriscore, safe_eval
# Import necessary libraries and custom modules
import os
import pandas as pd


# Change the current working directory to 'src'
os.chdir(os.path.join(os.getcwd(), 'src'))
notebook_directory =os.getcwd()

# Assuming notebook_directory is already defined in your notebook
nutriscore_directory = os.path.join(notebook_directory, 'temp', 'nutrition_combination_log.csv')
nutriscore = pd.read_csv(nutriscore_directory)



run_dash_app_nutriscore(nutriscore)


In [2]:
nutriscore.head()

Unnamed: 0,nutrition_combination,Frequency,nutrition_grade_fr,nutrition-score-fr_100g,nutrition-score-uk_100g
1,"(b, 0.0, 0.0)",12667,b,0.0,0.0
2,"(b, 1.0, 1.0)",11014,b,1.0,1.0
3,"(d, 14.0, 14.0)",10518,d,14.0,14.0
4,"(b, 2.0, 2.0)",10131,b,2.0,2.0
5,"(a, -1.0, -1.0)",8783,a,-1.0,-1.0
