<a href="https://colab.research.google.com/github/JITHIN-ANTONY-JOSEPH/ERP_11358080/blob/main/2_Recipe1M_Ingredient_Match.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Mounting Google Drive to access the files

In [None]:
# Mount Google Drive at /content/drive; the dataset paths used in this notebook are under that directory
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


### Importing required libraries and both the datasets

In [None]:
import pandas as pd
import json
import re
from concurrent.futures import ProcessPoolExecutor

In [None]:
# Load the FlavorGraph nodes CSV.
# Adjust the path as needed; this one points to my personal Google Drive.
flavorgraph_df = pd.read_csv('/content/drive/My Drive/ERP/Dataset/nodes_191120.csv')

# Load the Recipe1M dataset from the JSON file, which holds one JSON object per line.
# Adjust the path as needed; this one points to my personal Google Drive.
# The encoding is passed explicitly so decoding does not depend on the runtime's locale,
# and blank lines (e.g. a trailing empty line) are skipped instead of crashing json.loads.
with open('/content/drive/My Drive/ERP/Processed_Layer1.json', 'r', encoding='utf-8') as file:
    recipe1m_data = [json.loads(line) for line in file if line.strip()]

### Matching ingredient names between FlavorGraph and Recipe1M, then updating and saving the Recipe1M dataset

In [None]:
# Collect the distinct, non-null names of all nodes whose node_type is "ingredient"
flavorgraph_ingredients = (
    flavorgraph_df[flavorgraph_df['node_type'] == 'ingredient']['name']
    .dropna()
    .unique()
)

# Map each name's space-separated spelling to its original underscore-separated spelling
ingredient_mapping = {
    name.replace('_', ' '): name
    for name in flavorgraph_ingredients
    if isinstance(name, str)
}

# Function to replace space-separated ingredients with underscore-separated ones
def replace_ingredients(text, mapping):
    """Replace every occurrence of a mapping key in ``text`` with its mapped value.

    Keys are applied longest first so that a multi-word ingredient is replaced
    before any shorter key contained in it.

    Args:
        text: String to rewrite. Non-string values (e.g. None or NaN) and the
            empty string are returned unchanged.
        mapping: Dict of ``{text_to_find: replacement}``.

    Returns:
        The rewritten string, or ``text`` itself when there is nothing to do.

    NOTE(review): matching is plain substring matching, so a key can also match
    inside a longer word or across an earlier replacement. Confirm whether
    whole-word matching is wanted before changing this.
    """
    if not isinstance(text, str) or not text:
        return text

    # Skip empty keys: str.replace('', value) would insert the value between every character.
    # Skip keys that map to themselves: replacing them cannot change the text.
    useful_keys = [key for key, value in mapping.items() if key and key != value]

    # Sort keys by length in descending order to handle multi-word ingredients first
    for key in sorted(useful_keys, key=len, reverse=True):
        if key in text:
            text = text.replace(key, mapping[key])
    return text

# Function to process the instructions and ingredients list for each recipe
def process_list(item_list, mapping):
    """Return a new list holding replace_ingredients(item, mapping) for each item of item_list."""
    converted = []
    for entry in item_list:
        converted.append(replace_ingredients(entry, mapping))
    return converted

def process_row(row, mapping):
    """Rewrite the list-valued instruction and ingredient fields of one record.

    'processed_instructions' and 'processed_ingredients' are each passed through
    process_list when they hold a list; any other value is left as it is.
    The record is modified in place and also returned.
    """
    for field in ('processed_instructions', 'processed_ingredients'):
        current = row[field]
        if isinstance(current, list):
            row[field] = process_list(current, mapping)
    return row

# Put the loaded records into a DataFrame so they are easier to work with
recipe1m_df = pd.DataFrame(recipe1m_data)

# Turn the DataFrame into one dictionary per row, ready to hand to the worker processes
recipe1m_records = recipe1m_df.to_dict('records')

# Process the records using parallel processing
def parallel_process(records, mapping, chunksize=1000):
    """Apply process_row to every record using a pool of worker processes.

    Args:
        records: Sequence of records (must support len()); each one is passed
            to process_row together with ``mapping``.
        mapping: Ingredient mapping handed unchanged to every process_row call.
        chunksize: Number of records submitted to the pool per task. With the
            executor's default of 1, each record is sent to a worker as its own
            task along with its own copy of ``mapping``, which is very slow for
            long inputs; batching sends them in groups instead.

    Returns:
        list: The process_row results, in the same order as ``records``.
    """
    # Nothing to do: avoid starting worker processes for an empty input.
    if len(records) == 0:
        return []

    with ProcessPoolExecutor() as executor:
        results = list(
            executor.map(process_row, records, [mapping] * len(records), chunksize=chunksize)
        )
    return results

# Run the ingredient replacement over every record
processed_records = parallel_process(recipe1m_records, ingredient_mapping)

# Rebuild a DataFrame from the processed records
recipe1m_df_processed = pd.DataFrame(processed_records)

# Turn the DataFrame back into a list of per-row dictionaries
recipe1m_data_modified = recipe1m_df_processed.to_dict('records')

# Write the modified dataset with one JSON object per line
with open('/content/drive/My Drive/ERP/modified_Processed_Layer1.json', 'w') as file:
    for record in recipe1m_data_modified:
        file.write(f"{json.dumps(record)}\n")


### Cross-Checking expected counts

In [None]:
# Build a DataFrame for easier manipulation.
# Note: this is built from recipe1m_data, not from recipe1m_data_modified.
recipe1m_df = pd.DataFrame(recipe1m_data)

In [None]:
# Count the number of rows for each value of the 'partition' column
partition_counts = recipe1m_df['partition'].value_counts()

In [None]:
# Display the per-partition row counts
partition_counts

partition
train    720639
val      155036
test     154045
Name: count, dtype: int64

In [None]:
# Assumes recipe1m_df is already loaded and contains the columns 'partition' and 'processed_instructions'.
# Keywords to search for; find_keywords checks them in this order and reports the first one found.
keywords = ["instead", "substitute", "in place of", "replace"]

# Define a function to find the matching keyword
def find_keywords(text, keywords):
    """Return the first keyword that occurs in ``text`` as a substring.

    Args:
        text: String to search. Any non-string value (e.g. None or NaN from a
            missing instructions field) is treated as containing no keyword.
        keywords: Iterable of keyword strings, checked in the order given.

    Returns:
        The first keyword from ``keywords`` found in ``text``, or None when
        none is found or ``text`` is not a string.
    """
    # A non-string value would make the ``in`` test raise TypeError.
    if not isinstance(text, str):
        return None
    return next((keyword for keyword in keywords if keyword in text), None)

# Label each row with the first keyword found in its instructions
# (list-valued instructions are joined into one space-separated string first)
recipe1m_df['keyword'] = recipe1m_df['processed_instructions'].apply(
    lambda steps: find_keywords(steps if not isinstance(steps, list) else ' '.join(steps), keywords)
)

# Keep only the rows where a keyword was found
filtered_df = recipe1m_df[recipe1m_df['keyword'].notna()]

# Count rows per (partition, keyword) pair
partition_keyword_counts = (
    filtered_df
    .groupby(['partition', 'keyword'])
    .size()
    .reset_index(name='counts')
)

# Show the counts
print(partition_keyword_counts)

   partition      keyword  counts
0       test  in place of     230
1       test      instead    1320
2       test      replace     739
3       test   substitute    1691
4      train  in place of    1098
5      train      instead    6522
6      train      replace    3656
7      train   substitute    7704
8        val  in place of     255
9        val      instead    1446
10       val      replace     776
11       val   substitute    1636
