In [2]:
import re
import spacy
import pandas as pd
import os

# Terms that must never be reported as ingredients: preparation verbs,
# size/quality adjectives, units, containers, ordinals, bare numbers,
# fraction glyphs and stray single letters.
# Entries are compared against the lower-cased, punctuation-stripped entity
# text (see clean_text / exclude_terms), so they must be lowercase.
exclusions = [
    "halve", "halved", "lengthwise", "rough", "roughly", "chop", "chopped", "fresh", "freshly",
    "crack", "cracked", "divide", "divided", "mince", "minced", "rinse", "rinsed", "dry", "dried",
    "clean", "cleaned", "cut", "cutting", "slice", "sliced", "quarter", "quartered", "pack",
    "packed", "beat", "beaten", "spray", "sprayed", "coarse", "coarsely", "medium", "fine",
    "small", "large", "whole", "cup", "cups", "teaspoon", "teaspoons", "tablespoon", "tablespoons",
    "pound", "pounds", "ounce", "ounces", "gram", "grams", "liter", "liters", "milliliter",
    "milliliters", "clove", "cloves", "stick", "sticks", "package", "packages", "can", "cans",
    "bottle", "bottles", "slice", "slices", "bulb", "bulbs", "finely", "medium-sized", "large-sized",
    "small-sized", "extra-large", "extra-small", "half", "quarter", "third", "fourth", "fifth",
    "sixth", "seventh", "eighth", "ninth", "tenth", "1", "2", "3", "4", "5", "6", "7", "8", "9",
    "10", "¼", "½", "¾", "⅛", "⅓", "⅔", "⅕", "⅖", "⅗", "⅘", "⅙", "⅚", "⅐", "⅛", "⅜", "⅝",
    "⅞", "⅑", "⅒", "à", "á", "â", "ã", "ä", "å", "æ", "a", "dice", "diced", "crush", "crushed", "regular", "bundles", "bundle",
    # Punctuation-free spellings of the hyphenated terms above. Entity text has
    # its hyphens removed before the exclusion check, so the hyphenated forms
    # alone can never match.
    "mediumsized", "largesized", "smallsized", "extralarge", "extrasmall",
]

# Normalise a raw name / entity string into plain, single-spaced text
def clean_text(text):
    """Return ``text`` as a tidy string containing only word characters and single spaces.

    The value is coerced with ``str()`` first, so non-string inputs (``None``,
    numbers, NaN) are accepted and cleaned as their string form.

    Steps, in order:
      1. real non-breaking spaces (U+00A0) become ordinary spaces;
      2. the literal three-character sequence ``xa0`` is deleted, twice;
      3. every character that is neither a word character nor whitespace is dropped;
      4. whitespace runs collapse to one space and the ends are trimmed.
    """
    normalized = str(text).replace('\xa0', ' ')
    # Two removal passes: deleting one 'xa0' can splice its neighbours into a
    # new 'xa0', which the second pass then removes.
    for _ in range(2):
        normalized = normalized.replace('xa0', '')
    word_chars_only = re.sub(r'[^\w\s]', '', normalized)
    return re.sub(r'\s+', ' ', word_chars_only).strip()

# Pantry staples that are collapsed into a compact marker instead of being
# listed individually (compared case-insensitively, so keep entries lowercase)
salt_variants = ["salt", "kosher salt", "sea salt"]
pepper_variants = ["black pepper", "pepper"]
oil_variants = ["oil", "olive oil", "extra virgin olive oil", "extravirgin olive oil", "extravirginoliveoil"]

# Function to clean ingredients
def clean_ingredients(ingredients):
    """Collapse salt / pepper / oil variants in a comma-separated ingredient string.

    Every item that matches one of the staple variant lists (ignoring case) is
    removed, and a single marker is appended at the end made of one letter per
    staple group that was present, always in the order S (salt), P (pepper),
    O (oil) -- e.g. ``"SP"``, ``"SPO"``, ``"O"``, ``"S"``, ``"SO"``.

    Building the marker from whichever groups were found means a staple is
    never removed without being represented in the marker. Blank fragments
    (from stray or doubled commas) are skipped.

    Args:
        ingredients: Ingredient names joined by commas.

    Returns:
        The remaining ingredients in their original order, followed by the
        staple marker when any staple was found, joined with ``", "``.
    """
    staple_groups = (
        ("S", salt_variants),
        ("P", pepper_variants),
        ("O", oil_variants),
    )

    present = set()
    kept = []
    for fragment in ingredients.split(','):
        ingredient = fragment.strip()
        if not ingredient:
            continue
        key = ingredient.lower()
        letter = next((code for code, variants in staple_groups if key in variants), None)
        if letter is None:
            kept.append(ingredient)
        else:
            present.add(letter)

    # Iterate staple_groups (not the set) so the letters come out in a fixed order.
    marker = ''.join(code for code, _ in staple_groups if code in present)
    if marker:
        kept.append(marker)

    return ', '.join(kept)

# Drop entities that are on the exclusion list
def exclude_terms(ents, exclusions):
    """Return the entries of ``ents`` whose lower-cased form is not in ``exclusions``.

    Only whole-string matches are removed; order and original casing of the
    surviving entries are preserved.
    """
    kept = []
    for candidate in ents:
        if candidate.lower() in exclusions:
            continue
        kept.append(candidate)
    return kept

# Run the NER model over every recipe and tabulate the extracted ingredients
def display_results(names, urls, test_ingredients, nlp):
    """Build a DataFrame of cleaned ingredient strings, one row per recipe.

    ``names``, ``urls`` and ``test_ingredients`` are walked in parallel with
    ``zip`` (so iteration stops at the shortest). For each recipe the
    ingredient text is passed to ``nlp``; entities labelled ``'NON-ENTITY'``
    or whose cleaned text is empty are skipped, duplicates are removed keeping
    first-seen order, terms in the module-level ``exclusions`` list are
    dropped, and the remainder goes through ``clean_ingredients``.

    Returns:
        A DataFrame with columns ``name`` (cleaned), ``url`` and ``ingredients``.
    """
    rows = []
    for recipe_name, recipe_url, raw_ingredients in zip(names, urls, test_ingredients):
        parsed = nlp(raw_ingredients)

        # A dict is used as an insertion-ordered set of cleaned entity texts.
        seen = {}
        for ent in parsed.ents:
            if ent.label_ == 'NON-ENTITY':
                continue
            cleaned = clean_text(ent.text)
            if cleaned:
                seen.setdefault(cleaned, None)

        kept = exclude_terms(list(seen), exclusions)
        ingredient_text = clean_ingredients(', '.join(kept))
        rows.append((clean_text(recipe_name), recipe_url, ingredient_text))

    return pd.DataFrame(rows, columns=['name', 'url', 'ingredients'])

# Load your trained model directly from the specified path
model_path = 'ner_model_test_class'
if not os.path.exists(model_path):
    raise FileNotFoundError(f"Model path '{model_path}' does not exist. Please check the path.")

nlp = spacy.load(model_path)

# Path to your raw data file
raw_data_path = '../Cleaned_Raw_Data/MASTERDATA1.csv'

# Load the raw data
df = pd.read_csv(raw_data_path)

# Replace missing ingredients with '' BEFORE casting to str: astype(str) turns
# NaN into the literal text 'nan', after which fillna has nothing left to fill
# and the model would be asked to parse the word "nan".
df['ingredients'] = df['ingredients'].fillna('').astype(str)

# Extract relevant columns
names = df['name'].tolist()
urls = df['url'].tolist()
test_ingredients = df['ingredients'].tolist()

# Run the model over every recipe and collect the cleaned ingredients
results_df = display_results(names, urls, test_ingredients, nlp)

# Print the DataFrame
print(results_df.head())

# Save the DataFrame to a CSV file, creating the output directory if needed so
# the write does not fail on a fresh checkout
output_filename = '../SpaCy_Extracted_Data/NER_Model_MASTERDATA_Class.csv'
os.makedirs(os.path.dirname(output_filename), exist_ok=True)
results_df.to_csv(output_filename, index=False)

                                                name  \
0  A Piece of Turkey Easy Roasted Thanksgiving Tu...   
1                       Aarti Party aka Savory Sling   
2                         Almost Famous Corn Pudding   
3                                        a Great Pye   
4                        Almost My Grandmas Rouladen   

                                                 url  \
0  https://www.foodnetwork.com/recipes/jeff-mauro...   
1  https://www.foodnetwork.com/recipes/aarti-part...   
2  https://www.foodnetwork.com/recipes/michael-sy...   
3  https://www.foodnetwork.com/recipes/a-great-py...   
4  https://www.foodnetwork.com/recipes/melissa-da...   

                                         ingredients  
0  turkey, celery, yellow onion, rosemary, sage, ...  
1  vodka, Zubrowka, Pineapple Reduction, pickling...  
2  corn, boxes cornbread mix, eggs, corn kernels,...  
3  shortcrust pastry, liquid, chicken breast, bee...  
4  bacon, flatiron steak, ground pepper, Dijon mu..

In [3]:
import pandas as pd
import json

# Load the extraction results (columns: name, url, ingredients) from the CSV
# written by the previous cell. This rebinds `df`, replacing the raw-data frame
# from that cell. With pandas' default NA handling, empty fields are read back
# as NaN rather than ''; tag_entities returns [] for such non-string values.
csv_file_path = '../SpaCy_Extracted_Data/NER_Model_MASTERDATA_Class.csv'
df = pd.read_csv(csv_file_path)

# Define the entity mapping function
def tag_entities(ingredients):
    """Turn a comma-separated ingredient string into a list of tagged records.

    Each non-blank ingredient becomes
    ``{"ingredient": <name>, "entity": "ingredient"}``. Names are stripped of
    surrounding whitespace and blank fragments are skipped, so an empty or
    whitespace-only string yields ``[]`` instead of a record with an empty name.

    Args:
        ingredients: The ingredient string, or any non-string value (e.g. the
            NaN pandas produces for an empty CSV field).

    Returns:
        A list of dicts in input order; ``[]`` for non-string input.
    """
    if not isinstance(ingredients, str):
        return []
    stripped_names = (piece.strip() for piece in ingredients.split(','))
    return [{"ingredient": name, "entity": "ingredient"} for name in stripped_names if name]

# Apply the entity tagging function to the ingredients column
df['tagged_ingredients'] = df['ingredients'].apply(tag_entities)

# Convert the DataFrame to a list of dictionaries
records = df[['name', 'url', 'tagged_ingredients']].to_dict(orient='records')

# A name/url that was empty in the CSV is read back as NaN, which json.dump
# would write as the bare token NaN -- not valid JSON. Emit null instead.
for record in records:
    for field in ('name', 'url'):
        if pd.isna(record[field]):
            record[field] = None

# Save the structured data to a JSON file
json_file_path = '../SpaCy_Extracted_Data/PINECONE_TAGGED_MASTERDATA_Class.json'
with open(json_file_path, 'w', encoding='utf-8') as f:
    json.dump(records, f, ensure_ascii=False, indent=4)

print(f"Tagged data saved to {json_file_path}")


Tagged data saved to ../SpaCy_Extracted_Data/PINECONE_TAGGED_MASTERDATA_Class.json
