In [2]:
import pandas as pd
import numpy as np

df = pd.read_csv('nutrition.csv')
df.head()
cleaned_df = df[['name', 'serving_size', 'calories', 'total_fat', 'saturated_fat', 'cholesterol', 'sodium', 'folate',
              'riboflavin', 'vitamin_a', 'vitamin_b12', 'vitamin_b6', 'vitamin_c', 'vitamin_d', 'vitamin_e',
              'vitamin_k', 'calcium', 'copper', 'irom', 'magnesium', 'manganese', 'potassium', 'zink', 'protein',
              'carbohydrate', 'fiber', 'sugars', 'fat', 'saturated_fatty_acids', 'monounsaturated_fatty_acids',
              'polyunsaturated_fatty_acids', 'alcohol', 'caffeine']]

cleaned_df

ValueError: numpy.dtype size changed, may indicate binary incompatibility. Expected 96 from C header, got 88 from PyObject

In [10]:
def remove_units(value):
    if isinstance(value, str):
        filtered_value = ''.join(filter(lambda x: x.isdigit() or x == '.', value))
        try:
            return float(filtered_value) if filtered_value.replace('.', '', 1).isdigit() else 0.0
        except ValueError:
            return 0.0
    return value

for column in cleaned_df.columns:
    if column != 'name':
        cleaned_df[column] = cleaned_df[column].apply(remove_units)
cleaned_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cleaned_df[column] = cleaned_df[column].apply(remove_units)


Unnamed: 0,name,serving_size,calories,total_fat,saturated_fat,cholesterol,sodium,folate,riboflavin,vitamin_a,...,protein,carbohydrate,fiber,sugars,fat,saturated_fatty_acids,monounsaturated_fatty_acids,polyunsaturated_fatty_acids,alcohol,caffeine
0,Cornstarch,100.0,381,0.1,,0.0,9.0,0.0,0.0,0.0,...,0.26,91.27,0.9,0.0,0.05,0.009,0.016,0.025,0.0,0.0
1,"Nuts, pecans",100.0,691,72.0,6.2,0.0,0.0,22.0,0.13,56.0,...,9.17,13.86,9.6,3.97,71.97,6.18,40.801,21.614,0.0,0.0
2,"Eggplant, raw",100.0,25,0.2,,0.0,2.0,22.0,0.037,23.0,...,0.98,5.88,3.0,3.53,0.18,0.034,0.016,0.076,0.0,0.0
3,"Teff, uncooked",100.0,367,2.4,0.4,0.0,12.0,0.0,0.27,9.0,...,13.3,73.13,8.0,1.84,2.38,0.449,0.589,1.071,0.0,0.0
4,"Sherbet, orange",100.0,144,2.0,1.2,1.0,46.0,4.0,0.097,46.0,...,1.1,30.4,1.3,24.32,2.0,1.16,0.53,0.08,0.0,0.0


In [11]:
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re

ingredients = cleaned_df['name']

stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()

def clean_ingredient(ingredient):
    ingredient = ingredient.lower()
    ingredient = re.sub(r'[^\w\s]', '', ingredient)
    ingredient = re.sub(r'\d+', '', ingredient)
    ingredient = " ".join([word for word in ingredient.split() if word not in stop_words])
    ingredient = " ".join([lemmatizer.lemmatize(word) for word in ingredient.split()])
    return ingredient

cleaned_ingredients = [clean_ingredient(ingredient) for ingredient in ingredients]
print(cleaned_ingredients)

['cornstarch', 'nut pecan', 'eggplant raw', 'teff uncooked', 'sherbet orange', 'cauliflower raw', 'taro leaf raw', 'lamb raw ground', 'cheese camembert', 'vegetarian fillet', 'pace picante sauce', 'goji berry dried', 'mango nectar canned', 'cracker rusk toast', 'chicken boiled foot', 'quail raw meat', 'pie lemon fried pie', 'pepper raw jalapeno', 'winged bean tuber raw', 'salami turkey cooked', 'grape raw muscadine', 'nut raw ginkgo nut', 'spice ground savory', 'candy sesame crunch', 'cheese low fat cream', 'pace green taco sauce', 'syrup canadian maple', 'ostrich raw top loin', 'chewing gum sugarless', 'nut dried pine nut', 'pasta unenriched dry', 'mcdonalds side salad', 'cooky marie biscuit', 'broccoli raw chinese', 'mcdonalds hash brown', 'agave raw southwest', 'emu raw outside drum', 'nut dried beechnut', 'currant dried zante', 'lentil raw sprouted', 'gravy mix dry onion', 'pie fruit fried pie', 'snack cake popcorn', 'snack mixed berry bar', 'fish raw sheepshead', 'babyfood pear ju

In [12]:
from rapidfuzz import process

# Sample nutrient database
nutrient_db = ["chicken", "carrot", "tomato", "potato", "broccoli"]

# Match cleaned ingredients to nutrient database
mapped_ingredients = [process.extractOne(ingredient, nutrient_db) for ingredient in cleaned_ingredients]
print(mapped_ingredients)

[('carrot', 45.0, 1), ('carrot', 40.0, 1), ('potato', 45.0, 3), ('chicken', 41.53846153846154, 0), ('tomato', 45.0, 2), ('carrot', 45.0, 1), ('carrot', 57.0, 1), ('carrot', 45.0, 1), ('chicken', 54.0, 0), ('chicken', 34.2, 0), ('carrot', 45.0, 1), ('chicken', 36.0, 0), ('carrot', 51.300000000000004, 1), ('tomato', 65.45454545454547, 2), ('chicken', 90.0, 0), ('tomato', 54.0, 2), ('chicken', 40.0, 0), ('chicken', 36.0, 0), ('chicken', 36.0, 0), ('chicken', 42.75, 0), ('chicken', 41.53846153846154, 0), ('chicken', 38.57142857142858, 0), ('chicken', 51.300000000000004, 0), ('carrot', 45.0, 1), ('chicken', 54.0, 0), ('tomato', 51.300000000000004, 2), ('carrot', 42.75, 1), ('tomato', 45.0, 2), ('chicken', 55.38461538461539, 0), ('chicken', 38.57142857142858, 0), ('potato', 54.0, 3), ('carrot', 36.0, 1), ('carrot', 45.0, 1), ('broccoli', 90.0, 4), ('broccoli', 46.63636363636363, 4), ('carrot', 31.999999999999996, 1), ('carrot', 45.0, 1), ('chicken', 45.0, 0), ('carrot', 54.0, 1), ('carrot', 

In [1]:
import spacy
import numpy as np

# Load spaCy's medium-sized English model, which includes pre-trained word vectors
nlp = spacy.load("en_core_web_md")

# Function to generate an embedding for a single ingredient
def get_spacy_embedding(ingredient_name):
    doc = nlp(ingredient_name)
    return doc.vector  # Extract vector representation of the ingredient

# Example ingredient names after normalization
ingredient_names = cleaned_df['name']
ingredient_embeddings = np.array([get_spacy_embedding(name) for name in ingredient_names])

print("Ingredient Embeddings with spaCy:")
print(ingredient_embeddings)

NameError: name 'cleaned_df' is not defined