In [2]:
import pandas as pd
df = pd.read_csv('NLP_ingredients.csv')
df = df[['ID', 'Name', 'NLP_Ingredients']]
df.head()

Unnamed: 0,ID,Name,NLP_Ingredients
0,71247,Cherry Streusel Cobbler,"21 ounce cherry pie filling, 2 egg, 14 ounce c..."
1,76133,Reuben and Swiss Casserole Bake,"0.51 lb corn beef, , 0.25 cup thousand island ..."
2,503816,Yam-Pecan Recipe,"0.75 cup unsalted butter, , 0.5 cup sugar, 0.5..."
3,418749,Tropical Orange Layer Cake,"18 ounce .orange cake mix, 3 ounce . instant v..."
4,392934,"Safe to Eat Raw Chocolate Chip Oreo Cookie ""do...","0.5 cup butter, , 0.5 cup brown sugar, 0.25 cu..."


In [6]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
import re
from joblib import Parallel, delayed

nutrient_df = pd.read_csv('nutrition.csv')

# Conversion factors to grams (approximate values)
conversion_factors = {
    'cup': 240,
    'tablespoon': 15,
    'teaspoon': 5,
    'ounce': 28.35,
    'pound': 453.59,
}

def convert_to_grams(quantity, unit):
    if unit in conversion_factors:
        return quantity * conversion_factors[unit]
    else:
        return 100  # Assume 100 grams if no conversion factor is found

def extract_quantity_and_unit(text):
    match = re.match(r'(\d*\.?\d+)\s*(\w+)', text)
    if match:
        quantity = float(match.group(1))
        unit = match.group(2).lower()
        return quantity, unit
    else:
        return None, None

def convert_measurements_to_grams(text):
    quantity, unit = extract_quantity_and_unit(text)
    if quantity is not None and unit is not None:
        grams = convert_to_grams(quantity, unit)
        return grams
    else:
        return 100  # Default to 100 grams if no valid measurement is found

# Vectorize the ingredient names
vectorizer = TfidfVectorizer()
ingredient_vectors = vectorizer.fit_transform(nutrient_df['name'])

# Train a NearestNeighbors model
nn_model = NearestNeighbors(n_neighbors=1, algorithm='ball_tree', n_jobs=-1)
nn_model.fit(ingredient_vectors)

def get_nutrient_info(ingredient):
    ingredient_vector = vectorizer.transform([ingredient])
    _, indices = nn_model.kneighbors(ingredient_vector)
    nutrient_info = nutrient_df.iloc[indices[0][0]].copy()
    
    # Convert nutrient values to float and handle units like "g" or "mg"
    for col in nutrient_info.index[2:]:
        value = nutrient_info[col]
        if isinstance(value, str):
            value = value.replace('g', '').replace('mg', '').strip()
            try:
                nutrient_info[col] = float(value)
            except ValueError:
                nutrient_info[col] = float('nan')
    
    return nutrient_info

def process_ingredient(ingredient):
    grams = convert_measurements_to_grams(ingredient)
    nutrient_info = get_nutrient_info(ingredient)
    scaling_factor = grams / 100  # Scale based on 100 grams standard
    return nutrient_info, scaling_factor

def extract_nutrients(df):
    nutrient_columns = nutrient_df.columns[2:] # Exclude name and serving_size columns
    for col in nutrient_columns:
        df[col] = 0
    
    for i, row in df.iterrows():
        ingredients = row['NLP_Ingredients'].split(', ')
        results = Parallel(n_jobs=-1)(delayed(process_ingredient)(ingredient) for ingredient in ingredients)
        
        for nutrient_info, scaling_factor in results:
            for col in nutrient_columns:
                df.at[i, col] += float(nutrient_info[col]) * scaling_factor
    
    return df

# Sample DataFrame with recipes
data = {
    'ID': [71247, 76133, 503816, 418749, 392934],
    'Name': ['Cherry Streusel Cobbler', 'Reuben and Swiss Casserole Bake', 'Yam-Pecan Recipe', 'Tropical Orange Layer Cake', 'Safe to Eat Raw Chocolate Chip Oreo Cookie "do...'],
    'NLP_Ingredients': [
        '21 ounce cherry pie filling, 2 egg, 14 ounce c...',
        '0.51 lb corn beef, , 0.25 cup thousand island ...',
        '0.75 cup unsalted butter, , 0.5 cup sugar, 0.5...',
        '18 ounce .orange cake mix, 3 ounce . instant v...',
        '0.5 cup butter, , 0.5 cup brown sugar, 0.25 cu...'
    ]
}
df = pd.DataFrame(data)

# Apply the function to extract nutrients
df_with_nutrients = extract_nutrients(df)
print(df_with_nutrients)

  df.at[i, col] += float(nutrient_info[col]) * scaling_factor
  df.at[i, col] += float(nutrient_info[col]) * scaling_factor
  df.at[i, col] += float(nutrient_info[col]) * scaling_factor
  df.at[i, col] += float(nutrient_info[col]) * scaling_factor
  df.at[i, col] += float(nutrient_info[col]) * scaling_factor
  df.at[i, col] += float(nutrient_info[col]) * scaling_factor
  df.at[i, col] += float(nutrient_info[col]) * scaling_factor
  df.at[i, col] += float(nutrient_info[col]) * scaling_factor
  df.at[i, col] += float(nutrient_info[col]) * scaling_factor
  df.at[i, col] += float(nutrient_info[col]) * scaling_factor
  df.at[i, col] += float(nutrient_info[col]) * scaling_factor
  df.at[i, col] += float(nutrient_info[col]) * scaling_factor
  df.at[i, col] += float(nutrient_info[col]) * scaling_factor
  df.at[i, col] += float(nutrient_info[col]) * scaling_factor
  df.at[i, col] += float(nutrient_info[col]) * scaling_factor
  df.at[i, col] += float(nutrient_info[col]) * scaling_factor
  df.at[

       ID                                               Name  \
0   71247                            Cherry Streusel Cobbler   
1   76133                    Reuben and Swiss Casserole Bake   
2  503816                                   Yam-Pecan Recipe   
3  418749                         Tropical Orange Layer Cake   
4  392934  Safe to Eat Raw Chocolate Chip Oreo Cookie "do...   

                                     NLP_Ingredients  serving_size   calories  \
0  21 ounce cherry pie filling, 2 egg, 14 ounce c...       1092.25  2227.8410   
1  0.51 lb corn beef, , 0.25 cup thousand island ...        260.00   561.4000   
2  0.75 cup unsalted butter, , 0.5 cup sugar, 0.5...        500.00   892.2000   
3  18 ounce .orange cake mix, 3 ounce . instant v...        595.35  2145.8115   
4  0.5 cup butter, , 0.5 cup brown sugar, 0.25 cu...        440.00   824.6000   

   total_fat  saturated_fat  cholesterol  sodium  choline  ...        fat  \
0    76.2513       19.85740          NaN     NaN   

In [9]:
test_df = pd.read_csv('Cleaned_Ingredients/recipes_food_com_cleaned.csv')
columns = [
    'Calories',
    'FatContent',
    'SaturatedFatContent',
    'CholesterolContent',
    'SodiumContent',
    'CarbohydrateContent',
    'FiberContent',
    'SugarContent',
    'ProteinContent'
]
test_df = test_df[columns]
test_df.head()

Unnamed: 0,Calories,FatContent,SaturatedFatContent,CholesterolContent,SodiumContent,CarbohydrateContent,FiberContent,SugarContent,ProteinContent
0,801.0,29.1,7.7,93.0,536.5,125.0,3.3,54.4,12.3
1,664.4,45.3,22.1,142.3,2074.2,33.9,5.9,7.1,31.0
2,956.8,53.2,21.7,192.5,664.9,112.8,4.1,63.3,10.7
3,581.6,30.5,17.5,62.4,361.5,74.6,3.5,61.8,6.2
4,121.4,5.9,3.3,10.5,77.8,16.8,0.5,10.3,1.1


In [18]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
import re
from joblib import Parallel, delayed

# Sample DataFrame with recipes
data = {
    'ID': [71247, 76133, 503816, 418749, 392934],
    'Name': ['Cherry Streusel Cobbler', 'Reuben and Swiss Casserole Bake', 'Yam-Pecan Recipe', 'Tropical Orange Layer Cake', 'Safe to Eat Raw Chocolate Chip Oreo Cookie "do...'],
    'NLP_Ingredients': [
        '21 ounce cherry pie filling, 2 egg, 14 ounce cherry pie filling',
        '0.51 lb corn beef, 0.25 cup thousand island dressing',
        '0.75 cup unsalted butter, 0.5 cup sugar, 0.5 cup pecans',
        '18 ounce orange cake mix, 3 ounce instant vanilla pudding mix',
        '0.5 cup butter, 0.5 cup brown sugar'
    ]
}
df = pd.DataFrame(data)

nutrient_df = pd.read_csv('ingredients_facts.csv')

# Conversion factors to grams (approximate values)
conversion_factors = {
    'cup': 240,
    'tablespoon': 15,
    'teaspoon': 5,
    'ounce': 28.35,
    'pound': 453.59,
}

def convert_to_grams(quantity, unit):
    if unit in conversion_factors:
        return quantity * conversion_factors[unit]
    else:
        return 100  # Assume 100 grams if no conversion factor is found

def extract_quantity_and_unit(text):
    match = re.match(r'(\d*\.?\d+)\s*(\w+)', text)
    if match:
        quantity = float(match.group(1))
        unit = match.group(2).lower()
        return quantity, unit
    else:
        return None, None

def convert_measurements_to_grams(text):
    quantity, unit = extract_quantity_and_unit(text)
    if quantity is not None and unit is not None:
        grams = convert_to_grams(quantity, unit)
        return grams
    else:
        return 100  # Default to 100 grams if no valid measurement is found

# Vectorize the ingredient names in nutrient_df
vectorizer = TfidfVectorizer()
ingredient_vectors = vectorizer.fit_transform(nutrient_df['nutrient_id'])

# Train a NearestNeighbors model
nn_model = NearestNeighbors(n_neighbors=1, algorithm='ball_tree', n_jobs=-1)
nn_model.fit(ingredient_vectors)

def get_nutrient_info(ingredient):
    ingredient_vector = vectorizer.transform([ingredient])
    _, indices = nn_model.kneighbors(ingredient_vector)
    nutrient_info = nutrient_df.iloc[indices[0][0]].copy()
    
    # Convert nutrient values to float and handle units like "g" or "mg"
    for col in nutrient_info.index[2:]:
        value = nutrient_info[col]
        if isinstance(value, str):
            value = value.replace('g', '').replace('mg', '').strip()
            try:
                nutrient_info[col] = float(value)
            except ValueError:
                nutrient_info[col] = float('nan')
    
    return nutrient_info

def process_ingredient(ingredient):
    grams = convert_measurements_to_grams(ingredient)
    
    # Extract the ingredient name without the quantity and unit
    ingredient_name = re.sub(r'^\d*\.?\d+\s*\w+\s*', '', ingredient).strip()
    
    nutrient_info = get_nutrient_info(ingredient_name)
    scaling_factor = grams / 100  # Scale based on 100 grams standard
    return nutrient_info, scaling_factor

def extract_nutrients(df):
    nutrient_columns = nutrient_df.columns[2:] # Exclude name and serving_size columns
    for col in nutrient_columns:
        df[col] = 0
    
    for i, row in df.iterrows():
        ingredients = row['NLP_Ingredients'].split(', ')
        results = Parallel(n_jobs=-1)(delayed(process_ingredient)(ingredient) for ingredient in ingredients)
        
        for nutrient_info, scaling_factor in results:
            for col in nutrient_columns:
                df.at[i, col] += float(nutrient_info[col]) * scaling_factor
    
    return df

# Apply the function to extract nutrients
df_with_nutrients = extract_nutrients(df)
print(df_with_nutrients)

  nutrient_df = pd.read_csv('ingredients_facts.csv')


AttributeError: 'int' object has no attribute 'lower'

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
import re
from joblib import Parallel, delayed

nutrient_df = pd.read_csv('nutrition.csv')

# Conversion factors to grams (approximate values)
conversion_factors = {
    'cup': 240,
    'tablespoon': 15,
    'teaspoon': 5,
    'ounce': 28.35,
    'pound': 453.59,
}

def convert_to_grams(quantity, unit):
    if unit in conversion_factors:
        return quantity * conversion_factors[unit]
    else:
        return 100  # Assume 100 grams if no conversion factor is found

def extract_quantity_and_unit(text):
    match = re.match(r'(\d*\.?\d+)\s*(\w+)', text)
    if match:
        quantity = float(match.group(1))
        unit = match.group(2).lower()
        return quantity, unit
    else:
        return None, None

def convert_measurements_to_grams(text):
    quantity, unit = extract_quantity_and_unit(text)
    if quantity is not None and unit is not None:
        grams = convert_to_grams(quantity, unit)
        return grams
    else:
        return 100  # Default to 100 grams if no valid measurement is found

# Vectorize the ingredient names
vectorizer = TfidfVectorizer()
ingredient_vectors = vectorizer.fit_transform(nutrient_df['name'])

# Train a NearestNeighbors model
nn_model = NearestNeighbors(n_neighbors=1, algorithm='ball_tree', n_jobs=-1)
nn_model.fit(ingredient_vectors)

def get_nutrient_info(ingredient):
    ingredient_vector = vectorizer.transform([ingredient])
    _, indices = nn_model.kneighbors(ingredient_vector)
    nutrient_info = nutrient_df.iloc[indices[0][0]].copy()
    
    # Convert nutrient values to float and handle units like "g" or "mg"
    for col in nutrient_info.index[2:]:
        value = nutrient_info[col]
        if isinstance(value, str):
            value = value.replace('g', '').replace('mg', '').strip()
            try:
                nutrient_info[col] = float(value)
            except ValueError:
                nutrient_info[col] = float('nan')
    
    return nutrient_info

def process_ingredient(ingredient):
    grams = convert_measurements_to_grams(ingredient)
    nutrient_info = get_nutrient_info(ingredient)
    scaling_factor = grams / 100  # Scale based on 100 grams standard
    return nutrient_info, scaling_factor

def extract_nutrients(df):
    nutrient_columns = nutrient_df.columns[2:] # Exclude name and serving_size columns
    for col in nutrient_columns:
        df[col] = 0
    
    for i, row in df.iterrows():
        ingredients = row['NLP_Ingredients'].split(', ')
        results = Parallel(n_jobs=-1)(delayed(process_ingredient)(ingredient) for ingredient in ingredients)
        
        for nutrient_info, scaling_factor in results:
            for col in nutrient_columns:
                df.at[i, col] += float(nutrient_info[col]) * scaling_factor
    
    return df

# Sample DataFrame with recipes
data = {
    'ID': [71247, 76133, 503816, 418749, 392934],
    'Name': ['Cherry Streusel Cobbler', 'Reuben and Swiss Casserole Bake', 'Yam-Pecan Recipe', 'Tropical Orange Layer Cake', 'Safe to Eat Raw Chocolate Chip Oreo Cookie "do...'],
    'NLP_Ingredients': [
        '21 ounce cherry pie filling, 2 egg, 14 ounce c...',
        '0.51 lb corn beef, , 0.25 cup thousand island ...',
        '0.75 cup unsalted butter, , 0.5 cup sugar, 0.5...',
        '18 ounce .orange cake mix, 3 ounce . instant v...',
        '0.5 cup butter, , 0.5 cup brown sugar, 0.25 cu...'
    ]
}
df = pd.DataFrame(data)

# Apply the function to extract nutrients
df_with_nutrients = extract_nutrients(df)
print(df_with_nutrients)

  df.at[i, col] += float(nutrient_info[col]) * scaling_factor
  df.at[i, col] += float(nutrient_info[col]) * scaling_factor
  df.at[i, col] += float(nutrient_info[col]) * scaling_factor
  df.at[i, col] += float(nutrient_info[col]) * scaling_factor
  df.at[i, col] += float(nutrient_info[col]) * scaling_factor
  df.at[i, col] += float(nutrient_info[col]) * scaling_factor
  df.at[i, col] += float(nutrient_info[col]) * scaling_factor
  df.at[i, col] += float(nutrient_info[col]) * scaling_factor
  df.at[i, col] += float(nutrient_info[col]) * scaling_factor
  df.at[i, col] += float(nutrient_info[col]) * scaling_factor
  df.at[i, col] += float(nutrient_info[col]) * scaling_factor
  df.at[i, col] += float(nutrient_info[col]) * scaling_factor
  df.at[i, col] += float(nutrient_info[col]) * scaling_factor
  df.at[i, col] += float(nutrient_info[col]) * scaling_factor
  df.at[i, col] += float(nutrient_info[col]) * scaling_factor
  df.at[i, col] += float(nutrient_info[col]) * scaling_factor
  df.at[

       ID                                               Name  \
0   71247                            Cherry Streusel Cobbler   
1   76133                    Reuben and Swiss Casserole Bake   
2  503816                                   Yam-Pecan Recipe   
3  418749                         Tropical Orange Layer Cake   
4  392934  Safe to Eat Raw Chocolate Chip Oreo Cookie "do...   

                                     NLP_Ingredients  serving_size   calories  \
0  21 ounce cherry pie filling, 2 egg, 14 ounce c...       1092.25  2227.8410   
1  0.51 lb corn beef, , 0.25 cup thousand island ...        260.00   561.4000   
2  0.75 cup unsalted butter, , 0.5 cup sugar, 0.5...        500.00   892.2000   
3  18 ounce .orange cake mix, 3 ounce . instant v...        595.35  2145.8115   
4  0.5 cup butter, , 0.5 cup brown sugar, 0.25 cu...        440.00   824.6000   

   total_fat  saturated_fat  cholesterol  sodium  choline  ...        fat  \
0    76.2513       19.85740          NaN     NaN   