# Merging Datasets, Cleaning and Filtering the DataFrame, and Diet Filter Functions

In [2]:
# Importing packages
import pandas as pd
import re 

In [3]:
# Creating and cleaning data
def drop_nan_columns(df):
    """Return a copy of ``df`` without the columns that contain only NaN.

    Columns holding at least one non-missing value are kept; the input
    frame itself is not modified.
    """
    return df.dropna(axis="columns", how="all")

# USDA Data for Prices
usda_data = pd.read_csv("USDA-305tj(Sheet1).csv")
# Trader Joes Data for Ingredients; its UPC column is renamed so both frames
# share the 'GTIN/UPC' merge key
tj_data = pd.read_csv('trader_joes.csv').rename(columns={"gtin_upc": "GTIN/UPC"})

# Merging data: the outer join keeps products that appear in only one source
merged_data = pd.merge(usda_data, tj_data, how='outer', on='GTIN/UPC')
# Remove the columns that hold no values at all
merged_data = drop_nan_columns(merged_data)

# Data of complete pricing
completed_prices = pd.read_csv("EEP153_COMPLETED_PRICES.csv")
# Use an explicit 64-bit type: plain `int` resolves to 32 bits on Windows with
# NumPy < 2, which is too narrow for 12-digit UPC codes.
# NOTE(review): some UPCs in the rendered output end in six zeros, which may
# indicate precision lost upstream (e.g. spreadsheet export) - verify the CSV.
completed_prices['GTIN/UPC'] = completed_prices['GTIN/UPC'].astype('int64')

# Dropping unneeded columns and duplicates
not_needed = ['branded_food_category', 'data_source', 'modified_date',
            'available_date', 'market_country', 'Unnamed: 0', 'brand_owner', 'Market Country',
            'brand_name', 'Brand Owner', 'Brand', 'fdc_id']

# drop_duplicates returns a new frame rather than modifying in place, so the
# result has to be assigned for the de-duplication to take effect
merged_data_drop = merged_data.drop(columns=not_needed).drop_duplicates()

# Setting index, and cleaning data
merged_data_drop = merged_data_drop.set_index('Name')
# Keep only the rows that carry a product name
merged_data_clean = merged_data_drop[merged_data_drop.index.notna()]
# 'Price' and 'Name' are removed here - presumably because completed_prices
# supplies both columns in the later merge (TODO confirm)
merged_data_clean = merged_data_clean.reset_index().drop(columns=['Price', 'Name'])
# One row per UPC: the first occurrence wins
merged_data_clean = merged_data_clean.drop_duplicates(subset='GTIN/UPC', keep='first')

# Final data frame, before cleaning
final_data = completed_prices.merge(merged_data_clean, how='left', on='GTIN/UPC')


# Cleaning Data Frame
def _combine_ingredients(row):
    """Collapse a row's 'ingredients_x' / 'ingredients_y' pair into one value.

    When both are present they are joined with ", "; when only one is
    present that one is returned; when neither is, the (missing)
    'ingredients_y' value is returned unchanged.
    """
    first, second = row['ingredients_x'], row['ingredients_y']
    if pd.isna(first):
        return second
    if pd.isna(second):
        return first
    return f"{first}, {second}"


final_data['ingredients'] = final_data.apply(_combine_ingredients, axis=1)

# Filling in missing ingredients
# NOTE(review): these values are assigned by position to the rows that still
# lack ingredients, in frame order - confirm that order matches the products
# these strings were written for.
fill_ingredients = [
    'WHEAT FLOUR, SUGAR, SALT, BARLEY MALT SYRUP',
    'POPCORN, SUNFLOWER OIL, SUGAR, SALT',
    'SOLID WHITE TUNA, WATER, SALT',
]

missing_ingredients = final_data['ingredients'].isna()
n_missing = int(missing_ingredients.sum())
if n_missing:
    # Fail with an explicit message instead of an opaque pandas length error
    # when the input data no longer has exactly the expected gaps
    if n_missing != len(fill_ingredients):
        raise ValueError(
            f"Expected {len(fill_ingredients)} rows with missing ingredients, "
            f"found {n_missing}; update fill_ingredients to match the data."
        )
    final_data.loc[missing_ingredients, 'ingredients'] = fill_ingredients
# The two source columns are no longer needed once 'ingredients' is complete
final_data = final_data.drop(['ingredients_x', 'ingredients_y'], axis=1).set_index('Name')

# Final dataset
final_data

Unnamed: 0_level_0,GTIN/UPC,Price,Branded Food Category,not_a_significant_source_of,serving_size,serving_size_unit,household_serving_fulltext,package_weight,ingredients
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
"POTATO CHIPS, SEA SALT",5487,3.99,"Chips, Pretzels & Snacks",,28.0,g,1 ONZ,,"UNPEELED POTATOES, SUNFLOWER OIL, SEA SALT."
"TRADER JOE'S, CORNICHONS",6163,2.99,"Pickles, Olives, Peppers & Relishes",,28.0,g,1 ONZ,,"GHERKINS (CUCUMBERS), WATER, VINEGAR, SALT, ON..."
"ORGANIC TOMATOES, DICED IN TOMATO JUICE",10894,1.99,Tomatoes,,130.0,g,0.5 cup,,"ORGANIC TOMATOES, ORGANIC TOMATO JUICE FROM CO..."
REDUCED FAT MAYO DRESSING,14816,4.99,Salad Dressing & Mayonnaise,,15.0,g,,,"WATER, EXPELLER PRESSED CANOLA OIL, CORNSTARCH..."
"PEANUT BUTTER, CRUNCHY SALTED",14885,2.49,Nut & Seed Butters,,32.0,g,,16 oz/1 lbs/454 g,"DRY ROASTED PEANUTS, SALT."
...,...,...,...,...,...,...,...,...,...
SALTED CARAMEL GELATO,5100000875,3.79,Ice Cream & Frozen Yogurt,,99.0,g,0.5 cup,,"SKIM MILK, CREAM, GLUCOSE SYRUP (CORN), CANE S..."
GROUND BEEF,41498112103,7.49,Other Meats,,112.0,g,4 ONZ,,GROUND BEEF.
WISCONSIN SHARP CHEDDAR CHEESE,227871000000,4.99,Cheese,,,,,,"PASTEURIZED MILK, CHEESE CULTURES, SALT, VEGET..."
COCONUT OIL,750456000000,4.99,Vegetable & Cooking Oils,,,,,,COCONUT


In [4]:
# Function to filter out dataframe depending on certain strings
def remove_rows(df, column_name, search_strings):
    """Return the rows of ``df`` whose text in ``column_name`` contains none of ``search_strings``.

    Matching is case-insensitive and restricted to whole words (each term is
    wrapped in ``\\b`` word boundaries), so 'HAM' does not match 'HAMBURGER'.
    The input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter.
    column_name : str
        Column whose values are searched; non-string values are compared by
        their ``str()`` form and missing values never match.
    search_strings : str or iterable of str
        Terms to exclude. A single string is treated as one term. With no
        terms, nothing is excluded and a copy of ``df`` is returned.

    Returns
    -------
    pandas.DataFrame
        A new frame holding only the rows without a match.
    """
    # A lone string would otherwise be iterated character by character
    if isinstance(search_strings, str):
        search_strings = [search_strings]
    search_strings = list(search_strings)
    # An empty alternation matches every string, which would drop every row
    if not search_strings:
        return df.copy()

    # Create a regex pattern to match any of the search strings as whole words
    search_pattern = '|'.join(r'\b' + re.escape(term) + r'\b' for term in search_strings)
    # Search a text copy of the column so the caller's frame stays untouched;
    # missing values become "" so they can never match a term
    column = df[column_name]
    text = column.astype(object).where(column.notna(), "").astype(str)
    has_match = text.str.contains(search_pattern, case=False, na=False, regex=True)
    # Filtering out rows where the column contains any of the search strings
    return df[~has_match.to_numpy(dtype=bool)]

# Vegetarian Example
meats_and_fish = ['CHICKEN', 'BEEF', 'HAM', 'PORK', 'FISH', 'TURKEY', 'SALMON', 'TUNA']
# Store the result: only the last expression of a cell is displayed, so a bare
# call in the middle of the cell would be thrown away
vegetarian_data = remove_rows(final_data, 'ingredients', meats_and_fish)

# Vegan Example
# Terms are matched as whole words, so singular/plural forms and spelling
# variants (GELATIN / GELATINE) each need their own entry.
# NOTE(review): 'BUTTER' also matches plant products such as "PEANUT BUTTER" -
# confirm whether that exclusion is intended.
animal_products = meats_and_fish + [
    'MILK', 'BUTTER', 'EGG', 'EGGS', 'HONEY', 'CHEESE', 'YOGURT', 'CREAM',
    'GELATIN', 'GELATINE',
]

vegan_data = remove_rows(final_data, 'ingredients', animal_products)

# Function to keep certain values depending on certain strings
def keep_rows(df, column_name, search_strings):
    """Return the rows of ``df`` whose text in ``column_name`` contains any of ``search_strings``.

    Matching is case-insensitive and restricted to whole words (each term is
    wrapped in ``\\b`` word boundaries), so 'HAM' does not match 'HAMBURGER'.
    The input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter.
    column_name : str
        Column whose values are searched; non-string values are compared by
        their ``str()`` form and missing values never match.
    search_strings : str or iterable of str
        Terms to look for. A single string is treated as one term. With no
        terms, no row can match and an empty frame is returned.

    Returns
    -------
    pandas.DataFrame
        A new frame holding only the rows with at least one match.
    """
    # A lone string would otherwise be iterated character by character
    if isinstance(search_strings, str):
        search_strings = [search_strings]
    search_strings = list(search_strings)
    # An empty alternation matches every string; with nothing to look for,
    # no row qualifies
    if not search_strings:
        return df.iloc[:0].copy()

    # Create a regex pattern matching any of the search strings as whole words
    search_pattern = '|'.join(r'\b' + re.escape(term) + r'\b' for term in search_strings)
    # Search a text copy of the column so the caller's frame stays untouched;
    # missing values become "" so they can never match a term
    column = df[column_name]
    text = column.astype(object).where(column.notna(), "").astype(str)
    has_match = text.str.contains(search_pattern, case=False, na=False, regex=True)
    # Keep the rows where the column contains any of the search strings
    return df[has_match.to_numpy(dtype=bool)]

# Carnivore example: show only the products whose ingredients mention a meat or fish term
keep_rows(df=final_data, column_name='ingredients', search_strings=meats_and_fish)

Unnamed: 0_level_0,GTIN/UPC,Price,Branded Food Category,not_a_significant_source_of,serving_size,serving_size_unit,household_serving_fulltext,package_weight,ingredients
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
WILD ALASKAN PINK SALMON,35927,3.99,Canned Seafood,,63.0,g,,,"ALASKAN PINK SALMON, SALT."
PREMIUM CHUNK WHITE CHICKEN IN BROTH,38409,1.99,Canned Meat,,61.0,g,,,"WHITE CHICKEN, CHICKEN BROTH, SALT."
"TRADER JOE'S, MUSHROOM & HERB RISOTTO",99080,3.79,Rice,,42.0,g,,,"SUPERFINO ARBORIO RICE, ONIONS***, MUSHROOM***..."
"TRADER JOE'S, PREMIUM CHUNK WHITE CHICKEN IN WATER",434652,3.99,Canned Meat,,71.0,g,0.33 cup,,"CHICKEN BREAST MEAT, WATER, SALT."
TURKEY CHILI WITH BEANS,479363,2.69,Chili & Stew,,247.0,g,1 cup,,"WATER, DARK TURKEY MEAT, BEANS (RED AND/OR PIN..."
"TRADER JOE'S, UNCURED BACON JAM, BACON, BACON",545198,4.49,Canned Meat,,15.0,g,1 Tbsp,,COOKED APPLEWOOD SMOKED UNCURED BACON NO NITRA...
OVEN ROASTED TURKEY BREAST,815949,5.99,"Pepperoni, Salami & Cold Cuts",,56.0,g,,,"TURKEY BREAST, WATER, CONTAINS LESS THAN 2% OF..."
"TRADER JOE'S, SMOKED TURKEY BREAST",815963,5.99,"Pepperoni, Salami & Cold Cuts",,56.0,g,,,"TURKEY BREAST, WATER, CONTAINS LESS THAN 2% OF..."
UNCURED BLACK FOREST HAM,868631,4.69,"Pepperoni, Salami & Cold Cuts",,56.0,g,,,"PORK, WATER, VINEGAR, KOSHER SALT, TURBINADO S..."
"TRADER JOE'S, SAVORY BROTH, CHICKEN, CHICKEN",908597,1.99,Canned Soup,"NOT A SIGNIFICANT SOURCE OF TRANS FAT, DIETARY...",9.6,g,,4.06 oz/115 g,"CHICKEN STOCK, MALTODEXTRIN (CORN), NATURAL FL..."
