In [1]:
import pandas as pd
import re
import numpy as np

# Read the original cleaned CSV file ('NEWTJBase.csv', resolved relative to the
# current working directory). The rest of this cell rebinds and mutates `df`.
df = pd.read_csv('NEWTJBase.csv')

# Function to convert package_size values to standard weight/volume in grams or milliliters.
def convert_package_size(val):
    """Convert a "number unit" package size into grams or milliliters.

    Returns:
        * ``val`` unchanged when it is not a string, is blank, does not start
          with a number followed by a unit, or uses a unit that is not
          converted here;
        * ``""`` for count-based units ("each", "doz", "dozen");
        * otherwise the converted amount formatted with two decimals and a
          "g" or "ml" suffix, e.g. "16 oz" -> "453.60 g".
    """
    if not isinstance(val, str) or not val.strip():
        return val

    # Expected shape: a leading number, optional whitespace, then the unit text.
    parsed = re.match(r"([\d\.]+)\s*(\D+)", val)
    if parsed is None:
        return val

    try:
        amount = float(parsed.group(1))
    except Exception:
        # The numeric group also accepts strings such as "1.2.3".
        return val

    # Unit comparison is case-insensitive and ignores surrounding whitespace.
    unit_key = parsed.group(2).strip().lower()

    # Count-based sizes carry no weight/volume: blank them out.
    if unit_key in ('each', 'doz', 'dozen'):
        return ""

    # unit -> (multiplier, target unit)
    conversions = {
        'oz': (28.35, 'g'),
        'lb': (453.59, 'g'),
        'pound': (453.59, 'g'),
        'lbs': (453.59, 'g'),
        'fl oz': (29.57, 'ml'),
        'fluid oz': (29.57, 'ml'),
        'pint': (473.18, 'ml'),
        'qt': (946.35, 'ml'),
        'quart': (946.35, 'ml'),
    }
    rule = conversions.get(unit_key)
    if rule is None:
        # Not a unit we convert: hand back the original text.
        return val

    factor, target_unit = rule
    converted = amount * factor
    return f"{converted:.2f} {target_unit}"

# Convert package sizes to grams/milliliters when that column is present.
if "package_size" in df.columns:
    df["package_size"] = df["package_size"].map(convert_package_size)

# Keep only the first 20 columns; everything to their right is discarded.
df = df.iloc[:, :20]

# Function to remove "/" and any subsequent text from a string.
def remove_slash_and_after(val):
    """Drop each "/" together with the rest of its line, then trim whitespace.

    Non-string values are returned untouched.
    """
    if not isinstance(val, str):
        return val
    # "." does not match a newline, so only the remainder of the line is cut.
    truncated = re.sub(r"/.*", "", val)
    return truncated.strip()

# Apply the removal function to every cell in the DataFrame.
# NOTE: this covers all remaining columns, not only the unit columns, so any
# string cell containing "/" is truncated at that slash.
df = df.apply(lambda col: col.map(remove_slash_and_after))

# Standardize units in cells
# Units whose presence marks the whole row for removal (non-standard food
# items or descriptors).
_REMOVAL_UNITS = frozenset(
    ['cookies', 'chips', 'pieces', 'burger', 'iu', 'inch cube', 'inch cube)']
)

# Conversion rules are (multiplier, output unit, decimals). The number of
# decimals keeps two decimal places of the *source* unit, so small mg / mcg
# amounts are not rounded away to zero once expressed in grams.
_TO_ML = (1.0, 'ml', 2)
_TO_G = (1.0, 'g', 2)
_MG_TO_G = (0.001, 'g', 5)
_MCG_TO_G = (1e-6, 'g', 8)
_KG_TO_G = (1000.0, 'g', 2)

# Exact unit spellings (already lower-cased, dashes removed).
_EXACT_UNIT_RULES = {
    # Volume -> milliliters
    'tbsp.': (14.79, 'ml', 2),
    'tbsp': (14.79, 'ml', 2),
    'fl oz': (29.57, 'ml', 2),
    'ml': _TO_ML,
    # Weight -> grams
    'mg': _MG_TO_G,
    'mgmg': _MG_TO_G,
    'mcg': _MCG_TO_G,
    'ounce': (28.35, 'g', 2),
    'oz': (28.35, 'g', 2),
    # "g per" is assumed to be just grams.
    'g per': _TO_G,
    'gper': _TO_G,
    'gg': _TO_G,
    'g about': _TO_G,
    'grams': _TO_G,
    'g': _TO_G,
    'gadded sugar': _TO_G,
}

# Prefix rules for units followed by trailing noise ("mcg dfe", "mg)", "ml)").
# They are checked before the loose substring fallback so that a prefixed
# metric unit is scaled correctly instead of being read as plain grams.
_PREFIX_UNIT_RULES = (
    ('mcg', _MCG_TO_G),
    ('mg', _MG_TO_G),
    ('kg', _KG_TO_G),
    ('ml', _TO_ML),
)


def standardize_cell(cell):
    """Standardize the quantity at the end of a cell to grams or milliliters.

    The rightmost number in ``cell`` is taken as the amount and the text that
    follows it (lower-cased, dashes removed) as the unit.

    Returns:
        * ``cell`` unchanged when it is not a string, is blank, contains no
          number, or its unit cannot be determined;
        * ``None`` when the unit indicates a non-standard food item, which the
          caller uses to mark the row for removal;
        * otherwise a string "<amount> g" or "<amount> ml". Amounts are
          formatted with two decimals, except mg (five) and mcg (eight) so
          that sub-gram values keep their precision after conversion.
    """
    if not isinstance(cell, str) or cell.strip() == "":
        return cell

    # Find all occurrences of a number (with optional decimal part).
    matches = list(re.finditer(r'\d+(?:\.\d+)?', cell))
    if not matches:
        return cell
    last_match = matches[-1]
    try:
        number = float(last_match.group())
    except ValueError:
        return cell

    # Everything to the right of the rightmost number is the unit text.
    unit_raw = cell[last_match.end():].strip().lower()
    # Remove dashes and collapse double spaces.
    unit = unit_raw.replace('-', '').replace('  ', ' ')

    # Non-standard food items or descriptors: mark for removal.
    if unit in _REMOVAL_UNITS:
        return None

    rule = _EXACT_UNIT_RULES.get(unit)
    if rule is None:
        rule = next(
            (candidate for prefix, candidate in _PREFIX_UNIT_RULES
             if unit.startswith(prefix)),
            None,
        )
    if rule is None:
        # Loose heuristic kept from the original behavior: any remaining unit
        # text containing "g" is read as grams, then "ml" as milliliters.
        if 'g' in unit:
            rule = _TO_G
        elif 'ml' in unit:
            rule = _TO_ML
        else:
            # Unit cannot be determined: leave the cell as it was.
            return cell

    factor, out_unit, decimals = rule
    return f"{number * factor:.{decimals}f} {out_unit}"

# Standardize units cell by cell in every column from the 4th onward, and
# remember the index of each row that contains a non-standard unit.
unit_columns = df.columns[3:]
rows_to_drop = set()
for idx in df.index:
    for col in unit_columns:
        cell_val = df.at[idx, col]
        if not isinstance(cell_val, str):
            continue
        standardized = standardize_cell(cell_val)
        if standardized is None:
            # The row will be dropped, so its remaining cells are skipped.
            rows_to_drop.add(idx)
            break
        df.at[idx, col] = standardized

# Drop rows marked for removal.
if rows_to_drop:
    df = df.drop(index=rows_to_drop)

# Fill blank or missing "Vitamin D", "Calcium", "Iron", "Potassium" with "0.0 g".
for nutrient in ("Vitamin D", "Calcium", "Iron", "Potassium"):
    if nutrient not in df.columns:
        continue
    # Whitespace-only strings are treated as missing before filling.
    df[nutrient] = (
        df[nutrient]
        .replace(r'^\s*$', np.nan, regex=True)
        .fillna("0.0 g")
    )

# Treat empty / whitespace-only cells as NaN, then remove every row that still
# has a missing value in any column.
df = df.replace(r'^\s*$', np.nan, regex=True).dropna()

# Save the fully processed DataFrame to a new CSV file.
output_file = 'NEWTJBase2.csv'
df.to_csv(output_file, index=False)
print(f"Processed file saved as {output_file}")


Processed file saved as NEWTJBase2.csv


In [6]:
import pandas as pd
import re

# Load the CSV file.
df = pd.read_csv("NEWTJBase2.csv")

# Delete the "calculated_calories" column if it exists.
if "calculated_calories" in df.columns:
    df = df.drop(columns=["calculated_calories"])

# Remove the "$" symbol from the "price" column.
if "price" in df.columns:
    # Convert to string (if necessary) and remove the "$" character.
    df["price"] = df["price"].astype(str).str.replace("$", "", regex=False)


def _strip_letters(value):
    """Remove ASCII letters from a string cell and trim leftover whitespace.

    Non-string values (numbers, NaN, booleans) are returned untouched:
    stringifying them first would corrupt floats printed in scientific
    notation ("1e-05" would become "1-05").
    """
    if not isinstance(value, str):
        return value
    # Stripping afterwards turns "453.60 g" into "453.60" rather than "453.60 ".
    return re.sub(r'[A-Za-z]', '', value).strip()


# Remove all letters from cells in every column except the "name" column.
for col in df.columns:
    if col.lower() != "name":  # Skip the 'name' column
        df[col] = df[col].map(_strip_letters)

# Save the cleaned DataFrame to a new CSV file.
output_file = "NEWTJBase3.csv"
df.to_csv(output_file, index=False)
print(f"Cleaned data saved to {output_file}")


Cleaned data saved to NEWTJBase3.csv


In [9]:
import pandas as pd

# Load the cleaned data produced by the letter-stripping step.
df = pd.read_csv("NEWTJBase3.csv")

# Manual correction: set serving_size to 138 for every row whose name is
# exactly "Four Cheese Scalloped Potatoes".
is_target_item = df['name'] == "Four Cheese Scalloped Potatoes"
df.loc[is_target_item, 'serving_size'] = 138

# Overwrite the same CSV file with the corrected data.
df.to_csv("NEWTJBase3.csv", index=False)


In [1]:
import pandas as pd

# Read the CSV file into a DataFrame. The row check below reads the
# 'serves_about', 'package_size' and 'serving_size' columns from it.
df = pd.read_csv("NEWTJBase3.csv")

# Function to check and update serving_size
def check_update_serving_size(row, tolerance=0.1):
    """Return the serving size to keep for one row.

    The expected serving size is ``package_size / serves_about``. When the
    recorded ``serving_size`` deviates from it by more than ``tolerance``
    (relative to the expected value), the expected value is returned;
    otherwise the recorded value is returned.

    Args:
        row: Mapping-like row (e.g. a DataFrame row) providing
            'serves_about', 'package_size' and 'serving_size'.
        tolerance: Maximum accepted relative deviation; defaults to 0.1 (10%).

    Returns:
        The recorded or the recomputed serving size. The recorded value is
        kept whenever the expected value cannot be used as a reference
        (``serves_about`` of 0, or an expected serving size of 0). A NaN
        expected value also keeps the recorded value, because the comparison
        below is then False.
    """
    current_serving = row['serving_size']
    serves_about = row['serves_about']

    # Avoid division by zero when computing the expected serving size.
    if serves_about == 0:
        return current_serving

    expected_serving = row['package_size'] / serves_about

    # A zero expected serving (package_size of 0) has no meaningful relative
    # deviation and would divide by zero below.
    if expected_serving == 0:
        return current_serving

    relative_deviation = abs(current_serving - expected_serving) / expected_serving
    if relative_deviation > tolerance:
        return expected_serving
    return current_serving

# Apply the function to each row (axis=1) to update serving_size where necessary
df['serving_size'] = df.apply(check_update_serving_size, axis=1)

# Save the updated DataFrame to a new CSV file ("NEWTJBase4.csv"); the input
# file "NEWTJBase3.csv" is left as it was.
df.to_csv("NEWTJBase4.csv", index=False)


In [2]:
import pandas as pd

# Load the serving-size-corrected data.
df = pd.read_csv("NEWTJBase4.csv")

# Keep only rows priced at 0.1 or more. Rows with a missing price fail the
# comparison and are removed as well.
has_valid_price = df['price'] >= 0.1
df = df[has_valid_price]

# Overwrite the same CSV file with the filtered rows.
df.to_csv("NEWTJBase4.csv", index=False)
