In [1]:
import polars as pl
import json

In [2]:
def longest_common_subsequence(str1, str2):
    """Return the length of the longest common subsequence of two strings.

    A subsequence keeps the relative order of characters but does not need
    to be contiguous.

    Args:
    - str1 (str): First string.
    - str2 (str): Second string.

    Returns:
    - int: Length of the longest subsequence present in both strings
      (0 when either string is empty or nothing is shared).
    """
    rows, cols = len(str1), len(str2)
    # table[r][c] is the LCS length of str1[:r] and str2[:c];
    # row 0 and column 0 stand for the empty prefix and stay at 0.
    table = [[0] * (cols + 1) for _ in range(rows + 1)]

    for r, left_char in enumerate(str1, start=1):
        above, current = table[r - 1], table[r]
        for c, right_char in enumerate(str2, start=1):
            if left_char != right_char:
                # No match: carry over the better of dropping one character
                # from either string.
                current[c] = max(above[c], current[c - 1])
            else:
                # Match: extend the LCS of both shorter prefixes.
                current[c] = above[c - 1] + 1

    return table[rows][cols]

def find_largest_common_subsequence(string, string_list):
    """
    Pick the candidate whose longest common subsequence with `string`
    is the longest.

    Args:
    - string (str): The input string.
    - string_list (list): Candidate strings to compare against `string`.

    Returns:
    - str or None: The best-matching candidate. On ties the earliest
      candidate wins. None is returned when the list is empty or when no
      candidate shares any character with `string`.
    """
    # Score every candidate lazily; max() keeps the first of equal scores,
    # which matches a "strictly greater replaces" scan.
    scored = (
        (longest_common_subsequence(string, candidate), candidate)
        for candidate in string_list
    )
    best_length, best_candidate = max(
        scored, key=lambda pair: pair[0], default=(0, None)
    )

    # A score of 0 means nothing in common, which counts as "no match".
    return best_candidate if best_length > 0 else None

In [14]:
# Build Food_Data.csv: one row per known supermarket item, taken from the
# Open Food Facts export, with its nutriscore value, image URL and category.

# Lazily scan the tab-separated Open Food Facts export; nothing is read until collect().
data = pl.scan_csv('C:/Users/aurel/Downloads/HS23/CIR/Automatic-Cart/en.openfoodfacts.org.products.csv', separator='\t')
# Open with an explicit encoding so the platform default codec is never used
# to decode the JSON catalogue.
with open('C:/Users/aurel/Downloads/HS23/CIR/Automatic-Cart/ImageToGraph/supermarket_items.json', encoding='utf-8') as f:
    items = json.load(f)

# Flat list of every item name, leaving out the category at position 6
items_list = []
for i, product in enumerate(items.values()):
    if i != 6:  # Skip the tools
        items_list.extend(product)

# Create a dictionary with {item: category}.
# The inner list gets its own name so that `items` keeps referring to the
# loaded catalogue instead of being rebound to the last category's list.
transformed_dict = {}
for category, category_items in items.items():
    for item in category_items:
        transformed_dict[item] = category

# Select only relevant columns
to_keep = ['product_name', 'nutriscore_score', 'image_url']  # , 'energy-kcal_100g', 'fat_100g','carbohydrates_100g','proteins_100g']
data = data.select(to_keep)

# Keep rows whose product name contains at least one known item name.
# NOTE(review): the names are joined into a regex alternation without
# escaping, so an item containing regex metacharacters would change the
# pattern, and an empty items_list yields a pattern that matches every row.
data = data.filter(pl.col('product_name').str.contains('|'.join(items_list)))

# Filter by non_null columns
required_columns = ["nutriscore_score", "image_url"]
data = data.filter(*[pl.col(col).is_not_null() for col in required_columns])

# Rename product names according to what we have already: each raw name is
# replaced by the catalogue item sharing the longest common subsequence with it.
data = data.with_columns(
    (
        pl.col('product_name').map_elements(
            lambda x: find_largest_common_subsequence(x, items_list),
            return_dtype=pl.Utf8,  # the helper returns an item name (str) or None
        )
    ).alias('product_name')
)
# Add category column
data = data.with_columns(pl.col('product_name').map_dict(transformed_dict).alias('category'))
# Keep a single row per (renamed) product
data = data.unique('product_name')
data = data.collect()
data.write_csv('Food_Data.csv')


In [31]:
import pandas as pd 

# Load the table written to Food_Data.csv by the polars step (path is relative to the working directory)
df = pd.read_csv('Food_Data.csv')

def convert_to_letter_scale(row):
    """Convert a row's numeric nutriscore value into a letter grade 'A'-'E'.

    Drinks and all other products use different cut-offs. The previous
    version had the two threshold tables attached to the wrong branches
    (the general-food bounds -1/2/10/18 were applied to drinks and the
    beverage bounds 1/5/9 to everything else); they are now matched to the
    right category.

    Args:
    - row: Mapping-like row (e.g. a pandas Series or dict) providing
      'nutriscore_score' (number) and 'category' (str).

    Returns:
    - str: One of 'A', 'B', 'C', 'D', 'E'. A value above every upper bound
      (or one that fails every comparison, such as NaN) yields 'E'.
    """
    # Inclusive upper bounds for grades A-D; anything above the last is 'E'.
    food_scale = ((-1, 'A'), (2, 'B'), (10, 'C'), (18, 'D'))
    # NOTE(review): the published beverage scale reserves 'A' for water;
    # the <= 0 cut-off for 'A' is carried over from the original code - confirm.
    drink_scale = ((0, 'A'), (1, 'B'), (5, 'C'), (9, 'D'))

    value = row['nutriscore_score']
    scale = drink_scale if row['category'] == 'Drinks' else food_scale

    for upper_bound, letter in scale:
        if value <= upper_bound:
            return letter
    return 'E'

# Replace each row's numeric score with its letter grade
# (axis=1 hands whole rows to the conversion function)
df['nutriscore_score'] = df.apply(convert_to_letter_scale, axis=1)

# Gather every remaining column into a list per product name, then turn the
# result into {product_name: {column: [values, ...]}}
grouped = df.groupby('product_name').agg(lambda column: column.tolist())
result_dict = grouped.to_dict(orient='index')

with open('food_info.json', 'w') as json_file:
    json.dump(result_dict, json_file, indent=2)