In [None]:
#Packages Used
import pandas as pd
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
from nltk.tokenize import word_tokenize
import readability

## Import Datasets

Datasets Used:

- OpenFoodFacts Product Dataset (2,168,141 rows x 191 columns)

- FoodDataCentral Branded Food Dataset (1,555,131 rows x 17 columns)

For now, empty product names are filled in as "Unknown" so that those rows can still be included while we check whether further analysis is possible. A future lookup, either via the website or by parsing the database, is required to verify the identity of these products.

- OpenFoodFacts Product Dataset, subset where country = United States (412,108 rows x 191 columns)

In [None]:
# Load the Open Food Facts demo extract; rows without a product name are
# labelled "Unknown" so they are kept in the frame.
OpenFoodFacts_Demo = pd.read_csv("OpenFoodFacts_Head_Demo.csv").fillna(
    {'product_name': "Unknown"}
)

# List every available column, one per line.
for column_name in OpenFoodFacts_Demo.columns:
    print(column_name)

In [None]:
# Show the brand column of the Open Food Facts demo frame.
print(OpenFoodFacts_Demo.loc[:, "brands"])

In [None]:
# Load the FoodData Central demo extract and echo it for a quick visual check.
FoodDataCentral_Demo = pd.read_csv(
    filepath_or_buffer="FoodDataCentral_Head_Demo.csv",
)
print(FoodDataCentral_Demo)

In [None]:
#Get Readability Scores- Open Food Facts
# Builds one record per product: (product_name, code, Flesch-Kincaid grade,
# Flesch Reading Ease, word count). Products without usable ingredient text get
# pd.NA for all three measures. The result is written to CSV and the three
# measures are copied back onto OpenFoodFacts_Demo.

# Placeholder strings found in `ingredients_text` that carry no ingredient list.
OFF_PLACEHOLDER_INGREDIENTS = ['🍎', "..."]

readability_scores = []
for index, row in OpenFoodFacts_Demo.iterrows():
    ingredients_text = row["ingredients_text"]
    # Blank / whitespace-only text has no words to score, so treat it as missing.
    is_blank = isinstance(ingredients_text, str) and not ingredients_text.strip()
    if pd.isna(ingredients_text) or is_blank or ingredients_text in OFF_PLACEHOLDER_INGREDIENTS:
        curr_record = (row['product_name'], row['code'], pd.NA, pd.NA, pd.NA)
    else:
        # readability.getmeasures() tokenizes the input by default and returns a
        # nested set of readability measures. Compute it once per row and read
        # the three values we keep from that single result.
        measures = readability.getmeasures(ingredients_text)
        curr_record = (row['product_name'], row['code'],
                       measures['readability grades']['Kincaid'],
                       measures['readability grades']['FleschReadingEase'],
                       measures['sentence info']['words'])
    readability_scores.append(curr_record)
    print(list(curr_record))


# Column labels must follow the tuple order used above (product_name first, then
# code); labelling them the other way round swaps the two columns.
# Reusing the source index keeps the column assignments below aligned row-for-row.
readScores_OFF = pd.DataFrame(data = readability_scores,
                              columns = ["product_name", "code", "Kincaid_Score","FleschReadingEase","num_words"],
                              index = OpenFoodFacts_Demo.index)
readScores_OFF.to_csv("OpenFoodFacts_Readability.csv", sep=",")
OpenFoodFacts_Demo['Flesch-Kincaid'] = readScores_OFF['Kincaid_Score']
OpenFoodFacts_Demo['FleschReadingEase'] = readScores_OFF['FleschReadingEase']
OpenFoodFacts_Demo['num_words'] = readScores_OFF['num_words']
print(OpenFoodFacts_Demo)

In [None]:
# Sample: ingredient text of the FoodData Central row labelled 2.
print(FoodDataCentral_Demo.loc[2, 'ingredients'])

In [None]:
# Sample: ingredient text of the Open Food Facts row labelled 1.
print(OpenFoodFacts_Demo.loc[1, 'ingredients_text'])

In [None]:
#Readability scores example
# Display the full set of measures returned for one FoodData Central
# ingredient list (row labelled 1).

readability.getmeasures(FoodDataCentral_Demo.loc[1, 'ingredients'])

In [None]:
#Get Readability Scores- FoodData Central
# Builds one record per product: (fdc_id, gtin_upc, branded_food_category,
# Flesch-Kincaid grade, Flesch Reading Ease, word count). Products without
# usable ingredient text get pd.NA for all three measures. The result is
# written to CSV and the three measures are copied back onto FoodDataCentral_Demo.

# Placeholder strings found in `ingredients` that carry no ingredient list.
FDC_PLACEHOLDER_INGREDIENTS = ["---"]

readability_scores = []
for index, row in FoodDataCentral_Demo.iterrows():
    ingredients = row["ingredients"]
    # Check for missing text before anything tokenizes it: a NaN value is not a
    # string. Blank / whitespace-only text has no words to score either.
    is_blank = isinstance(ingredients, str) and not ingredients.strip()
    if pd.isna(ingredients) or is_blank or ingredients in FDC_PLACEHOLDER_INGREDIENTS:
        # Keep the category and supply one pd.NA per measure so every record has
        # the same six fields as the column list below.
        curr_record = (row['fdc_id'], row['gtin_upc'], row['branded_food_category'],
                       pd.NA, pd.NA, pd.NA)
    else:
        print(ingredients)
        # readability.getmeasures() tokenizes the input by default and returns a
        # nested set of readability measures. Compute it once per row and read
        # the three values we keep from that single result.
        measures = readability.getmeasures(ingredients)
        curr_record = (row['fdc_id'], row['gtin_upc'], row['branded_food_category'],
                       measures['readability grades']['Kincaid'],
                       measures['readability grades']['FleschReadingEase'],
                       measures['sentence info']['words'])
    readability_scores.append(curr_record)


# Reusing the source index keeps the column assignments below aligned row-for-row.
readScores_FDC = pd.DataFrame(data = readability_scores,
                              columns = ["fdc_id", "gtin_upc","branded_food_category","Kincaid_Score","FleschReadingEase","num_words"],
                              index = FoodDataCentral_Demo.index)
print(readScores_FDC)
readScores_FDC.to_csv("FoodData_Central_Readability.csv", sep=",")
FoodDataCentral_Demo['Flesch-Kincaid'] = readScores_FDC['Kincaid_Score']
FoodDataCentral_Demo['FleschReadingEase'] = readScores_FDC['FleschReadingEase']
FoodDataCentral_Demo['num_words'] = readScores_FDC['num_words']

In [None]:
#Plot Mean Readability by Category
# Average only the readability measures: the mean of an ID/barcode column is
# meaningless, and non-numeric columns make .mean() fail on recent pandas.
# The score columns can be object dtype (they mix numbers with pd.NA), so they
# are coerced to numeric first; missing scores become NaN and are skipped.
score_columns = ["Kincaid_Score", "FleschReadingEase", "num_words"]
df = (
    readScores_FDC[score_columns]
    .apply(pd.to_numeric, errors="coerce")
    .groupby(readScores_FDC["branded_food_category"])
    .mean()
)
print(df)
plt.scatter(df["num_words"], df['FleschReadingEase'])
plt.xlabel("Mean number of words in ingredient list")
plt.ylabel("Mean Flesch Reading Ease")
plt.title("Mean readability by branded food category (FoodData Central)")
plt.show()

In [None]:
#Plot by Brand
# FoodDataCentral_Demo also holds text columns (e.g. the ingredient lists), so
# restrict the average to the readability measures added to the frame and
# coerce them to numeric; missing scores become NaN and are skipped by .mean().
score_columns = ["Flesch-Kincaid", "FleschReadingEase", "num_words"]
df = (
    FoodDataCentral_Demo[score_columns]
    .apply(pd.to_numeric, errors="coerce")
    .groupby(FoodDataCentral_Demo["brand_owner"])
    .mean()
)
print(df['FleschReadingEase'])
plt.scatter(df["num_words"], df['FleschReadingEase'])
plt.xlabel("Mean number of words in ingredient list")
plt.ylabel("Mean Flesch Reading Ease")
plt.title("Mean readability by brand owner (FoodData Central)")
plt.show()

In [None]:
# Plot mean readability by brand (Open Food Facts).
# Only rows that have a brand are considered.
series = pd.notnull(OpenFoodFacts_Demo['brands'])

# The Open Food Facts frame has many text columns, so restrict the average to
# the readability measures added to the frame and coerce them to numeric;
# missing scores become NaN and are skipped by .mean().
score_columns = ["Flesch-Kincaid", "FleschReadingEase", "num_words"]
df = (
    OpenFoodFacts_Demo.loc[series, score_columns]
    .apply(pd.to_numeric, errors="coerce")
    .groupby(OpenFoodFacts_Demo.loc[series, 'brands'])
    .mean()
)
print(df)
plt.scatter(df["num_words"], df['FleschReadingEase'])
plt.xlabel("Mean number of words in ingredient list")
plt.ylabel("Mean Flesch Reading Ease")
plt.title("Mean readability by brand (Open Food Facts)")
plt.show()