# Tests involving parsing OpenFoodFacts and FoodData Central Databases

Databases Used:

OpenFoodFacts Product Database (2168141 rows x 191 columns)

FoodDataCentral Branded Food Database (1555131 rows x 17 columns)

In [None]:
#Packages Used
import pandas as pd
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
from nltk.tokenize import word_tokenize
import readability

In [None]:
# Load the full OpenFoodFacts product export (the file is tab-separated).
# `code` is a barcode identifier, not a quantity: read it as text so pandas
# does not infer a numeric dtype and drop leading zeros from the codes.
OpenFoodFacts = pd.read_csv(
    "OpenFoodFacts/en.openfoodfacts.org.products.csv",
    sep="\t",
    dtype={"code": str},
)
print(OpenFoodFacts)

In [None]:
# Load the FoodData Central branded-food table.
# `gtin_upc` is a barcode identifier: read it as text so pandas does not infer
# a numeric dtype and drop leading zeros from the codes.
FoodDataCentral = pd.read_csv(
    "FoodData_Central_csv_2021-10-28/branded_food.csv",
    dtype={"gtin_upc": str},
)
print(FoodDataCentral)

In [None]:
# Keep only the first 500 OpenFoodFacts rows as a small working sample.
OFF_head = OpenFoodFacts.iloc[:500]


In [None]:
# Persist the OpenFoodFacts sample (row index included) for demo purposes.
OFF_head.to_csv(path_or_buf="OpenFoodFacts_Head_Demo.csv")

In [None]:
# Keep only the first 500 FoodData Central rows as a small working sample.
FDC_head = FoodDataCentral.iloc[:500]

In [None]:
# Persist the FoodData Central sample (row index included) for demo purposes.
FDC_head.to_csv(path_or_buf="FoodDataCentral_Head_Demo.csv")

In [None]:
# Restrict to products whose countries_en value is exactly "United States"
# (rows with a missing or multi-country value do not match).
is_us_only = OpenFoodFacts["countries_en"] == "United States"
OFF_US = OpenFoodFacts[is_us_only]
print(OFF_US)

In [None]:
# Save the first 500 rows of the US-only subset as a demo file.
# NOTE(review): this writes to the same path as the OFF_head export, so running
# both cells leaves only the US sample on disk -- confirm the overwrite is intended.
OFF_US_head = OFF_US.iloc[:500]
OFF_US_head.to_csv(path_or_buf="OpenFoodFacts_Head_Demo.csv")

In [None]:
# Replace missing product names with the placeholder "Unknown".
# OFF_US is the result of filtering OpenFoodFacts; reassigning the filled frame
# (instead of inplace=True) avoids in-place mutation of a derived frame, which
# pandas can flag with SettingWithCopyWarning.
OFF_US = OFF_US.fillna({'product_name': "Unknown"})

In [None]:
#Get Readability Score
# Print the Flesch-Kincaid grade of every ingredient list in the 500-row sample.
# Missing values are skipped, and so are the placeholder strings that the
# scoring cells below exclude (the testing notes list them as "non word"
# ingredient lists that must be factored out before scoring succeeds).
for ingredients in OFF_head['ingredients_text']:
    if pd.isna(ingredients) or ingredients in ('🍎', "...", "."):
        continue
    print(readability.getmeasures(ingredients)['readability grades']['Kincaid'])


In [None]:
#Get Readability Score
# Build a (code, product_name, Kincaid_Score) table for the 500-row sample.
# Rows whose ingredient text is missing or is a known placeholder get pd.NA
# as their score instead of being passed to the readability library.
readability_scores = []
for index, row in OFF_head.iterrows():
    ingredients = row["ingredients_text"]
    if pd.isna(ingredients) or ingredients in ('🍎', "...", "."):
        score = pd.NA
    else:
        score = readability.getmeasures(ingredients)['readability grades']['Kincaid']
    # Field order must match the column labels passed to pd.DataFrame below;
    # previously product_name and code were swapped relative to the labels.
    curr_record = (row['code'], row['product_name'], score)
    readability_scores.append(curr_record)
    print(list(curr_record))

readScores_df = pd.DataFrame(data=readability_scores, columns=["code", "product_name", "Kincaid_Score"])
print(readScores_df)
    

In [None]:
#Get Readability Score- Open Food Facts (United States Foods)
# Build a (code, product_name, Kincaid_Score) table for the US-only subset and
# write it to CSV. Rows whose ingredient text is missing or is a known
# placeholder get pd.NA as their score.
#
# Only the three needed columns are iterated (instead of iterrows(), which
# builds a full Series for every row), and the per-row debug print of each
# ingredient list is dropped so the whole subset does not flood the output.
placeholder_ingredients = ('🍎', "...", ".")
readability_scores = []
for code, product_name, ingredients in zip(
    OFF_US['code'], OFF_US['product_name'], OFF_US['ingredients_text']
):
    if pd.isna(ingredients) or ingredients in placeholder_ingredients:
        score = pd.NA
    else:
        score = readability.getmeasures(ingredients)['readability grades']['Kincaid']
    readability_scores.append((code, product_name, score))

readScores_df = pd.DataFrame(data=readability_scores, columns=["code", "product_name", "Kincaid_Score"])
print(readScores_df)
readScores_df.to_csv("OpenFoodFacts_Readability.csv", sep=",")

In [None]:
# Write the current readScores_df (whichever scoring cell ran last) to the
# OpenFoodFacts readability CSV, row index included.
readScores_df.to_csv(path_or_buf="OpenFoodFacts_Readability.csv", sep=",")

In [None]:
# Display the US records whose ingredient text is exactly the apple emoji.
OFF_US[OFF_US["ingredients_text"] == '🍎']

In [None]:
# Display the FoodData Central records whose ingredient text is exactly '---'.
FoodDataCentral[FoodDataCentral["ingredients"] == '---']

In [None]:
# Report whether any US record lacks a barcode, then how many do.
code_missing = OFF_US['code'].isna()
print(code_missing.values.any())
print(code_missing.sum())

In [None]:
# Show the FoodData Central sample taken with head(500).
print(FDC_head)

In [None]:
# List every column name available in the US-only OpenFoodFacts subset.
print(OFF_US.columns.values)

In [None]:
#Get Readability Scores- FoodData Central
# Build a (fdc_id, gtin_upc, Kincaid_Score) table for the whole FoodData
# Central table and write it to CSV. Rows whose ingredient text is missing or
# is the '---' placeholder get pd.NA as their score.
#
# Only the three needed columns are iterated (instead of iterrows(), which
# builds a full Series for every row), and the per-row debug print of each
# ingredient list is dropped: the testing notes record that this dataset was
# too large to process, and printing every ingredient list adds to that load.
readability_scores = []
for fdc_id, gtin_upc, ingredients in zip(
    FoodDataCentral['fdc_id'], FoodDataCentral['gtin_upc'], FoodDataCentral['ingredients']
):
    if pd.isna(ingredients) or ingredients == "---":
        score = pd.NA
    else:
        score = readability.getmeasures(ingredients)['readability grades']['Kincaid']
    readability_scores.append((fdc_id, gtin_upc, score))

readScores_df = pd.DataFrame(data=readability_scores, columns=["fdc_id", "gtin_upc", "Kincaid_Score"])
print(readScores_df)
readScores_df.to_csv("FoodData_Central_Readability.csv", sep=",")

In [None]:
# Draw an undirected graph from an edge list on a 10x10 auto-laid-out figure.
# NOTE(review): `df` is not defined in any cell visible here; it must be a
# DataFrame with 'from' and 'to' columns -- confirm where it comes from.
plt.rcParams.update({
    "figure.figsize": [10.00, 10.00],
    "figure.autolayout": True,
})
G = nx.from_pandas_edgelist(df, source='from', target='to')
nx.draw(G, with_labels=True, node_size=100, alpha=1, linewidths=10)
plt.show()

In [None]:
# Write the current readScores_df (whichever scoring cell ran last) to the
# FoodData Central readability CSV, row index included.
readScores_df.to_csv(path_or_buf="FoodData_Central_Readability.csv", sep=",")

# **Testing Notes**

## 2-16-2022
- Both the OpenFoodFacts and FoodData Central databases contain entries with "non-standard" ingredient lists (e.g., emojis or no words). Further investigation of the datasets is required.

## 2-18-2022
- OpenFoodFacts "non-word" ingredient list records include the following values: '🍎', "...", and ".". Readability scores are computed successfully once these values are excluded. These values must be substituted before the records can be included in the analysis.

- The FoodData Central data is too large to be handled with the current memory. Further investigation of its ingredient lists is needed.

## 2-20-2022
- Refinement of tokenization is needed.
- Negative Flesch-Kincaid scores were observed.

## 2-24-2022
