In [None]:
#Packages Used
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import readability
import time
from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.stats import pearsonr
import itertools
from itertools import combinations,chain,product,permutations

In [None]:
# Load the FoodData Central branded-food products table from the working directory.
# NOTE(review): the original comment described this file as a 500-record demo — confirm which extract is in use.
# low_memory=False makes pandas read the file in one pass instead of chunk-wise dtype guessing.
FoodDataCentral = pd.read_csv(
    filepath_or_buffer="branded_food.csv",
    low_memory=False,
)

In [None]:
# Optional helper table: number of products per branded food category,
# used to pick which category to analyse further down.
grouped_counts = (
    FoodDataCentral
    .groupby(['branded_food_category'])['branded_food_category']
    .count()
    .rename("bfc_count")
    .to_frame()
)
# To browse a slice of the ranking, uncomment:
#print(grouped_counts.sort_values(by = "bfc_count", ascending = False)[40:60])

In [None]:
# Keep only one branded food category ('Rice' in this case) and discard rows whose
# ingredient list is missing or is just a placeholder ("---" or ",").
rice_products = FoodDataCentral.query("branded_food_category == 'Rice'")
ingredient_text = rice_products["ingredients"]
placeholder_values = ["---", ","]
has_usable_ingredients = ingredient_text.notna() & ~ingredient_text.isin(placeholder_values)
FoodDataCentral = rice_products[has_usable_ingredients]

In [None]:
#Get Readability Scores- FoodData Central
# For every product, compute readability measures of its ingredient list, store them in
# readScores_FDC (also exported to CSV), and attach the score columns to FoodDataCentral.

# Column layout shared by the per-product records and the resulting DataFrame.
id_columns = ["fdc_id", "gtin_upc", "branded_food_category"]
score_columns = ["Kincaid_Score", "FleschReadingEase", "DaleChallIndex",
                 "num_words", "complex_words_dc"]

readability_scores = []
for index, row in FoodDataCentral.iterrows():
    ingredients = row["ingredients"]
    if pd.isna(ingredients) or ingredients == "---":
        # No usable text: keep the identifiers and fill every score with NaN so the
        # record has the same width as the column list (the previous 4-tuple did not).
        # NaN (rather than pd.NA) keeps the score columns numeric.
        curr_record = (row['fdc_id'], row['gtin_upc'], row['branded_food_category']) \
                      + (np.nan,) * len(score_columns)
    else:
        # Compute the measures once per product and pick the individual values from the
        # result, instead of re-running the analysis for every column.
        # NOTE(review): the original note here said getmeasures() tokenizes its input
        # automatically; the readability package documentation appears to ask for
        # pre-tokenized text — confirm before relying on the absolute scores.
        measures = readability.getmeasures(ingredients)
        grades = measures['readability grades']
        sentence_info = measures['sentence info']
        curr_record = (row['fdc_id'], row['gtin_upc'], row['branded_food_category'],
                       grades['Kincaid'],
                       grades['FleschReadingEase'],
                       grades['DaleChallIndex'],
                       sentence_info['words'],
                       sentence_info['complex_words_dc'])
    readability_scores.append(curr_record)

readScores_FDC = pd.DataFrame(data = readability_scores, columns = id_columns + score_columns)

readScores_FDC.to_csv("FoodData_Central_Readability.csv", sep=",")

# readScores_FDC has a fresh 0..n-1 index while FoodDataCentral still carries the row
# labels of the full database, so a label-aligned assignment would scatter NaNs / wrong
# rows. The records were built in FoodDataCentral's row order, so assign by position.
# Only the score columns are attached; the identifier columns are already present.
FoodDataCentral = FoodDataCentral.copy()  # own the data; avoids writing into a filtered view
for value in score_columns:
    FoodDataCentral[value] = readScores_FDC[value].values
    

In [None]:
# Pairwise Flesch Reading Ease differences, flattened row by row:
# for every ordered pair (y, x) of products the entry is score(y) - score(x).
flesch_scores = FoodDataCentral["FleschReadingEase"]
difference_matrix_fl = [y - x for y, x in product(flesch_scores, flesch_scores)]

In [None]:
# Pairwise Dale-Chall index differences, flattened row by row:
# for every ordered pair (y, x) of products the entry is score(y) - score(x).
dale_chall_scores = FoodDataCentral["DaleChallIndex"]
difference_matrix_dc = [y - x for y, x in product(dale_chall_scores, dale_chall_scores)]

In [None]:
# Store the FDC IDs as strings instead of numbers.
fdc_ids_as_text = FoodDataCentral["fdc_id"].astype("str")
FoodDataCentral = FoodDataCentral.assign(fdc_id=fdc_ids_as_text)

In [None]:
# Map each row position (0, 1, 2, ...) to the FDC ID found there, for looking up
# which product a matrix row/column refers to.
fdc_indices = {position: fdc_id for position, fdc_id in enumerate(FoodDataCentral["fdc_id"])}

In [None]:
# Every ordered pair of FDC IDs, including each product paired with itself,
# in the same row-major order as the flattened difference matrices.
fdcID_pairs = [
    (first_id, second_id)
    for first_id in FoodDataCentral["fdc_id"]
    for second_id in FoodDataCentral["fdc_id"]
]

In [None]:
#Gather list of ingredient lists
documents = list(FoodDataCentral['ingredients'].values)

#Create Model
# The corpus must NOT be passed to the constructor: its first parameter is `input`
# (how to read documents), and current scikit-learn makes constructor parameters
# keyword-only, so passing the documents there raises TypeError (older versions
# silently stored them as a bogus `input` value). The text is supplied to fit() instead.
count_vectorizer = CountVectorizer(stop_words='english')

#Fit model to ingredient list (learns the vocabulary, ignoring English stop words)
count_vectorizer.fit(documents)

In [None]:
# Turn every ingredient list into a dense term-count vector using the fitted
# vocabulary, and persist the matrix with np.save.
documents_1 = FoodDataCentral['ingredients'].tolist()
term_counts = count_vectorizer.transform(documents_1)
vectors = term_counts.toarray()
np.save("Vectors_Batch_3_test", vectors)

In [None]:
# Pairwise cosine similarity between all ingredient vectors, flattened row by row
# so it lines up with the ID pairs and the readability difference lists.
cos_sim = cosine_similarity(vectors)
cos_sim_flat = cos_sim.ravel()
# Rescale by the L2 norm of the whole flattened matrix. This divides every entry by
# the same positive constant, so correlations computed from it are unchanged.
cos_sim_flat = cos_sim_flat / np.linalg.norm(cos_sim_flat)

In [None]:
# One row per ordered product pair: the two FDC IDs, their readability
# differences, and their (rescaled) cosine similarity.
a = pd.DataFrame(fdcID_pairs, columns = ["fdc_id 1","FDC_id 2"]).assign(
    DaleChallDiff=difference_matrix_dc,
    Cosine_similarity=cos_sim_flat,
    Flesch_diff=difference_matrix_fl,
)

In [None]:
# Data preparation: treat +/- infinity as missing, drop every pair with a missing
# value, then export the cleaned table.
a = a.replace([np.inf, -np.inf], np.nan).dropna()
a.to_csv("Readability_and_CosineSimilarity_scores.csv")

In [None]:
#Plot Association- Dale Chall diff vs Cosine Similarity
plt.scatter(a["DaleChallDiff"], a["Cosine_similarity"])
plt.xlabel("Dale-Chall index difference")
plt.ylabel("Cosine similarity")
# Save BEFORE show(): once show() returns, the current figure is typically closed or
# cleared, so a later savefig() would write an empty image.
plt.savefig("DaleChall_vs_CosineSimilarity.png")
plt.show()

In [None]:
#Scatter Plot- Dale Chall diff vs Flesch diff
plt.scatter(a["DaleChallDiff"], a["Flesch_diff"])
plt.xlabel("Dale-Chall index difference")
plt.ylabel("Flesch Reading Ease difference")
# Save BEFORE show(): once show() returns, the current figure is typically closed or
# cleared, so a later savefig() would write an empty image.
plt.savefig("DaleChall_vs_Flesch.png")
plt.show()

In [None]:
#Scatter Plot- Flesch diff vs Cosine Similarity
plt.scatter(a["Flesch_diff"], a["Cosine_similarity"])
plt.xlabel("Flesch Reading Ease difference")
plt.ylabel("Cosine similarity")
# Save BEFORE show(): once show() returns, the current figure is typically closed or
# cleared, so a later savefig() would write an empty image.
plt.savefig("Flesch_vs_CosineSimilarity.png")
plt.show()

In [None]:
# Pearson correlation (and p-value) for each pair of measures, printed in the order:
# Dale-Chall vs cosine, Flesch vs cosine, Dale-Chall vs Flesch.
correlation_pairs = (
    ("DaleChallDiff", "Cosine_similarity"),
    ("Flesch_diff", "Cosine_similarity"),
    ("DaleChallDiff", "Flesch_diff"),
)
for first_measure, second_measure in correlation_pairs:
    print(pearsonr(a[first_measure], a[second_measure]))