In [None]:
#Packages Used
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import readability
import time
from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.stats import pearsonr
import itertools
from itertools import combinations,chain,product,permutations

In [None]:
# Scatter plots of a seeded random sample from each subset. The full pairwise
# tables hold millions of points, so up to 1000 rows are drawn (random_state=11)
# to approximate their behavior.

def _save_scatter(data, x_col, y_col, color, xlabel, ylabel, title, out_path):
    """Draw one scatter plot on its own figure, save it to out_path, and show it.

    data     -- table indexed by column name (here a sampled DataFrame)
    x_col    -- column plotted on the x axis
    y_col    -- column plotted on the y axis
    color    -- marker color handed to matplotlib
    xlabel, ylabel, title -- text placed on the axes
    out_path -- file name the figure is written to
    """
    fig, ax = plt.subplots()
    ax.scatter(data[x_col], data[y_col], c=color)
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    ax.set_title(title)
    fig.savefig(out_path)
    plt.show()
    # Close the figure explicitly so points from this plot can never carry
    # over into the next saved image and figures do not pile up in the loop.
    plt.close(fig)


for subset, color in [["Cream", "yellow"], ["Pasta-Dinners", "red"], ["Rice", "blue"], ["Random", "green"]]:
    df = pd.read_csv("".join((subset, "_Readability_and_CosineSimilarity_scores.csv")))
    # Cap the sample size at the table length: sample() raises when asked for
    # more rows than exist. For tables with >= 1000 rows the draw is unchanged.
    analysis_set = df.sample(min(1000, len(df)), random_state=11)

    # Dale-Chall difference vs cosine similarity
    _save_scatter(
        analysis_set, "DaleChallDiff", "Cosine_similarity", color,
        "Dale-Chall Index Pairwise difference",
        "Cosine Similarity",
        "Difference In Dale-Chall Index vs Cosine Similarity",
        "".join((subset, "_DaleChall_vs_CosineSimilarity.png")),
    )

    # Dale-Chall difference vs Flesch difference
    _save_scatter(
        analysis_set, "DaleChallDiff", "Flesch_diff", color,
        "Dale-Chall Pairwise difference",
        "Flesch Pairwise Difference",
        "Difference In Dale-Chall Index vs Difference in Flesch Reading Ease",
        "".join((subset, "_DaleChall_vs_Flesch.png")),
    )

    # Flesch difference vs cosine similarity
    _save_scatter(
        analysis_set, "Flesch_diff", "Cosine_similarity", color,
        "Flesch Pairwise difference",
        "Cosine Similarity",
        "Difference In Flesch Reading Ease vs Cosine Similarity",
        "".join((subset, "_Flesch_vs_CosineSimilarity.png")),
    )


In [None]:
# Summary statistics (readability): read each subset's readability CSV and
# label every row with the subset it was loaded from.

def _load_readability(subset):
    """Read one subset's readability CSV and add a "Subset" column naming it."""
    frame = pd.read_csv(subset + "_FoodData_Central_Readability.csv")
    frame["Subset"] = subset
    return frame


dataframes = [_load_readability(subset) for subset in ("Rice", "Cream", "Pasta-Dinners", "Random")]
print(dataframes)




In [None]:
#Summary Statistics- Pearson Correlations
import pandas as pd
from scipy.stats import pearsonr

def _as_float_pair(result):
    """Convert a pearsonr result into a plain (r, p-value) tuple of floats.

    pearsonr's return value unpacks as (statistic, pvalue). Storing the raw
    object in a DataFrame cell makes the text written to CSV depend on the
    installed SciPy/NumPy reprs; built-in floats always serialize as "(r, p)".
    """
    statistic, pvalue = result
    return (float(statistic), float(pvalue))


Pearson_list = []

# For every subset, correlate each pair of the three pairwise measures.
for i in ["Rice", "Cream", "Pasta-Dinners","Random"]:
    df = pd.read_csv("".join((i,"_Readability_and_CosineSimilarity_scores.csv")))
    DC_CS_pearson = pearsonr(df["DaleChallDiff"], df["Cosine_similarity"])
    FL_CS_pearson = pearsonr(df["Flesch_diff"], df["Cosine_similarity"])
    DC_FL_pearson = pearsonr(df["DaleChallDiff"], df["Flesch_diff"])
    # One row per subset; each correlation cell holds an (r, p-value) pair.
    Pearson_corrs = [
        i,
        _as_float_pair(DC_CS_pearson),
        _as_float_pair(FL_CS_pearson),
        _as_float_pair(DC_FL_pearson),
    ]
    Pearson_list.append(Pearson_corrs)
Pearson_df = pd.DataFrame(data = Pearson_list, columns = ["subset", "Pearson_DC_CS", "Pearson_FL_CS", "Pearson_DC_FL"])
Pearson_df.to_csv("Pearson_stats.csv")
print(Pearson_df)


In [None]:
# Stack the per-subset frames into a single table (ignore_index=True renumbers
# the rows) and report how many rows it holds.
combined_df = pd.concat(dataframes, ignore_index=True, sort=False)
print(len(combined_df))

In [None]:
# Descriptive statistics per subset for each readability index: print the
# table, then write it to its own CSV file.
for column, csv_name in (
    ("DaleChallIndex", "DaleChall_summarystats.csv"),
    ("FleschReadingEase", "Flesch_summarystats.csv"),
):
    summary_stats = combined_df.groupby("Subset")[column].describe()
    print(summary_stats)
    summary_stats.to_csv(csv_name)

# Readability Distance and Cosine Similarity Summary Stats

In [None]:
# Summary statistics (distance and similarity): read each subset's pairwise
# score CSV and label every row with the subset it was loaded from.
dataframes = []

for subset_name in ("Rice", "Cream", "Pasta-Dinners", "Random"):
    scores = pd.read_csv(subset_name + "_Readability_and_CosineSimilarity_scores.csv")
    scores["Subset"] = subset_name
    dataframes.append(scores)

print(dataframes)


In [None]:
# Stack the per-subset frames into a single table (ignore_index=True renumbers
# the rows) and report how many rows it holds.
combined_df = pd.concat(dataframes, ignore_index=True, sort=False)
print(len(combined_df))

In [None]:
# Descriptive statistics per subset for each pairwise measure. The mapping
# gives the CSV file each measure's table is written to, in output order.
summary_targets = {
    "DaleChallDiff": "DaleChallDiff_summarystats.csv",
    "Flesch_diff": "FleschDiff_summarystats.csv",
    "Cosine_similarity": "CosineSimilarity_summarystats.csv",
}

by_subset = combined_df.groupby("Subset")
for measure, csv_name in summary_targets.items():
    summary_stats = by_subset[measure].describe()
    print(summary_stats)
    summary_stats.to_csv(csv_name)

In [None]:
#Z-test- Random sample
from statsmodels.stats.weightstats import ztest as ztest
import pandas as pd
import sys

# Load the Food Data Central food products table and the readability scores
# of the random subset.
# NOTE(review): the means below read "FleschReadingEase" and "DaleChallIndex"
# from `dataset` -- confirm branded_food.csv carries those columns; an earlier
# revision of this cell loaded "FoodData_Central_Readability.csv" instead.
dataset = pd.read_csv("branded_food.csv", low_memory=False)
print(dataset.shape)
data_random = pd.read_csv("Random_FoodData_Central_Readability.csv", low_memory=False)
print(len(dataset))

# For each readability score, z-test the random subset's values against the
# mean of the same column in `dataset`. Both means are printed first
# (`dataset`, then the random subset), followed by the test result.
score_tests = (
    ("FleschReadingEase", "Flesch Reading Ease Z Test"),
    ("DaleChallIndex", "Dale Chall Index Z Test"),
)
for position, (column, heading) in enumerate(score_tests):
    if position:
        # Blank line separating this report from the previous one.
        print()

    zTest_data = data_random[column].values.tolist()

    mean = dataset[column].mean()
    mean2 = data_random[column].mean()

    print(mean)
    print(mean2)

    print(heading)
    print(ztest(zTest_data, value=mean))