# Food: a relevant marker of social inequalities?

In [8]:
# Imports
from urllib.request import urlopen
import pandas as pd
import json
import matplotlib.pyplot as plt
import numpy as np

import warnings
# Uncomment to hide Warnings
#warnings.filterwarnings('ignore')

import os
import pyspark
from pyspark.sql import *
from pyspark.sql.functions import *

## Study of OpenFoodFact

### Data cleaning

In [13]:
# Open Food Facts URL templates; "{}" receives the resource path or the query string.
OFF_API_PATH = "https://world.openfoodfacts.org/{}.json"
OFF_API_QUERY_PATH = (
    "https://world.openfoodfacts.org/cgi/search.pl?{}"
    "&page_size=1000&action=process&json=1"
)

def getOFFDataFrame(adress, path = OFF_API_PATH):
    """
    Query the Open Food Facts API and return the answer as a pandas DataFrame.

    adress: text substituted into the "{}" placeholder of path.
    path: URL template to query (defaults to OFF_API_PATH).

    Prints the "count" field of the JSON answer, then returns a DataFrame built
    from its "products" entry when present, otherwise from its "tags" entry.
    """
    # The context manager closes the HTTP response even if JSON decoding fails.
    with urlopen(path.format(adress)) as response:
        result = json.load(response)
    print("{} entries gathered".format(result["count"]))
    if "products" in result:
        return pd.DataFrame(result["products"])
    return pd.DataFrame(result["tags"])

def getOFFDataFrameDict(qdict):
    """
    Gather an OFF DataFrame from the search query described by the dictionary qdict.

    qdict: mapping of query parameter names to values. Keys and values are
        percent-encoded here, so they must be passed unencoded.

    Returns the DataFrame produced by getOFFDataFrame for OFF_API_QUERY_PATH.
    """
    # Local import: only urlopen is imported from urllib at the top of the file.
    from urllib.parse import urlencode

    # urlencode escapes spaces, "&", "=" and non-ASCII characters, which would
    # otherwise yield a malformed or misinterpreted URL.
    return getOFFDataFrame(urlencode(qdict), OFF_API_QUERY_PATH)

def getOFFDataFrameCat(tags=None, nutriments=None, **kwargs):
    """
    Query the OFF database with criteria on tags and nutriments.

    tags: iterable of [tag, value] or [tag, value, contains] entries, where tag is
        the category of tag (ex. "brands"), value the value to be searched
        (ex. "Carrefour") and contains a boolean (True when omitted) selecting
        "contains" or "does_not_contain".
    nutriments: iterable of [nutriment, comparison, value] entries, where nutriment
        is the name of the nutriment (ex. "magnesium"), comparison the nature of
        the comparison (ex. "lte") and value the value to compare with (ex. 50).
    kwargs: extra query parameters; they are added last and therefore override
        generated parameters with the same name.

    Returns the DataFrame produced by getOFFDataFrameDict for the built query.
    """
    qdict = {}
    for i, tagsEl in enumerate(tags or ()):
        if len(tagsEl) == 2:
            contains = True
        else:
            contains = tagsEl[2]
        qdict["tagtype_{}".format(i)] = tagsEl[0]
        qdict["tag_contains_{}".format(i)] = ["does_not_contain","contains"][int(contains)]
        qdict["tag_{}".format(i)] = tagsEl[1]
    # enumerate already provides the running index; no manual counter is needed.
    for i, nutrimentsEl in enumerate(nutriments or ()):
        qdict["nutriment_{}".format(i)] = nutrimentsEl[0]
        qdict["nutriment_compare_{}".format(i)] = nutrimentsEl[1]
        qdict["nutriment_value_{}".format(i)] = nutrimentsEl[2]
    qdict.update(kwargs)
    return getOFFDataFrameDict(qdict)


### Data formatting

### Data analysis and visualization

## Study of INCA 2

### Data cleaning

In [7]:
# Data
import platform

# The encoding handed to pandas is chosen from the host operating system;
# any system other than Windows or Mac is rejected.
plat = platform.system()
print(plat)
encoding = {
    "Windows": "ansi", # Windows
    "Darwin": "latin", # Mac
}.get(plat)
if encoding is None:
    raise ValueError("unknown os: {}".format(plat))

DATA_PATH = "data/{}.csv"


def _readTable(name):
    """Load the ";"-separated file DATA_PATH.format(name) with the selected encoding."""
    return pd.read_csv(DATA_PATH.format(name), sep=";", encoding=encoding)


dfConso = _readTable("Table_conso")
dfIndiv = _readTable("Table_indiv")
dfCapiCA = _readTable("Table_capi_ca")
dfCarnetCA = _readTable("Table_carnet_ca_1")
dfIndivCA = _readTable("Table_indiv_ca")
dfIndnut = _readTable("Table_indnut")
dfMenage = _readTable("Table_menage_1")
dfRepas = _readTable("Table_repas")
dfNomenclature = _readTable("Nomenclature_3")
dfCorrespondance = _readTable("correspondance_reponses")
dfDataNames = _readTable("Data_names_all")

Windows


### Data formatting

In [None]:
def findCorrespondance(val, column):
    """
    Return the meaning of the coded value val for the variable column.

    Looks up the row of dfCorrespondance whose "Nom de la variable" equals column
    and whose "code" equals val, and returns its "Signification". When no row
    matches, val itself is returned unchanged.
    """
    # The lookup table is loaded under the name dfCorrespondance; the name
    # dfDataCorrespondance is not defined in the visible part of the file.
    # NOTE(review): assumes dfCorrespondance holds the three columns used below - confirm.
    matches = dfCorrespondance[
        (dfCorrespondance["Nom de la variable"] == column)
        & (dfCorrespondance["code"] == val)
    ]
    if matches.empty:
        return val
    return matches["Signification"].values[0]

def mapCorrespondances(vals, column):
    """
    Translate every coded value of vals for the variable column.

    Returns a list holding findCorrespondance(val, column) for each val, in order.
    """
    meanings = []
    for code in vals:
        meanings.append(findCorrespondance(code, column))
    return meanings

def findDescription(column):
    """
    Return the label describing the variable column.

    Reads "Libellé de la variable" from the first row of dfDataNames whose
    "Nom de la variable" equals column; an IndexError is raised when there is none.
    """
    isColumn = dfDataNames["Nom de la variable"] == column
    matches = dfDataNames[isColumn]
    labels = matches["Libellé de la variable"].values
    return labels[0]

def mapDescription(columns):
    """
    Return the list of labels describing each variable of columns, in order.
    """
    descriptions = []
    for name in columns:
        descriptions.append(findDescription(name))
    return descriptions

### Data analysis and visualization

#### BMI study

#### Interest in food

#### Displaying functions

In [9]:
def plotBar(table, x, y, index=None):
    """
    Plot a stacked normalized bar plot from table.

    table: DataFrame holding the columns x, y and, when given, index.
    x: column whose values give one bar each.
    y: column whose values are stacked inside each bar.
    index: column whose non-null values are counted per (x, y) pair. When None
        (or equal to x or y) the rows of each pair are counted instead.

    Every bar is normalized to sum to 1. Codes of x and y are replaced by their
    meaning through mapCorrespondances before plotting.
    """
    grouped = table.groupby([x, y])
    if index is None or index in (x, y):
        # A grouping key cannot be selected again as the counted column,
        # so count the rows of each group.
        counts = grouped.size().unstack(level=1)
    else:
        counts = grouped[index].count().unstack(level=1)
    # Divide every row (one x value) by its total so each bar sums to 1.
    shares = counts.div(counts.sum(axis=1), axis=0)
    shares.index = mapCorrespondances(shares.index, x)
    shares.columns = mapCorrespondances(shares.columns, y)
    shares.plot.bar(stacked=True)
    plt.show()

def plotScatter(table, col1, col2):
    """
    Draw a scatter plot of column col2 (vertical axis) against column col1
    (horizontal axis) of table and show it.
    """
    # Transposing the two-column array yields one row per column.
    xs, ys = table[[col1, col2]].values.T
    plt.scatter(xs, ys)
    plt.show()
    
def getBinsFunc(x, nbins):
    """
    Create a function mapping values to the lower edge of the bin containing them.

    The span of x, widened by 1 % so that the maximum of x falls inside the last
    bin, is cut into nbins bins of equal width starting at the minimum of x.

    x: values defining the range to cut.
    nbins: number of bins.

    Returns a function v -> lower edge of the bin of v (element-wise on arrays).
    """
    xmin = np.min(x)
    # Pad the span itself rather than the maximum, so the padding also works
    # when the maximum is zero or negative.
    span = (np.max(x) - xmin) * 1.01
    if span == 0:
        # Constant data: any positive span puts every value in the first bin.
        span = 1.0
    width = span / nbins
    # Bin k starts at xmin + k * width; floor (unlike an int16 cast) neither
    # overflows nor turns NaN into an arbitrary integer.
    return lambda v: np.floor((v - xmin) / width) * width + xmin
    
def plotScatterCateX(table, x, y, index, nbins):
    """
    Plot a stacked normalized bar plot from table after binning column y.

    table: DataFrame holding the columns x, y and index; it is left unmodified.
    x: column whose values give one bar each.
    y: numerical column, mapped into nbins bins with getBinsFunc before plotting.
    index: column counted per (x, binned y) pair.
    nbins: number of bins used for y.
    """
    # Work on a copy so the assignment below neither alters the caller's table
    # nor writes into a temporary slice.
    consos = table[[x, y, index]].copy()
    f = getBinsFunc(consos[y], nbins)
    consos[y] = f(consos[y])
    plotBar(consos, x, y, index)

def plotMeanValues(table, features, className, mincount=50, figsize=None):
    """
    Plot a normalized stacked bar chart of the mean value of each feature per class.

    table: DataFrame holding the columns listed in features and the column className.
    features: list of column names whose means are plotted (one bar each).
    className: column defining the classes (stacked inside each bar).
    mincount: only classes with strictly more than mincount rows are kept.
    figsize: figure size forwarded to the pandas plot call; None keeps the default.

    For every feature the class means are divided by their sum, so each bar
    sums to 1. Feature names and class codes are replaced by their description
    through mapDescription and mapCorrespondances.
    """
    featuresNames = mapDescription(features)

    bigClasses = table.groupby(className).filter(lambda group: len(group) > mincount)
    # Select the features before averaging so unrelated (possibly non-numeric)
    # columns are never averaged.
    means = bigClasses.groupby(className)[features].mean()
    means.columns = featuresNames
    means /= means.sum(axis=0)
    # One row per feature, one column per class.
    means = means.T
    means.columns = mapCorrespondances(means.columns, className)
    means.plot.bar(stacked=True, figsize=figsize)
    plt.show()