In [1]:
import pandas as pd
from pandas import json_normalize

import requests 
import json
import os
from dotenv import load_dotenv 
from bs4 import BeautifulSoup
import re
import time

# pd.options.plotting.backend = "plotly"
import plotly.express as px 
import plotly.graph_objects as go

load_dotenv()

True

In [5]:
import os
key_api = os.environ.get("key")
host_api = os.environ.get("host")



# Web scraping for minimum wage in Turkey throughout the years


In [2]:
# The objective is to get a table showing the minimum wage over the years in Turkey.

url = 'https://countryeconomy.com/national-minimum-wage/turkey'
res = requests.get(url)
html = res.content
soup = BeautifulSoup(html, "html.parser")

In [3]:
# Targetting the date column.

y = soup.select("td.fecha")
y[0].getText().strip()

year = [i.getText() for i in y]
year[:3]

['June 2022', 'January 2022', 'January 2021']

In [4]:
# Targetting the wage column.

mw = soup.find_all("td", attrs={"class":"numero"})
mw[0].getText().strip()

min_wage = [i.getText() for i in mw]
min_wage = [i for i in min_wage if '$' not in i and '€' not in i]

min_wage[:3]

['6,471.0', '5,004.0', '3,577.5']

In [5]:
# Targetting the wage column (in US dollars).

mwu = soup.select("td.numero.dol")
mwu[0].getText().strip()

min_wage_usd = [i.getText() for i in mwu]
min_wage_usd[:3]

['$394.5', '$375.1', '$404.2']

In [6]:
# Cleaning the dataframe and transforming the wage columns into float.

turkey = {
    'year': year,
    'min_wage': min_wage,
    'min_wage_dollar': min_wage_usd
}

turkey_wages = pd.DataFrame(turkey)
turkey_wages = turkey_wages[turkey_wages['year'].str.contains('June')==False]


turkey_wages = turkey_wages.replace('January ', "", regex = True).replace(',', '', regex = True).replace('\$', '', regex=True)
turkey_wages = turkey_wages.astype({'min_wage' : float, 'min_wage_dollar' : float, 'year': int})

turkey_wages.sample(3)


Unnamed: 0,year,min_wage,min_wage_dollar
17,2011,796.5,499.7
11,2014,1071.0,518.2
15,2012,886.5,523.7


# Cleaning the price variation dataset and merging the two dataframes

In [7]:
# Reading the dataset used for this anylisis.
# "global_prices" is a dataset displaying the price variation for different products in 76 countries from 2002 to 2021.

global_prices = pd.read_csv(".\Data\global_food_prices.csv", encoding='unicode_escape')
global_prices.sample(5)

  global_prices = pd.read_csv(".\Data\global_food_prices.csv", encoding='unicode_escape')


Unnamed: 0,adm0_id,adm0_name,adm1_id,adm1_name,mkt_id,mkt_name,cm_id,cm_name,cur_id,cur_name,pt_id,pt_name,um_id,um_name,mp_month,mp_year,mp_price,mp_commoditysource
1000147,145.0,Libya,1843,,2327,Zliten,112,Pasta - Retail,0.0,LYD,15,Retail,28,500 G,4,2018,1.3,
1230374,175.0,Nepal,2155,Mid Western,650,Jumla,60,Rice (coarse) - Retail,0.0,NPR,15,Retail,5,KG,2,2005,45.0,
1101958,155.0,Mali,1932,Segou,900,San,71,Rice (local) - Retail,0.0,XOF,15,Retail,5,KG,4,2009,335.38,
454141,66.0,Cote d'Ivoire,16838,Yamoussoukro,834,Adjame,70,"Rice (denikassia, imported) - Retail",0.0,XOF,15,Retail,5,KG,7,2012,350.0,
1540628,205.0,Rwanda,21972,South/Amajyepfo,1072,Kayenzi,453,Livestock (hen) - Retail,0.0,RWF,15,Retail,33,Unit,5,2014,4091.0,


In [8]:
# Creating a subset of the price variation that only takes into consideration Turkey.

prices_turkey = global_prices.loc[global_prices['adm0_name'] == "Turkey"]


In [9]:
# Since this analysis is focusing on the consumer, the prices for wholesale are not relevant.

prices_turkey = prices_turkey[prices_turkey['pt_name'].str.contains('Wholesale')==False]

prices_turkey = prices_turkey.replace(' - Retail', "", regex=True)


In [10]:

# Cleaning the table by dropping columns that are not relevant for the analysis and renaming the other ones 
# for readability and clarity.

prices_turkey.drop(['cur_id', 'cur_name', 'pt_id', 'um_id', 'mp_commoditysource', 'adm1_name'], axis = 'columns', inplace=True)

prices_turkey.drop(['cm_id', 'adm0_id', 'adm1_id', 'mkt_id', 'mkt_name', 'pt_name'], axis = 'columns', inplace=True)

prices_turkey.columns = prices_turkey.columns.str.replace('mp_', "")

prices_turkey.sample(5)



Unnamed: 0,adm0_name,cm_name,um_name,month,year,price
1873338,Turkey,Bananas,KG,4,2017,8.6899
1876271,Turkey,Fish (fresh),KG,4,2018,20.8064
1875021,Turkey,Pasta,KG,10,2018,3.8346
1870455,Turkey,Bread (common),KG,11,2017,3.7938
1874194,Turkey,Zucchini,KG,2,2017,6.0664


In [11]:
# Merging both dataframes (min. wage variation and food price variation) by the year and creating a new one called products_wage_turkey that has information
# about the price variation of certain goods as well as the variation of the minimum wage in Turkey.

products_wage_turkey = prices_turkey.merge(turkey_wages, how='inner', on='year')
products_wage_turkey.sample(3)


Unnamed: 0,adm0_name,cm_name,um_name,month,year,price,min_wage,min_wage_dollar
3080,Turkey,Zucchini,KG,12,2017,3.8087,1777.5,487.2
8633,Turkey,Salt,KG,12,2020,3.3967,2943.0,419.9
5757,Turkey,Tea (herbal),Package,6,2019,5.3696,2558.4,450.9


In [12]:
# More cleaning for readability (renaming columns and rounding values).

products_wage_turkey.rename(
    columns =
    {'cm_name':'product', 'adm0_name':'country','um_name':'unit'},
    inplace = True
)

products_wage_turkey['price'] = products_wage_turkey['price'].round(2)


products_wage_turkey.sample(2)


Unnamed: 0,country,product,unit,month,year,price,min_wage,min_wage_dollar
9195,Turkey,Bananas,KG,2,2020,10.76,2943.0,419.9
4501,Turkey,Fuel (petrol-gasoline),L,3,2018,5.73,2029.5,420.3


In [13]:
# Filtering the products by price and just taking the highest price of each year (instead of a price per month)

products_wage_filtered = products_wage_turkey.loc[products_wage_turkey.groupby(['year', 'product'])['price'].idxmax()]
products_wage_filtered.sort_values(by = ['product', 'year']).sample(4)


Unnamed: 0,country,product,unit,month,year,price,min_wage,min_wage_dollar
7102,Turkey,Chickpeas,KG,1,2019,10.45,2558.4,450.9
569,Turkey,Garlic,KG,12,2015,15.33,1201.5,468.2
1511,Turkey,Cucumbers (greenhouse),KG,2,2017,4.57,1777.5,487.2
7287,Turkey,Eggplants,KG,3,2019,11.16,2558.4,450.9


In [14]:
products_wage_filtered['product'].unique()


array(['Apples (red)', 'Bananas', 'Beans (white)', 'Bread (common)',
       'Bulgur', 'Cabbage', 'Cauliflower', 'Chickpeas', 'Cocoa (powder)',
       'Coffee', 'Coffee (instant)', 'Cucumbers (greenhouse)', 'Eggs',
       'Fish (fresh)', 'Garlic', 'Groundnuts (shelled)', 'Lentils',
       'Meat (chicken)', 'Meat (mutton)', 'Meat (veal)',
       'Milk (pasteurized)', 'Oil (olive)', 'Oil (sunflower)', 'Onions',
       'Oranges', 'Pasta', 'Peas (green, dry)', 'Rice', 'Salt', 'Sugar',
       'Tea', 'Tea (green)', 'Tomatoes', 'Wheat flour', 'Bread (pita)',
       'Cheese', 'Eggplants', 'Fuel (gas)', 'Fuel (petrol-gasoline)',
       'Milk (powder, infant formula)', 'Potatoes', 'Spinach',
       'Tomatoes (paste)',
       'Wage (non-qualified labour, non-agricultural)', 'Yogurt',
       'Apples', 'Cucumbers', 'Electricity', 'Tea (herbal)',
       'Transport (public)', 'Water', 'Zucchini'], dtype=object)

In [15]:
# The food products are the focus of this analysis, so everything that is not food should be filtered out.

food_wage_filtered = products_wage_filtered[(products_wage_filtered['product'] != 'Wage (non-qualified labour, non-agricultural)') & 
                     (products_wage_filtered['product'] != 'Electricity') & (products_wage_filtered['product'] !='Fuel (gas)') &
                     (products_wage_filtered['product'] != 'Fuel (petrol-gasoline)') & (products_wage_filtered['product'] != 'Transport (public)')]
                                                                                        

food_wage_filtered['product'].unique()

array(['Apples (red)', 'Bananas', 'Beans (white)', 'Bread (common)',
       'Bulgur', 'Cabbage', 'Cauliflower', 'Chickpeas', 'Cocoa (powder)',
       'Coffee', 'Coffee (instant)', 'Cucumbers (greenhouse)', 'Eggs',
       'Fish (fresh)', 'Garlic', 'Groundnuts (shelled)', 'Lentils',
       'Meat (chicken)', 'Meat (mutton)', 'Meat (veal)',
       'Milk (pasteurized)', 'Oil (olive)', 'Oil (sunflower)', 'Onions',
       'Oranges', 'Pasta', 'Peas (green, dry)', 'Rice', 'Salt', 'Sugar',
       'Tea', 'Tea (green)', 'Tomatoes', 'Wheat flour', 'Bread (pita)',
       'Cheese', 'Eggplants', 'Milk (powder, infant formula)', 'Potatoes',
       'Spinach', 'Tomatoes (paste)', 'Yogurt', 'Apples', 'Cucumbers',
       'Tea (herbal)', 'Water', 'Zucchini'], dtype=object)

In [58]:
# food_wage_2 is a subset that takes into consideration how much of the minimum wage
# each food represents (in percentage) throughout the years.

food_wage_2 = food_wage_filtered.copy().drop(['country', 'month', 'min_wage_dollar'], axis=1).reset_index(drop=True)
food_wage_2['price_salary_pct'] = 100* food_wage_2['price'] / food_wage_2['min_wage'] 
food_wage_2.sample(4)


Unnamed: 0,product,unit,year,price,min_wage,price_salary_pct
295,Eggs,Unit,2020,1.1,2943.0,0.037377
156,Bread (pita),KG,2017,2.48,1777.5,0.139522
110,Apples (red),KG,2016,2.83,1647.0,0.171828
65,Tea (green),Unit,2014,4.09,1071.0,0.381886


In [17]:
# Creating a subset of the variation in minimum wage in Turkish lira (national currency) and dollars.
# I know this could be transformed into a function, I just couldn't manage 
# my time so as to do it.
 
wage_variation = pd.DataFrame()

wage_variation.insert(0,'year', food_wage_filtered['year'].unique(), True)
wage_variation.insert(1,'min_wage', food_wage_filtered['min_wage'].unique(), True)
wage_variation.insert(2,'min_wage_dollar', food_wage_filtered['min_wage_dollar'].unique(), True)
wage_variation.insert(3,'%_var_wage', round((wage_variation['min_wage'].pct_change())*100, 1), True)
wage_variation.insert(4,'%_var_wage_usd', round((wage_variation['min_wage_dollar'].pct_change())*100, 1), True)

wage_variation

Unnamed: 0,year,min_wage,min_wage_dollar,%_var_wage,%_var_wage_usd
0,2013,978.6,536.6,,
1,2014,1071.0,518.2,9.4,-3.4
2,2015,1201.5,468.2,12.2,-9.6
3,2016,1647.0,545.3,37.1,16.5
4,2017,1777.5,487.2,7.9,-10.7
5,2018,2029.5,420.3,14.2,-13.7
6,2019,2558.4,450.9,26.1,7.3
7,2020,2943.0,419.9,15.0,-6.9
8,2021,3577.5,404.2,21.6,-3.7


In [18]:
# Calculating the minimum wage variation between 2015 and 2021.

wage_variation_15_21 = round(100* ( wage_variation.loc[wage_variation.year==2021, 'min_wage'].values[0] \
                       - wage_variation.loc[wage_variation.year==2015, 'min_wage'].values[0]) \
                       / wage_variation.loc[wage_variation.year==2015, 'min_wage'].values[0] ,1)
wage_variation_15_21

197.8

In [19]:
# Calculating the price variation between 2015 and 2021

food_price_variation_2015 = food_wage_filtered[(food_wage_filtered['year'] == 2015)][["product", "price"]]
food_price_variation_2015.reset_index(drop=True).head(4)

food_price_variation_2021 = food_wage_filtered[food_wage_filtered['year'] == 2021][["product", "price"]]
food_price_variation_2021.reset_index(drop=True).head(4)


Unnamed: 0,product,price
0,Apples,6.29
1,Bananas,16.19
2,Beans (white),17.72
3,Bread (common),8.4


In [20]:
# Calculating the price variation between 2015 and 2021

food_price_variation  = food_price_variation_2015.merge(food_price_variation_2021, on="product")
food_price_variation["%_var_price"] = food_price_variation.apply(lambda x: round((x['price_y'] - x['price_x'])/x['price_x']*100, 1), axis=1)

food_price_variation.head(4)

Unnamed: 0,product,price_x,price_y,%_var_price
0,Bananas,5.88,16.19,175.3
1,Beans (white),7.66,17.72,131.3
2,Bread (common),3.14,8.4,167.5
3,Bulgur,2.73,6.35,132.6


In [21]:
# food_price_variation_top10 is a subset of the 10 products that most varied in price between 2015-2021

food_price_variation_top10 = food_price_variation.sort_values(by="%_var_price", ascending=False)[:10]
food_price_variation_top10.reset_index(drop=True)

Unnamed: 0,product,price_x,price_y,%_var_price
0,Fish (fresh),15.11,58.59,287.8
1,Eggplants,2.64,8.29,214.0
2,Oranges,2.51,7.38,194.0
3,Tomatoes,2.75,7.93,188.4
4,Meat (mutton),28.24,79.63,182.0
5,Bananas,5.88,16.19,175.3
6,Eggs,0.4,1.1,175.0
7,Garlic,15.33,41.02,167.6
8,Bread (common),3.14,8.4,167.5
9,Meat (chicken),7.53,19.5,159.0


In [22]:
# food_price_variation_bottom10 is a subset of the 10 products that least varied in price between 2015-2021

food_price_variation_bottom10 = food_price_variation.sort_values(by="%_var_price", ascending=True)[:10]
food_price_variation_bottom10.reset_index(drop=True)


Unnamed: 0,product,price_x,price_y,%_var_price
0,Oil (sunflower),22.74,20.47,-10.0
1,Onions,1.74,2.41,38.5
2,Sugar,4.03,6.14,52.4
3,Oil (olive),27.18,42.24,55.4
4,Tomatoes (paste),6.81,10.87,59.6
5,Potatoes,1.86,2.98,60.2
6,Coffee,35.74,63.81,78.5
7,Cauliflower,3.23,5.79,79.3
8,Spinach,2.88,5.17,79.5
9,Chickpeas,6.24,11.4,82.7


In [23]:
# Renaming the products to fit the API standard that asks for a 'raw' before vegetables and meats.

food_price_variation_top10_ = food_price_variation_top10.copy()
food_price_variation_top10_['product'] = food_price_variation_top10_['product'].astype(str) + ', raw'
food_price_variation_top10_ = food_price_variation_top10_.replace('Bread (common), raw', 'Bread').replace('Meat ', '', regex=True)

# Droping the price columns since they are not need for this subset.

food_price_variation_top10_ = food_price_variation_top10_.drop(['price_x', 'price_y'], axis = 'columns').reset_index(drop=True)
food_price_variation_top10_

Unnamed: 0,product,%_var_price
0,"Fish (fresh), raw",287.8
1,"Eggplants, raw",214.0
2,"Oranges, raw",194.0
3,"Tomatoes, raw",188.4
4,"(mutton), raw",182.0
5,"Bananas, raw",175.3
6,"Eggs, raw",175.0
7,"Garlic, raw",167.6
8,Bread,167.5
9,"(chicken), raw",159.0


In [24]:
# Creating a copy of the top10 dataframe in which the variation of min. wage is added.
# This will be useful to plot a graph comparing the two.

food_salary_variation_top = food_price_variation_top10_.copy()
food_salary_variation_top.loc[len(food_salary_variation_top)] = ['Min. Wage', wage_variation_15_21]
food_salary_variation_top

Unnamed: 0,product,%_var_price
0,"Fish (fresh), raw",287.8
1,"Eggplants, raw",214.0
2,"Oranges, raw",194.0
3,"Tomatoes, raw",188.4
4,"(mutton), raw",182.0
5,"Bananas, raw",175.3
6,"Eggs, raw",175.0
7,"Garlic, raw",167.6
8,Bread,167.5
9,"(chicken), raw",159.0


In [26]:
# Cleaning the dataframe. I know this could be transformed into a function, I just couldn't manage 
# my time in order to do it.

food_price_variation_bottom10_ = food_price_variation_bottom10.copy()
food_price_variation_bottom10_ = food_price_variation_bottom10_.sort_values(by="%_var_price", ascending=True)[:10]
food_price_variation_bottom10_ = food_price_variation_bottom10_.replace('\(', '', regex=True).replace('\)', "", regex=True)
food_price_variation_bottom10_ = food_price_variation_bottom10_.drop(['price_x', 'price_y'], axis = 'columns').reset_index(drop=True)
food_price_variation_bottom10_

Unnamed: 0,product,%_var_price
0,Oil sunflower,-10.0
1,Onions,38.5
2,Sugar,52.4
3,Oil olive,55.4
4,Tomatoes paste,59.6
5,Potatoes,60.2
6,Coffee,78.5
7,Cauliflower,79.3
8,Spinach,79.5
9,Chickpeas,82.7


In [27]:
# Creating a copy of the bottom10 dataframe in which the variation of min. wage is added.
# This will be useful to plot a graph comparing the two.

food_salary_variation_bottom = food_price_variation_bottom10_.copy()
food_salary_variation_bottom.loc[len(food_price_variation_bottom10_)] = ['Min. Wage', wage_variation_15_21]
food_salary_variation_bottom

Unnamed: 0,product,%_var_price
0,Oil sunflower,-10.0
1,Onions,38.5
2,Sugar,52.4
3,Oil olive,55.4
4,Tomatoes paste,59.6
5,Potatoes,60.2
6,Coffee,78.5
7,Cauliflower,79.3
8,Spinach,79.5
9,Chickpeas,82.7


# Using an API that checks the nutritional information of different foods

In [28]:
def api_codes(df):
    
    '''
    Function that gets the food ID code of the top 10 and bottom 10 foods (in terms of price variation)
    and returns a list of these codes so they can be used in the api_food_info
    in order to get the nutritional info.
    '''

    food_codes = []
    url = "https://food-nutrition-information.p.rapidapi.com/foods/search"

    for i in df['product']:

        querystring = {"query": i, "pageSize":"1","pageNumber":"1"}

        headers = {
            "X-RapidAPI-Key": key_api,
            "X-RapidAPI-Host": host_api
        }

        response = requests.request("GET", url, headers=headers, params=querystring)
        food = response.json()

        codes = json_normalize(food)

        food_codes.append(codes['foods'][0][0]['fdcId'])
    
    return food_codes


In [33]:
def api_food_info(food_codes):
    
    '''
    Function that gets the nutritional information for each food ID code found
    in the api_food_info function.
    '''

    food_info = dict()
    list_of_dfs = []
    foods_info_final = pd.DataFrame()


    for i in food_codes:

        food_info = dict()
        url = f"https://food-nutrition-information.p.rapidapi.com/food/{i}"

        headers = {
            "X-RapidAPI-Key": key_api,
            "X-RapidAPI-Host": host_api
        }

        response = requests.request("GET", url, headers=headers)
        info = response.json()

        # Variable 'a' will save the name of the food being registered using the key 'description' in the API dictionary
        a = str(json_normalize(info)['description'])
        a = a.replace('\nName: description, dtype: object', '').replace('0 ', '')

        # 'food_info' will save all the nutritional info for each food in 'food_codes'
        food_info = json_normalize(info)['foodNutrients'][0]
        food_info = pd.DataFrame(food_info)

        # Creating a new columns called 'food' to register the variable 'a'
        # The same name will be repeated throughout this df so when all the food_info df are concatenated we'll have a way
        # of knowing which food has each piece of info
        food_info = food_info.assign(food = a)

        # Getting only the name of the nutrient and its unit
        food_info['nutrients'] = food_info['nutrient'].apply(lambda x : x['name']) 
        food_info['unit'] = food_info['nutrient'].apply(lambda x : x['unitName'])

        # Concatenating the dataframes together
        list_of_dfs.append(food_info)
        foods_info_final = pd.concat(list_of_dfs)



    return foods_info_final

In [34]:
def food_info_df(df):
    
    '''
    Function that cleans the top 10 and bottom 10 dataframes.
    It drops irrelevant columns, filters the nutritional info. relevant to this analysis
    and changes the order of some columns for readability.
    '''

    df.drop(['nutrient', 'type', 'id', 'dataPoints', 'foodNutrientDerivation'], axis = 'columns', inplace=True)
    df.drop(['max', 'min', 'median', 'minYearAcquired', 'nutrientAnalysisDetails', 'loq'], axis = 'columns', inplace=True)

    food_info2 = df[df['nutrients'].isin(['Energy', 'Sugars, total including NLEA', 'Total lipid (fat)'])]

    food_info2.insert(3, 'value', food_info2.pop('amount'))
    food_info2.reset_index(drop=True)

    return food_info2

In [35]:
food_codes_bottom10 = api_codes(food_price_variation_bottom10_)
foods_info_bottom10 = api_food_info(food_codes_bottom10)


In [36]:
food_codes_top10 = api_codes(food_price_variation_top10_)
foods_info_top10 = api_food_info(food_codes_top10)

In [37]:
info_top10 = food_info_df(foods_info_top10)
info_bottom10 = food_info_df(foods_info_bottom10)


In [59]:
info_bottom10

Unnamed: 0,food,nutrients,unit,value,variation
2,"Onions, raw",Energy,kcal,40.0,Bottom
4,"Onions, raw",Total lipid (fat),g,0.1,Bottom
8,"Onions, raw","Sugars, total including NLEA",g,4.24,Bottom
1,SUGAR,Total lipid (fat),g,0.0,Bottom
2,SUGAR,"Sugars, total including NLEA",g,100.0,Bottom
5,SUGAR,Energy,kcal,375.0,Bottom
2,Olive oil,Energy,kcal,884.0,Bottom
4,Olive oil,Total lipid (fat),g,100.0,Bottom
8,Olive oil,"Sugars, total including NLEA",g,0.0,Bottom
1,TOMATOES PASTE,Total lipid (fat),g,0.2,Bottom


In [38]:
info_top10.to_csv("top10foods_nutri_info.csv", index=False)
info_bottom10.to_csv("bottom10foods_nutri_info.csv", index=False)

# Visualizations

## Graph 1: How much did the top 10 and bottom 10 foods varied in price compared with the minimum wage variation?

In order to plot these graphs:  

- I used the subset 'food_salary_variation_top' which takes the 10 products that most varied in price between 2015 and 2021, their percentual variation and the variation of minimum wage in the same period (a single value).  

- I used the subset 'food_salary_variation_bottom' that takes the 10 products that least varied in price between 2015 and 2021, their percentual variation and the variation of minimum wage in the same period (a single value).

In [39]:
if not os.path.exists("images"):
    os.mkdir("images")

In [55]:
fig = px.bar(food_salary_variation_top, x='%_var_price', y='product', color = 'product', title = 'Top 10: food price variation x min. wage variation')
fig.write_image("images/fig1.jpeg")
fig.show() 

In [56]:
fig5 = px.bar(food_salary_variation_bottom, x='%_var_price', y='product', color = 'product', title = 'Bottom 10: food price variation x min. wage variation')
fig5.write_image("images/fig5.jpeg")
fig5.show() 

**Observations:**

- Only 2 products had a higher percentual variation more than the minimum wage: eggplants and fish. These are two foods usually considered to be healthy.

- Among the products that least varied in price are sunflower oil (with a negative variation) and sugar, both normally seen as being harmful to our health (if eaten in excess).

## Graph 3:  What's the relationship between food price variation and minimum wage variation like?

In order to plot this graph:

- I used the 'food_wage_2' subset that takes into consideration how much of the minimum wage each food represents (in percentage) throughout the years.

- In opposition to the previous graphs, only the 6 top and bottom products were plotted. That is because otherwise it would have had too much noise and made it difficult to read.

In [57]:
# Df will be used to plot consumer's purchasing power related to the top 6 and bottom 6 foods.
#

food_wage_4 = food_wage_2[food_wage_2['product'].isin((food_price_variation_bottom10['product'][:6]).tolist())]

food_wage_5 = food_wage_2[food_wage_2['product'].isin((food_price_variation_top10['product'][:6]).tolist())]


fig2 = px.line(food_wage_5, x='year', y='price_salary_pct', color='product', title = 'Purchasing power variation: foods w/ biggest increase')
fig2.write_image("images/fig2.jpeg")
fig2.show()

fig3 = px.line(food_wage_4, x='year', y='price_salary_pct', color='product', title = 'Purchasing power variation: foods w/ smallest increase')
fig3.write_image("images/fig3.jpeg")
fig3.show()


The behavior of `series[i:j]` with an integer-dtype index is deprecated. In a future version, this will be treated as *label-based* indexing, consistent with e.g. `series[i]` lookups. To retain the old behavior, use `series.iloc[i:j]`. To get the future behavior, use `series.loc[i:j]`.


The behavior of `series[i:j]` with an integer-dtype index is deprecated. In a future version, this will be treated as *label-based* indexing, consistent with e.g. `series[i]` lookups. To retain the old behavior, use `series.iloc[i:j]`. To get the future behavior, use `series.loc[i:j]`.



**Observations:**

- Meat and fish had large variations of price and also represent a significant portion of the minimum wage.

- Although sunflower oil had the smallest price variation of all the products, it represents a large part of the minimum wage when compared to the rest.

## Graph 5: Do less caloric foods vary more or less in price?

In order to plot this graph: 

- First I assigned 'labels' ('top' and 'bottom') to differentiate on the graph the foods that varied more and less in price between 2015 and 2021;

- Then, I concatenated the two dataframes that had the nutritional information of the top 10 and bottom 10 foods (in terms of price variation). For the purposes of this analysis, I only took into consideration the calories per serving (assumes: 100g) of each food. A more thorough analysis would also take into consideration other nutritional aspects;  

- Finally, I plotted the bar graph that displays the calories of each food and if it is part of the top 10 or bottom 10 group.


In [43]:
info_top10 = info_top10.assign(variation = 'Top')
info_bottom10 = info_bottom10.assign(variation = 'Bottom')

In [48]:
info_bottom10.sample(4)

Unnamed: 0,food,nutrients,unit,value,variation
1,TOMATOES PASTE,Total lipid (fat),g,0.2,Bottom
12,POTATOES,"Sugars, total including NLEA",g,0.68,Bottom
4,TOMATOES PASTE,"Sugars, total including NLEA",g,11.0,Bottom
9,POTATOES,Energy,kcal,74.0,Bottom


In [49]:
joined_info_food = pd.concat([info_top10, info_bottom10])
joined_info_food.sample(4)

Unnamed: 0,food,nutrients,unit,value,variation
4,"Orange, raw",Total lipid (fat),g,0.12,Top
3,"Bananas, raw",Energy,kJ,371.0,Top
2,BREAD,Total lipid (fat),g,7.06,Top
5,SUGAR,Energy,kcal,375.0,Bottom


In [46]:
joined_info_food_2 = joined_info_food.loc[joined_info_food['unit'] == "kcal"].drop(['nutrients', 'unit'], axis='columns').rename(columns = {'value':'kcal'})

joined_info_food_2 = joined_info_food_2.sort_values(by = 'kcal')

In [47]:
fig4 = px.bar(joined_info_food_2, x='kcal', y='food', color='variation')
fig4.write_image("images/fig4.jpeg")
fig4.show()


**Observations:**

- The relationship between calories and variation in price is not conclusive. Some less caloric foods (usually read as 'healthy', such as vegetables and fish) seem to variate more in price than the more caloric ones (such as oil and sugar). This could implicate in a difficulty of access to healthier foods in lower classes. However, this correlation is not true for all foods and, therefore, it is not possible to conclude that it exists.  

- A better way to visualize this relationship would be by comparing the calories and the price variation on the same graph.