In [1]:
# Libraries
import requests
from bs4 import BeautifulSoup
import html
import lxml

import re
import numpy as np
import pandas as pd

import sys, os

# Helpers
abspath = os.path.abspath
dirname = os.path.dirname
sep = os.sep

# Update sys.path for in-house libraries.
# The directory two levels above the current working directory is the one
# added; it is expected to contain the `src` package imported below
# (NOTE(review): this depends on where the kernel was started -- confirm).
folder_ = abspath(os.getcwd())
for i in range(2):
    folder_ = dirname(folder_)

# Guard the append so that re-running this cell does not pile up duplicate
# entries in sys.path.
if folder_ not in sys.path:
    sys.path.append(folder_)

# In-house libraries
import src.utils.mining_data_tb as md
import src.utils.folder_tb as fo

In [2]:
# Page holding the daily intake table (id "tbl-calc") that is scraped below.
# NOTE(review): the query string embeds a session id and a token -- confirm the
# link is stable and fine to keep in version control.
url = "https://www.eatforhealth.gov.au/node/1813927/done?sid=806762&token=2544c5968ac9116d0ee0e2af1aa91200"

# A timeout keeps the kernel from hanging on an unresponsive server, and
# raise_for_status stops here on an HTTP error instead of parsing an error page.
r = requests.get(url, timeout = 30)
r.raise_for_status()
soup = BeautifulSoup(r.text, "lxml")

In [5]:
# di -> daily intake

# Locate the daily intake table and collect every row in it
di_table = soup.find(id = "tbl-calc")
di_rows = di_table.find_all("tr")

# Cells of each row, evaluated lazily; rows with fewer than two cells
# are filtered out when the dictionary is built
cells_per_row = (tr.find_all("td") for tr in di_rows)

# First cell text -> second cell text
di_dict = {
    cells[0].text: cells[1].text
    for cells in cells_per_row
    if len(cells) > 1
}

di_dict

{'Protein': '64 g/day*',
 'Fluids(Including plain water, milk and other drinks)': '2.3 L/day**',
 'Fibre': '30 g/day**',
 'Vitamin A': '900 μg/day of retinol equivalents',
 'Thiamin': '1.2 mg/day*',
 'Riboflavin': '1.3 mg/day*',
 'Niacin': '16 mg/day of niacin equivalents',
 'Vitamin B6': '1.3 mg/day*',
 'Vitamin B12': '2.4 μg/day*',
 'Folate': '400 μg/day as dietary folate equivalents',
 'Vitamin C': '45 mg/day*',
 'Calcium': '1000 mg/day*',
 'Iodine': '150 μg/day*',
 'Iron': '8 mg/day*',
 'Magnesium': '400 mg/day*',
 'Potassium': '3800 mg/day*',
 'Sodium': '460-920 mg/day*',
 'Zinc': '14 mg/day*'}

In [24]:
# For testing purposes: build a two-row frame out of the same scraped Series
# and run md.num_cleaning over every cell
s = pd.Series(di_dict)
df = pd.DataFrame([s] * 2, index = [0, 1])
df.applymap(md.num_cleaning)

Unnamed: 0,Protein,"Fluids(Including plain water, milk and other drinks)",Fibre,Vitamin A,Thiamin,Riboflavin,Niacin,Vitamin B6,Vitamin B12,Folate,Vitamin C,Calcium,Iodine,Iron,Magnesium,Potassium,Sodium,Zinc
0,64,2.3,30,900,1.2,1.3,16,1.3,2.4,400,45,1000,150,8,400,3800,460,14
1,64,2.3,30,900,1.2,1.3,16,1.3,2.4,400,45,1000,150,8,400,3800,460,14


In [41]:
class daily_intake:
    '''
    Daily intake values for one gender / age combination.

    The url for the combination is looked up in a dataframe of links, the page
    behind it is downloaded, and the table with id "tbl-calc" is turned into a
    cleaned pandas Series.

    attributes :
    gender -> gender given at construction, lower-cased
    age -> age given at construction
    url -> url selected by get_data (None until then)
    data -> cleaned pandas Series produced by get_data (None until then)
    '''

    # Final index labels, applied by position once the "Iodine" entry has been dropped
    _INDEX_LABELS = ["Protein (g)", "Water (g)", "Fiber, total dietary (g)", "Vitamin A, RAE (mcg_RAE)", "Thiamin (mg)", "Riboflavin (mg)", "Niacin (mg)", "Vitamin B-6 (mg)", "Vitamin B-12 (mcg)", "Folate, total (mcg)", "Vitamin C (mg)", "Calcium (mg)", "Iron (mg)", "Magnesium (mg)", "Potassium (mg)", "Sodium (mg)", "Zinc (mg)"]

    def __init__(self, gender, age):
        self.gender = gender.lower()
        self.age = age
        self.url = None
        self.data = None

    def __data_selection(self, df):
        '''
        Looks up, in the dataframe of links, the url whose "gender" and "age" match this object and stores it in self.url. If several rows match, the first one is used.

        args :
        df -> dataframe with the columns "gender", "age" and "url"

        raises :
        IndexError -> no row matches self.gender and self.age
        '''
        matches = df.loc[(df["gender"] == self.gender) & (df["age"] == self.age), "url"]

        # Same exception type as indexing an empty array, but with a message that says what was searched
        if matches.empty:
            raise IndexError("No daily intake url found for gender={!r}, age={!r}".format(self.gender, self.age))

        self.url = matches.values[0]

    def __clean_data(self, s):
        '''
        Cleans the scraped Series and stores the result in self.data.

        args :
        s -> pandas Series with the raw values scraped from the table
        '''
        # Clear number formats (presumably turns the scraped text into numbers -- see md.mapper)
        cleaned = md.mapper(s)

        # Drop unnecessary column
        cleaned = cleaned.drop("Iodine")

        # Rename Series object
        cleaned.name = "daily_intake"

        # Rename index (by position, so the scraped rows must keep their order)
        cleaned.index = self._INDEX_LABELS

        # Transform liter values to gram (for consistency purposes)
        cleaned["Water (g)"] = md.liter_to_gram(cleaned["Water (g)"])

        # Assign only at the end, so a failure above never leaves half-cleaned data behind
        self.data = cleaned

    @staticmethod
    def _parse_table(page_text):
        '''
        Builds a {first cell text : second cell text} dictionary out of the rows of the table with id "tbl-calc". Rows with fewer than two cells are skipped.

        args :
        page_text -> html of the page, as text

        raises :
        ValueError -> the page has no element with id "tbl-calc"
        '''
        soup = BeautifulSoup(page_text, "lxml")

        di_table = soup.find(id = "tbl-calc")
        if di_table is None:
            raise ValueError('No element with id "tbl-calc" found in the page')

        di_dict = {}

        for row in di_table.find_all("tr"):
            items = row.find_all("td")
            if len(items) > 1:
                di_dict[items[0].text] = items[1].text

        return di_dict

    def get_data(self, df, timeout = 30):
        '''
        Selects the url matching this object's gender and age, downloads the page and pulls the daily intake data from it. It returns a pandas Series (also stored in self.data).

        args :
        df -> dataframe with the columns "gender", "age" and "url"
        timeout -> seconds to wait for the server before giving up (default 30)

        raises :
        IndexError -> no url for this gender / age in df
        requests.HTTPError -> the server answered with an error status
        ValueError -> the page does not contain the expected table
        '''
        self.__data_selection(df)

        # Without a timeout the request can block forever; without the status check an error page would be parsed
        r = requests.get(self.url, timeout = timeout)
        r.raise_for_status()

        s = pd.Series(self._parse_table(r.text))
        self.__clean_data(s)

        return self.data

In [2]:
# Everything together (after moving functions to the modules)

# Folder with the environment data, then the csv that stores the urls
environment_data_path = fo.path_to_folder(2, sep.join(["data", "environment"]))
daily_intake_df = pd.read_csv(environment_data_path + "daily_intakes.csv")

# Daily intake for a 30 year old woman, using the class from the module
w_30 = md.daily_intake("female", 30)
w_30.get_data(daily_intake_df)

Protein (g)                   46.0
Water (g)                   2100.0
Fiber, total dietary (g)      25.0
Vitamin A, RAE (mcg_RAE)     700.0
Thiamin (mg)                   1.1
Riboflavin (mg)                1.1
Niacin (mg)                   14.0
Vitamin B-6 (mg)               1.3
Vitamin B-12 (mcg)             2.4
Folate, total (mcg)          400.0
Vitamin C (mg)                45.0
Calcium (mg)                1000.0
Iron (mg)                     18.0
Magnesium (mg)               310.0
Potassium (mg)              2800.0
Sodium (mg)                  460.0
Zinc (mg)                      8.0
Name: daily_intake, dtype: float64