In [1]:
import requests
from bs4 import BeautifulSoup

处理网站返回的数据，把string转换成float

In [2]:
import re

def process_dict(input_dict):
    result = {}
    for key in input_dict:
        value_list = input_dict[key]
        for string in value_list:
            cleaned_string_list = re.findall(r'\d+(?:\.\d+)?', string)
            float_values = [float(num) for num in cleaned_string_list if num]
            result[key] = float_values
    return result

In [3]:
def process_dict_b(input_dict):
    result = {}
    for key in input_dict:
        value_list = input_dict[key]
        float_values = []
        for string in value_list:
            cleaned_string_list = re.sub(r'[^0-9.]+', '', string)
            if cleaned_string_list:
                float_values.append(cleaned_string_list)
            else:
                float_values.append(None)
        result[key] = float_values
    return result

从html里提取表格数据

In [4]:
def extra_table(soup,id,number):
    # Find the table with id 'macronutrients-table'
    table = soup.find('table', {'id': id})
    temp = {}
    # Find the table rows in the table body
    rows = table.tbody.find_all('tr')

    # Iterate over the rows and extract the macronutrient data
    # Iterate over the rows and extract the macronutrient data
    for row in rows:
        cells = row.find_all('td')
        if len(cells) == number:
            temp[cells[0].text.strip()] = [cells[i].text.strip() for i in range(1,number)]
    return temp

In [5]:
def pull(sex,age,feet,inches,pounds,activity_level):
    url = "https://www.nal.usda.gov/human-nutrition-and-food-safety/dri-calculator"

    # 构造POST请求参数
    payload = {
        "measurement_units": "std",
        "sex": sex,
        "age_value": age,
        "age_type": "yrs",
        "feet": feet,
        "inches": inches,
        "cm": None,
        "pounds": pounds,
        "kilos": None,
        "activity_level": activity_level,
        "op": "Submit",
        "form_build_id": None,
        "form_id": "dri_calculator_form"
    }


    # 发送POST请求
    response = requests.post(url, data=payload)

    html_content = response.content
    soup = BeautifulSoup(html_content, 'html.parser')

    return extra_table(soup,"macronutrients-table",2),extra_table(soup, "vitamins-table", 3),extra_table(soup, "minerals-table", 3)

In [6]:
a,b,c = pull("male",30,5,7,200,"Active")

In [7]:
a

{'Carbohydrate': ['374 - 541 grams  More Information About Carbohydrate'],
 'Total Fiber': ['38 grams'],
 'Protein': ['73 grams'],
 'Fat': ['74 - 129 grams  More Information About Fat'],
 'Saturated fatty acids': ['As low as possible while consuming a nutritionally adequate diet.'],
 'Trans fatty acids': ['As low as possible while consuming a nutritionally adequate diet.'],
 'α-Linolenic Acid': ['1.6 grams  More Information About Alpha_linolenic_acid'],
 'Linoleic Acid': ['17 grams  More Information About Linoleic_acid'],
 'Dietary Cholesterol': ['As low as possible while consuming a nutritionally adequate diet.'],
 'Total Water': ['3.7 liters  (about 16 cups) More Information About Water']}

In [8]:
a
a['Carbs (g)'] = a.pop('Carbohydrate')
a['Fiber (g)'] = a.pop('Total Fiber')

a['Protein (g)'] = a.pop('Protein')
a['Fat (g)'] = a.pop('Fat')
a['Trans-Fats (g)'] = a.pop('Trans fatty acids')
a['Cholesterol (mg)'] = a.pop('Dietary Cholesterol')
a['water'] = a.pop('Total Water')
a

{'Saturated fatty acids': ['As low as possible while consuming a nutritionally adequate diet.'],
 'α-Linolenic Acid': ['1.6 grams  More Information About Alpha_linolenic_acid'],
 'Linoleic Acid': ['17 grams  More Information About Linoleic_acid'],
 'Carbs (g)': ['374 - 541 grams  More Information About Carbohydrate'],
 'Fiber (g)': ['38 grams'],
 'Protein (g)': ['73 grams'],
 'Fat (g)': ['74 - 129 grams  More Information About Fat'],
 'Trans-Fats (g)': ['As low as possible while consuming a nutritionally adequate diet.'],
 'Cholesterol (mg)': ['As low as possible while consuming a nutritionally adequate diet.'],
 'water': ['3.7 liters  (about 16 cups) More Information About Water']}

In [9]:
a = process_dict(a)
a

{'Saturated fatty acids': [],
 'α-Linolenic Acid': [1.6],
 'Linoleic Acid': [17.0],
 'Carbs (g)': [374.0, 541.0],
 'Fiber (g)': [38.0],
 'Protein (g)': [73.0],
 'Fat (g)': [74.0, 129.0],
 'Trans-Fats (g)': [],
 'Cholesterol (mg)': [],
 'water': [3.7, 16.0]}

In [10]:
b

{'Vitamin A': ['900 mcg', '3,000 mcg More Information About Vitamin_a'],
 'Vitamin C': ['90 mg', '2,000 mg'],
 'Vitamin D': ['15 mcg', '100 mcg'],
 'Vitamin B6': ['1.3 mg', '100 mg'],
 'Vitamin E': ['15 mg', '1,000 mg More Information About Vitamin_e'],
 'Vitamin K': ['120 mcg', '0 mcg'],
 'Thiamin': ['1.2 mg', '0 mg'],
 'Vitamin B12': ['2.4 mcg', '0 mcg'],
 'Riboflavin': ['1.3 mg', '0 mg'],
 'Folate': ['400 mcg', '1,000 mcg More Information About Folate'],
 'Niacin': ['16 mg', '35 mg More Information About Niacin'],
 'Choline': ['0.55 g', '3.5 g'],
 'Pantothenic Acid': ['5 mg', '0 mg'],
 'Biotin': ['30 mcg', '0 mcg'],
 'Carotenoids': ['NA', 'ND More Information About Carotenoids']}

In [11]:
b['Vitamin A (IU)'] = b.pop('Vitamin A')
b['Vitamin C (mg)'] = b.pop('Vitamin C')
b['Vitamin D (IU)'] = b.pop('Vitamin D')
b['B6 (Pyridoxine) (mg)'] = b.pop('Vitamin B6')
b['Vitamin E (mg)'] = b.pop('Vitamin E')
b['Vitamin K (µg)'] = b.pop('Vitamin K')
b['B1 (Thiamine) (mg)'] = b.pop('Thiamin')
b['B12 (Cobalamin) (µg)'] = b.pop('Vitamin B12')
b['B2 (Riboflavin) (mg)'] = b.pop('Riboflavin')
b['Folate (µg)'] = b.pop('Folate')
b['B3 (Niacin) (mg)'] = b.pop('Niacin')
b['B5 (Pantothenic Acid) (mg)'] = b.pop('Pantothenic Acid')
b

{'Choline': ['0.55 g', '3.5 g'],
 'Biotin': ['30 mcg', '0 mcg'],
 'Carotenoids': ['NA', 'ND More Information About Carotenoids'],
 'Vitamin A (IU)': ['900 mcg', '3,000 mcg More Information About Vitamin_a'],
 'Vitamin C (mg)': ['90 mg', '2,000 mg'],
 'Vitamin D (IU)': ['15 mcg', '100 mcg'],
 'B6 (Pyridoxine) (mg)': ['1.3 mg', '100 mg'],
 'Vitamin E (mg)': ['15 mg', '1,000 mg More Information About Vitamin_e'],
 'Vitamin K (µg)': ['120 mcg', '0 mcg'],
 'B1 (Thiamine) (mg)': ['1.2 mg', '0 mg'],
 'B12 (Cobalamin) (µg)': ['2.4 mcg', '0 mcg'],
 'B2 (Riboflavin) (mg)': ['1.3 mg', '0 mg'],
 'Folate (µg)': ['400 mcg', '1,000 mcg More Information About Folate'],
 'B3 (Niacin) (mg)': ['16 mg', '35 mg More Information About Niacin'],
 'B5 (Pantothenic Acid) (mg)': ['5 mg', '0 mg']}

In [12]:
b = process_dict_b(b)
b

{'Choline': ['0.55', '3.5'],
 'Biotin': ['30', '0'],
 'Carotenoids': [None, None],
 'Vitamin A (IU)': ['900', '3000'],
 'Vitamin C (mg)': ['90', '2000'],
 'Vitamin D (IU)': ['15', '100'],
 'B6 (Pyridoxine) (mg)': ['1.3', '100'],
 'Vitamin E (mg)': ['15', '1000'],
 'Vitamin K (µg)': ['120', '0'],
 'B1 (Thiamine) (mg)': ['1.2', '0'],
 'B12 (Cobalamin) (µg)': ['2.4', '0'],
 'B2 (Riboflavin) (mg)': ['1.3', '0'],
 'Folate (µg)': ['400', '1000'],
 'B3 (Niacin) (mg)': ['16', '35'],
 'B5 (Pantothenic Acid) (mg)': ['5', '0']}

In [13]:
c

{'Calcium': ['1,000 mg', '2,500 mg'],
 'Chloride': ['2.3 g', '3.6 g'],
 'Chromium': ['35 mcg', '0 mcg'],
 'Copper': ['900 mcg', '10,000 mcg'],
 'Fluoride': ['4 mg', '10 mg'],
 'Iodine': ['150 mcg', '1,100 mcg'],
 'Iron': ['8 mg', '45 mg'],
 'Magnesium': ['400 mg', '350 mg More Information About Magnesium'],
 'Manganese': ['2.3 mg', '11 mg'],
 'Molybdenum': ['45 mcg', '2,000 mcg'],
 'Phosphorus': ['0.7 g', '4 g'],
 'Potassium': ['3,400 mg', '0 mg'],
 'Selenium': ['55 mcg', '400 mcg'],
 'Sodium': ['1,500 mg', '0 mg'],
 'Zinc': ['11 mg', '40 mg'],
 'Arsenic': ['NA', 'ND More Information About Arsenic'],
 'Boron': ['0 mg', '20 mg'],
 'Nickel': ['0 mg', '1 mg'],
 'Silicon': ['NA', 'ND More Information About Silicon'],
 'Sulfate': ['NA', 'ND'],
 'Vanadium': ['0 mg', '1.8 mg More Information About Vanadium']}

In [14]:
c['Calcium (mg)'] = c.pop('Calcium')
c['Copper (mg)'] = c.pop('Chloride')
c['Iron (mg)'] = c.pop('Iron')
c['Magnesium (mg)'] = c.pop('Magnesium')
c['Manganese (mg)'] = c.pop('Manganese')
c['Phosphorus (mg)'] = c.pop('Phosphorus')
c['Potassium (mg)'] = c.pop('Potassium')
c['Selenium (µg)'] = c.pop('Selenium')
c['Sodium (mg)'] = c.pop('Sodium')
c['Zinc (mg)'] = c.pop('Zinc')
c

{'Chromium': ['35 mcg', '0 mcg'],
 'Copper': ['900 mcg', '10,000 mcg'],
 'Fluoride': ['4 mg', '10 mg'],
 'Iodine': ['150 mcg', '1,100 mcg'],
 'Molybdenum': ['45 mcg', '2,000 mcg'],
 'Arsenic': ['NA', 'ND More Information About Arsenic'],
 'Boron': ['0 mg', '20 mg'],
 'Nickel': ['0 mg', '1 mg'],
 'Silicon': ['NA', 'ND More Information About Silicon'],
 'Sulfate': ['NA', 'ND'],
 'Vanadium': ['0 mg', '1.8 mg More Information About Vanadium'],
 'Calcium (mg)': ['1,000 mg', '2,500 mg'],
 'Copper (mg)': ['2.3 g', '3.6 g'],
 'Iron (mg)': ['8 mg', '45 mg'],
 'Magnesium (mg)': ['400 mg', '350 mg More Information About Magnesium'],
 'Manganese (mg)': ['2.3 mg', '11 mg'],
 'Phosphorus (mg)': ['0.7 g', '4 g'],
 'Potassium (mg)': ['3,400 mg', '0 mg'],
 'Selenium (µg)': ['55 mcg', '400 mcg'],
 'Sodium (mg)': ['1,500 mg', '0 mg'],
 'Zinc (mg)': ['11 mg', '40 mg']}

In [15]:
c = process_dict_b(c)
c

{'Chromium': ['35', '0'],
 'Copper': ['900', '10000'],
 'Fluoride': ['4', '10'],
 'Iodine': ['150', '1100'],
 'Molybdenum': ['45', '2000'],
 'Arsenic': [None, None],
 'Boron': ['0', '20'],
 'Nickel': ['0', '1'],
 'Silicon': [None, None],
 'Sulfate': [None, None],
 'Vanadium': ['0', '1.8'],
 'Calcium (mg)': ['1000', '2500'],
 'Copper (mg)': ['2.3', '3.6'],
 'Iron (mg)': ['8', '45'],
 'Magnesium (mg)': ['400', '350'],
 'Manganese (mg)': ['2.3', '11'],
 'Phosphorus (mg)': ['0.7', '4'],
 'Potassium (mg)': ['3400', '0'],
 'Selenium (µg)': ['55', '400'],
 'Sodium (mg)': ['1500', '0'],
 'Zinc (mg)': ['11', '40']}

In [16]:
def process_data(a,b,c):
    a['Carbs (g)'] = a.pop('Carbohydrate')
    a['Fiber (g)'] = a.pop('Total Fiber')
    a['Protein (g)'] = a.pop('Protein')
    a['Fat (g)'] = a.pop('Fat')
    a['Trans-Fats (g)'] = a.pop('Trans fatty acids')
    a['Cholesterol (mg)'] = a.pop('Dietary Cholesterol')
    a['water'] = a.pop('Total Water')
    
    b['Vitamin A (IU)'] = b.pop('Vitamin A')
    b['Vitamin C (mg)'] = b.pop('Vitamin C')
    b['Vitamin D (IU)'] = b.pop('Vitamin D')
    b['B6 (Pyridoxine) (mg)'] = b.pop('Vitamin B6')
    b['Vitamin E (mg)'] = b.pop('Vitamin E')
    b['Vitamin K (µg)'] = b.pop('Vitamin K')
    b['B1 (Thiamine) (mg)'] = b.pop('Thiamin')
    b['B12 (Cobalamin) (µg)'] = b.pop('Vitamin B12')
    b['B2 (Riboflavin) (mg)'] = b.pop('Riboflavin')
    b['Folate (µg)'] = b.pop('Folate')
    b['B3 (Niacin) (mg)'] = b.pop('Niacin')
    b['B5 (Pantothenic Acid) (mg)'] = b.pop('Pantothenic Acid')
    
    c['Calcium (mg)'] = c.pop('Calcium')
    c['Copper (mg)'] = c.pop('Chloride')
    c['Iron (mg)'] = c.pop('Iron')
    c['Magnesium (mg)'] = c.pop('Magnesium')
    c['Manganese (mg)'] = c.pop('Manganese')
    c['Phosphorus (mg)'] = c.pop('Phosphorus')
    c['Potassium (mg)'] = c.pop('Potassium')
    c['Selenium (µg)'] = c.pop('Selenium')
    c['Sodium (mg)'] = c.pop('Sodium')
    c['Zinc (mg)'] = c.pop('Zinc')
    a =  process_dict(a)
    b =  process_dict_b(b)
    c =  process_dict_b(c)
    return a,b,c

    