Take the remaining nutritional columns in ABBREV.xlsx (from `Water_(g)` onwards) and map them to the daily nutritional requirements data in the attached file daily_nutrients.xlsx. Again, this can be done by hand or by code. Not every column will have an equivalent.

In [2]:
import numpy as np 
import pandas as pd
import difflib as diff

import matplotlib.pyplot as plt

In [3]:
# Daily requirement table and the mapping.csv file
daily_nutrients = pd.read_excel("daily_nutrients.xlsx")
mapping = pd.read_csv("mapping.csv")

# Source datasets; encoding_errors='ignore' skips bytes that fail to decode
# instead of raising, so malformed characters are dropped silently.
agg_df = pd.read_csv("Agriculture_Dataset_text.csv", encoding_errors='ignore')
abb_df = pd.read_excel("ABBREV.xlsx")

In [62]:
# Preview the first rows of the daily-requirements table
daily_nutrients.head()

Unnamed: 0,Nutrient,Daily Value
0,Calcium,1300mg
1,Dietary Fiber,28g
2,Fat,78g
3,Magnesium,420mg
4,Manganese,2.3mg


In [20]:
# List every nutrient name in the daily-requirements table (mapping targets)
daily_nutrients.Nutrient.values

array(['Calcium', 'Dietary Fiber', 'Fat', 'Magnesium', 'Manganese',
       'Phosphorus', 'Potassium', 'Vitamin C', 'Vitamin D', 'Vitamin K',
       'Biotin', 'Chloride', 'Chromium', 'Copper', 'Folate/Folic Acid',
       'Molybdenum', 'Niacin', 'Pantothenic Acid', 'Riboflavin',
       'Selenium', 'Sodium', 'Thiamin', 'Total carbohydrate', 'Vitamin A',
       'Vitamin B6', 'Vitamin B12', 'Vitamin E', 'Zinc', 'Cholesterol',
       'Iodine', 'Iron', 'Protein', 'Saturated fat', 'Added sugars',
       'Choline'], dtype=object)

In [63]:
# List the ABBREV column names to pick out the nutritional columns to map
abb_df.columns

Index(['NDB_No', 'Shrt_Desc', 'Water_(g)', 'Energ_Kcal', 'Protein_(g)',
       'Lipid_Tot_(g)', 'Ash_(g)', 'Carbohydrt_(g)', 'Fiber_TD_(g)',
       'Sugar_Tot_(g)', 'Calcium_(mg)', 'Iron_(mg)', 'Magnesium_(mg)',
       'Phosphorus_(mg)', 'Potassium_(mg)', 'Sodium_(mg)', 'Zinc_(mg)',
       'Copper_mg)', 'Manganese_(mg)', 'Selenium_(µg)', 'Vit_C_(mg)',
       'Thiamin_(mg)', 'Riboflavin_(mg)', 'Niacin_(mg)', 'Panto_Acid_mg)',
       'Vit_B6_(mg)', 'Folate_Tot_(µg)', 'Folic_Acid_(µg)', 'Food_Folate_(µg)',
       'Folate_DFE_(µg)', 'Choline_Tot_ (mg)', 'Vit_B12_(µg)', 'Vit_A_IU',
       'Vit_A_RAE', 'Retinol_(µg)', 'Alpha_Carot_(µg)', 'Beta_Carot_(µg)',
       'Beta_Crypt_(µg)', 'Lycopene_(µg)', 'Lut+Zea_ (µg)', 'Vit_E_(mg)',
       'Vit_D_µg', 'Vit_D_IU', 'Vit_K_(µg)', 'FA_Sat_(g)', 'FA_Mono_(g)',
       'FA_Poly_(g)', 'Cholestrl_(mg)', 'GmWt_1', 'GmWt_Desc1', 'GmWt_2',
       'GmWt_Desc2', 'Refuse_Pct'],
      dtype='object')

In [69]:
# Hand-built mapping; each entry is [ABBREV column name, daily_nutrients "Nutrient" value].
# Three ABBREV headers are malformed in the spreadsheet (Copper_mg), Panto_Acid_mg),
# Choline_Tot_ (mg)) and are spelled here exactly as they appear there.
# Several ABBREV columns can point at the same nutrient (folate and vitamin D variants).
pairs = [
    ["Protein_(g)", "Protein"],
    ["Lipid_Tot_(g)", "Fat"],
    ["Carbohydrt_(g)", "Total carbohydrate"],
    ["Fiber_TD_(g)", "Dietary Fiber"],
    ["Calcium_(mg)", "Calcium"],
    ["Iron_(mg)", "Iron"],
    ["Magnesium_(mg)", "Magnesium"],
    ["Phosphorus_(mg)", "Phosphorus"],
    ["Potassium_(mg)", "Potassium"],
    ["Sodium_(mg)", "Sodium"],
    ["Zinc_(mg)", "Zinc"],
    ["Copper_mg)", "Copper"],
    ["Manganese_(mg)", "Manganese"],
    ["Selenium_(µg)", "Selenium"],
    ["Vit_C_(mg)", "Vitamin C"],
    ["Thiamin_(mg)", "Thiamin"],
    ["Riboflavin_(mg)", "Riboflavin"],
    ["Niacin_(mg)", "Niacin"],
    ["Panto_Acid_mg)", "Pantothenic Acid"],
    ["Vit_B6_(mg)", "Vitamin B6"],
    ["Folate_Tot_(µg)", "Folate/Folic Acid"],
    ["Folic_Acid_(µg)", "Folate/Folic Acid"],
    ["Food_Folate_(µg)", "Folate/Folic Acid"],
    ["Folate_DFE_(µg)", "Folate/Folic Acid"],
    ["Choline_Tot_ (mg)", "Choline"],
    ["Vit_B12_(µg)", "Vitamin B12"],
    ["Vit_A_IU", "Vitamin A"],
    ["Vit_E_(mg)", "Vitamin E"],
    ["Vit_D_µg", "Vitamin D"],
    ["Vit_D_IU", "Vitamin D"],
    ["Vit_K_(µg)", "Vitamin K"],
    ["FA_Sat_(g)", "Saturated fat"],
    ["Cholestrl_(mg)", "Cholesterol"],
]

In [70]:
# Display the hand-built mapping list for a visual check
pairs

[['Protein_(g)', 'Protein'],
 ['Lipid_Tot_(g)', 'Fat'],
 ['Carbohydrt_(g)', 'Total carbohydrate'],
 ['Fiber_TD_(g)', 'Dietary Fiber'],
 ['Calcium_(mg)', 'Calcium'],
 ['Iron_(mg)', 'Iron'],
 ['Magnesium_(mg)', 'Magnesium'],
 ['Phosphorus_(mg)', 'Phosphorus'],
 ['Potassium_(mg)', 'Potassium'],
 ['Sodium_(mg)', 'Sodium'],
 ['Zinc_(mg)', 'Zinc'],
 ['Copper_mg)', 'Copper'],
 ['Manganese_(mg)', 'Manganese'],
 ['Selenium_(µg)', 'Selenium'],
 ['Vit_C_(mg)', 'Vitamin C'],
 ['Thiamin_(mg)', 'Thiamin'],
 ['Riboflavin_(mg)', 'Riboflavin'],
 ['Niacin_(mg)', 'Niacin'],
 ['Panto_Acid_mg)', 'Pantothenic Acid'],
 ['Vit_B6_(mg)', 'Vitamin B6'],
 ['Folate_Tot_(µg)', 'Folate/Folic Acid'],
 ['Folic_Acid_(µg)', 'Folate/Folic Acid'],
 ['Food_Folate_(µg)', 'Folate/Folic Acid'],
 ['Folate_DFE_(µg)', 'Folate/Folic Acid'],
 ['Choline_Tot_ (mg)', 'Choline'],
 ['Vit_B12_(µg)', 'Vitamin B12'],
 ['Vit_A_IU', 'Vitamin A'],
 ['Vit_E_(mg)', 'Vitamin E'],
 ['Vit_D_µg', 'Vitamin D'],
 ['Vit_D_IU', 'Vitamin D'],
 ['Vit_K_

In [71]:
# Transpose the pair list into a 2 x N array: row 0 holds the ABBREV column
# names, row 1 the matching daily-nutrient names.
# Convert only while `pairs` is still the raw list, so that re-running this
# cell does not transpose the array back to N x 2 and check the wrong values.
if isinstance(pairs, list):
    pairs = np.array(pairs).transpose()

# Check that the column names are accurate: every ABBREV name must be a real
# column of abb_df and every nutrient name must exist in daily_nutrients.
for abbrev_col, nutrient_name in zip(pairs[0], pairs[1]):
    check1 = abbrev_col in abb_df.columns
    check2 = nutrient_name in daily_nutrients.Nutrient.values

    print(abbrev_col, ": ", check1)
    print(nutrient_name, ": ", check2)

Protein_(g) :  True
Protein :  True
Lipid_Tot_(g) :  True
Fat :  True
Carbohydrt_(g) :  True
Total carbohydrate :  True
Fiber_TD_(g) :  True
Dietary Fiber :  True
Calcium_(mg) :  True
Calcium :  True
Iron_(mg) :  True
Iron :  True
Magnesium_(mg) :  True
Magnesium :  True
Phosphorus_(mg) :  True
Phosphorus :  True
Potassium_(mg) :  True
Potassium :  True
Sodium_(mg) :  True
Sodium :  True
Zinc_(mg) :  True
Zinc :  True
Copper_mg) :  True
Copper :  True
Manganese_(mg) :  True
Manganese :  True
Selenium_(µg) :  True
Selenium :  True
Vit_C_(mg) :  True
Vitamin C :  True
Thiamin_(mg) :  True
Thiamin :  True
Riboflavin_(mg) :  True
Riboflavin :  True
Niacin_(mg) :  True
Niacin :  True
Panto_Acid_mg) :  True
Pantothenic Acid :  True
Vit_B6_(mg) :  True
Vitamin B6 :  True
Folate_Tot_(µg) :  True
Folate/Folic Acid :  True
Folic_Acid_(µg) :  True
Folate/Folic Acid :  True
Food_Folate_(µg) :  True
Folate/Folic Acid :  True
Folate_DFE_(µg) :  True
Folate/Folic Acid :  True
Choline_Tot_ (mg) :  Tru

In [72]:
# Build a two-column lookup table from the two rows of the transposed pairs
abbrev_names, nutrient_names = pairs[0], pairs[1]
nutrient_map_df = pd.DataFrame({"abbrev": abbrev_names, "nutrient": nutrient_names})

In [73]:
# Display the mapping table
nutrient_map_df

Unnamed: 0,abbrev,nutrient
0,Protein_(g),Protein
1,Lipid_Tot_(g),Fat
2,Carbohydrt_(g),Total carbohydrate
3,Fiber_TD_(g),Dietary Fiber
4,Calcium_(mg),Calcium
5,Iron_(mg),Iron
6,Magnesium_(mg),Magnesium
7,Phosphorus_(mg),Phosphorus
8,Potassium_(mg),Potassium
9,Sodium_(mg),Sodium


In [74]:
# Write the mapping table to CSV (the default index column is written too)
nutrient_map_df.to_csv("abbrev_nutrient_map.csv")

In [75]:
# Read the exported mapping back in to confirm the round trip;
# index_col=0 uses the file's first column as the index.
test = pd.read_csv(filepath_or_buffer="abbrev_nutrient_map.csv", index_col=0)
test

Unnamed: 0,abbrev,nutrient
0,Protein_(g),Protein
1,Lipid_Tot_(g),Fat
2,Carbohydrt_(g),Total carbohydrate
3,Fiber_TD_(g),Dietary Fiber
4,Calcium_(mg),Calcium
5,Iron_(mg),Iron
6,Magnesium_(mg),Magnesium
7,Phosphorus_(mg),Phosphorus
8,Potassium_(mg),Potassium
9,Sodium_(mg),Sodium


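The following cells correct the malformed ABBREV column names (for example `Copper_mg)` becomes `Copper_(mg)`), then rebuild the mapping with the corrected names and export it again.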

In [13]:
# Replacement headers for ABBREV, listed in the spreadsheet's column order.
# Malformed unit suffixes are normalised to the "_(unit)" form, e.g.
# 'Copper_mg)' -> 'Copper_(mg)', 'Choline_Tot_ (mg)' -> 'Choline_Tot_(mg)',
# 'Vit_A_IU' -> 'Vit_A_(IU)'.
corrected_columns = [
    # identifiers
    'NDB_No', 'Shrt_Desc',
    # water, energy and macronutrients
    'Water_(g)', 'Energ_Kcal', 'Protein_(g)', 'Lipid_Tot_(g)', 'Ash_(g)',
    'Carbohydrt_(g)', 'Fiber_TD_(g)', 'Sugar_Tot_(g)',
    # minerals
    'Calcium_(mg)', 'Iron_(mg)', 'Magnesium_(mg)', 'Phosphorus_(mg)',
    'Potassium_(mg)', 'Sodium_(mg)', 'Zinc_(mg)', 'Copper_(mg)',
    'Manganese_(mg)', 'Selenium_(µg)',
    # vitamins and related columns
    'Vit_C_(mg)', 'Thiamin_(mg)', 'Riboflavin_(mg)', 'Niacin_(mg)',
    'Panto_Acid_(mg)', 'Vit_B6_(mg)', 'Folate_Tot_(µg)', 'Folic_Acid_(µg)',
    'Food_Folate_(µg)', 'Folate_DFE_(µg)', 'Choline_Tot_(mg)', 'Vit_B12_(µg)',
    'Vit_A_(IU)', 'Vit_A_(RAE)', 'Retinol_(µg)', 'Alpha_Carot_(µg)',
    'Beta_Carot_(µg)', 'Beta_Crypt_(µg)', 'Lycopene_(µg)', 'Lut+Zea_(µg)',
    'Vit_E_(mg)', 'Vit_D_(µg)', 'Vit_D_(IU)', 'Vit_K_(µg)',
    # fatty acids and cholesterol
    'FA_Sat_(g)', 'FA_Mono_(g)', 'FA_Poly_(g)', 'Cholestrl_(mg)',
    # gram weights and refuse percentage
    'GmWt_1', 'GmWt_Desc1', 'GmWt_2', 'GmWt_Desc2', 'Refuse_Pct',
]

In [15]:
# Set column names of ABBREV to the corrected names.
# NOTE(review): the assignment is positional and modifies abb_df in place, so
# corrected_columns must list the headers in the same order as the sheet.
abb_df.columns = corrected_columns

In [16]:
# Rebuild the mapping against the corrected ABBREV headers.
# Each entry is [ABBREV column name, daily_nutrients "Nutrient" value]; the
# previously malformed names are now Copper_(mg), Panto_Acid_(mg) and
# Choline_Tot_(mg), and the IU/µg columns carry a parenthesised unit.
pairs = [
    ["Protein_(g)", "Protein"],
    ["Lipid_Tot_(g)", "Fat"],
    ["Carbohydrt_(g)", "Total carbohydrate"],
    ["Fiber_TD_(g)", "Dietary Fiber"],
    ["Calcium_(mg)", "Calcium"],
    ["Iron_(mg)", "Iron"],
    ["Magnesium_(mg)", "Magnesium"],
    ["Phosphorus_(mg)", "Phosphorus"],
    ["Potassium_(mg)", "Potassium"],
    ["Sodium_(mg)", "Sodium"],
    ["Zinc_(mg)", "Zinc"],
    ["Copper_(mg)", "Copper"],
    ["Manganese_(mg)", "Manganese"],
    ["Selenium_(µg)", "Selenium"],
    ["Vit_C_(mg)", "Vitamin C"],
    ["Thiamin_(mg)", "Thiamin"],
    ["Riboflavin_(mg)", "Riboflavin"],
    ["Niacin_(mg)", "Niacin"],
    ["Panto_Acid_(mg)", "Pantothenic Acid"],
    ["Vit_B6_(mg)", "Vitamin B6"],
    ["Folate_Tot_(µg)", "Folate/Folic Acid"],
    ["Folic_Acid_(µg)", "Folate/Folic Acid"],
    ["Food_Folate_(µg)", "Folate/Folic Acid"],
    ["Folate_DFE_(µg)", "Folate/Folic Acid"],
    ["Choline_Tot_(mg)", "Choline"],
    ["Vit_B12_(µg)", "Vitamin B12"],
    ["Vit_A_(IU)", "Vitamin A"],
    ["Vit_E_(mg)", "Vitamin E"],
    ["Vit_D_(µg)", "Vitamin D"],
    ["Vit_D_(IU)", "Vitamin D"],
    ["Vit_K_(µg)", "Vitamin K"],
    ["FA_Sat_(g)", "Saturated fat"],
    ["Cholestrl_(mg)", "Cholesterol"],
]

In [17]:
# Transpose the pair list into a 2 x N array: row 0 holds the ABBREV column
# names, row 1 the matching daily-nutrient names.
# Convert only while `pairs` is still the raw list, so that re-running this
# cell does not transpose the array back to N x 2 and check the wrong values.
if isinstance(pairs, list):
    pairs = np.array(pairs).transpose()

# Check that the column names are accurate: every ABBREV name must be a real
# column of abb_df and every nutrient name must exist in daily_nutrients.
for abbrev_col, nutrient_name in zip(pairs[0], pairs[1]):
    check1 = abbrev_col in abb_df.columns
    check2 = nutrient_name in daily_nutrients.Nutrient.values

    print(abbrev_col, ": ", check1)
    print(nutrient_name, ": ", check2)

Protein_(g) :  True
Protein :  True
Lipid_Tot_(g) :  True
Fat :  True
Carbohydrt_(g) :  True
Total carbohydrate :  True
Fiber_TD_(g) :  True
Dietary Fiber :  True
Calcium_(mg) :  True
Calcium :  True
Iron_(mg) :  True
Iron :  True
Magnesium_(mg) :  True
Magnesium :  True
Phosphorus_(mg) :  True
Phosphorus :  True
Potassium_(mg) :  True
Potassium :  True
Sodium_(mg) :  True
Sodium :  True
Zinc_(mg) :  True
Zinc :  True
Copper_(mg) :  True
Copper :  True
Manganese_(mg) :  True
Manganese :  True
Selenium_(µg) :  True
Selenium :  True
Vit_C_(mg) :  True
Vitamin C :  True
Thiamin_(mg) :  True
Thiamin :  True
Riboflavin_(mg) :  True
Riboflavin :  True
Niacin_(mg) :  True
Niacin :  True
Panto_Acid_(mg) :  True
Pantothenic Acid :  True
Vit_B6_(mg) :  True
Vitamin B6 :  True
Folate_Tot_(µg) :  True
Folate/Folic Acid :  True
Folic_Acid_(µg) :  True
Folate/Folic Acid :  True
Food_Folate_(µg) :  True
Folate/Folic Acid :  True
Folate_DFE_(µg) :  True
Folate/Folic Acid :  True
Choline_Tot_(mg) :  Tr

In [19]:
# Build a two-column lookup table from the two rows of the transposed pairs
abbrev_names, nutrient_names = pairs[0], pairs[1]
nutrient_map_df = pd.DataFrame({"abbrev": abbrev_names, "nutrient": nutrient_names})

# Write the mapping table to CSV (the default index column is written too)
nutrient_map_df.to_csv("abbrev_nutrient_map.csv")


In [20]:

# Save ABBREV, now carrying the corrected headers, as a CSV file
# (the default index column is written too)
abb_df.to_csv("ABBREV_2.csv")

In [21]:
# Read the exported mapping back in to confirm the round trip;
# index_col=0 uses the file's first column as the index.
test = pd.read_csv(filepath_or_buffer="abbrev_nutrient_map.csv", index_col=0)
test

Unnamed: 0,abbrev,nutrient
0,Protein_(g),Protein
1,Lipid_Tot_(g),Fat
2,Carbohydrt_(g),Total carbohydrate
3,Fiber_TD_(g),Dietary Fiber
4,Calcium_(mg),Calcium
5,Iron_(mg),Iron
6,Magnesium_(mg),Magnesium
7,Phosphorus_(mg),Phosphorus
8,Potassium_(mg),Potassium
9,Sodium_(mg),Sodium
