In [66]:
# import sys
# sys.path.append("../")
import os
import pandas as pd
import numpy as np


## Loss-adjusted Food Availability

In [67]:
def get_paths(dirname: str) -> list[str]:
    all_files = os.listdir(dirname)
    return [f"{dirname}{filename}" for filename in all_files]

def has_numbers(input_str: str) -> bool:
    if input_str.startswith("19") or input_str.startswith("20"):
        return True
    return False

def change_nan_col(df: pd.DataFrame) -> list[str]:
    cols = list(df.columns)
    indx = [i for i, col in enumerate(cols) if col is np.nan]

    if len(indx) == 2:
        new_cols = ["Loss oz/day", "Loss g/day"]
    elif len(indx) == 3:
        new_cols = ["Loss gal/year", "Loss oz/day", "Loss g/day"]
    elif len(indx) == 4:
        new_cols = ["Edible weight", "Other loss", "Loss oz/day", "Loss g/day"]
    elif len(indx) == 5:
        new_cols = ["Edible weight", "Other loss", "Loss gal/year", "Loss oz/day", "Loss g/day"]
    for i, col in zip(indx, new_cols):
        cols[i] = col
        
    return cols

def change_col_names(df: pd.DataFrame) -> dict[str, str]:
    avail_cal = ""
    primary_weight = ""
    for col in list(df.columns):
        if str(col).startswith("Calories"):
            avail_cal = col 
    for col in list(df.columns):
        if str(col).startswith("Primary"):
            primary_weight = col 
            
    new_col_names = {
        "Year": "year",
        primary_weight: "original weight",
        "Edible weight": "edible weight",
        "Total loss, all levels": "total percent loss",
        "Per capita availability adjusted for loss": "loss lbs/year",
        "Loss g/day": "loss g/day",
        avail_cal: "available calories/day"
        
    }
    
    return new_col_names

In [77]:
os.system(f"rm -rf {new_dir}")

0

In [69]:
dirname = "Loss-Adjusted Food Availability/"
new_dir = dirname.lower().replace(" ", "-")[:-1] + "-clean"
os.mkdir(new_dir)

### Total & percentage of calories available per capita

In [70]:
path = "./Loss-Adjusted Food Availability/calories.xls"
for sheet in range(1,3):
    df = pd.read_excel("./Loss-Adjusted Food Availability/calories.xls", sheet_name=sheet)
    sheetnames = pd.ExcelFile("./Loss-Adjusted Food Availability/calories.xls").sheet_names 
    df.columns = df.iloc[0]
    if sheet == 1:
        last_index = df[df['Year'] == 2017].index[0] + 1
    elif sheet == 2:
        last_index = df[df['Year'] == 2010].index[0] + 1
        
    df = df[4: last_index]
    df.reset_index(drop=True, inplace=True)
    index = df.loc[df["Year"] == "2000*"].index[0]
    df.loc[index:index+1,"Year"] = "2000"
    df.to_csv(f"{new_dir}/{sheetnames[sheet]}.csv", index=False)
    

In [71]:
get_paths(dirname)

['Loss-Adjusted Food Availability/meat.xls',
 'Loss-Adjusted Food Availability/sugar.xls',
 'Loss-Adjusted Food Availability/Dairy.xls',
 'Loss-Adjusted Food Availability/servings.xls',
 'Loss-Adjusted Food Availability/fat.xls',
 'Loss-Adjusted Food Availability/veg.xls',
 'Loss-Adjusted Food Availability/Fruit.xls',
 'Loss-Adjusted Food Availability/grain.xls',
 'Loss-Adjusted Food Availability/calories.xls']

### How much loss during the processing of each food group, per capita.

In [72]:
# each file
path_names = get_paths(dirname)
for path in path_names:
    if "calories" in path or "serving" in path:
        continue
    # get sheets to process
    file = pd.ExcelFile(path)
    sheet_names = file.sheet_names[1:]
    
    sub_dir = path[path.rfind("/") + 1: path.rfind(".")].lower()
    dir_path = os.path.join(new_dir, sub_dir)
    print(dir_path)

    try:
        os.mkdir(dir_path)
    except OSError as e:
        print(e)

    # each sheet
    for name in sheet_names:
        # read each sheet in the file
        print(f"- {name}")
        df = pd.read_excel(path, sheet_name=name)
        # use title as key 
        # title is header of each sheet
        filename = df.columns[0].split(":")[0].lower()
        filename = filename.replace(" ", "-").replace("/", "-")

        df.columns = df.iloc[0]
        filters = df['Year'].apply(lambda x: has_numbers(str(x)))
        df = df[filters]               
        
            
        df.columns = change_nan_col(df)
        new_col_names = change_col_names(df)
        df = df.rename(columns=new_col_names)
            
        final_cols = ["year", "original weight", "edible weight", "total percent loss", "loss lbs/year", "loss g/day", "available calories/day"]
        
        # total grains has no edible weight column 
        if name == 'Total grains':
            final_cols.remove("edible weight")
        df = df[final_cols]
        print(f"{dir_path}/{filename}.csv")
        df.to_csv(f"{dir_path}/{filename}.csv", index=False)

loss-adjusted-food-availability-clean/meat
- Beef
loss-adjusted-food-availability-clean/meat/beef.csv
- Veal
loss-adjusted-food-availability-clean/meat/veal.csv
- Pork
loss-adjusted-food-availability-clean/meat/pork.csv
- Lamb
loss-adjusted-food-availability-clean/meat/lamb.csv
- Red meat
loss-adjusted-food-availability-clean/meat/red-meat.csv
- Chicken
loss-adjusted-food-availability-clean/meat/chicken.csv
- Turkey
loss-adjusted-food-availability-clean/meat/turkey.csv
- Poultry
loss-adjusted-food-availability-clean/meat/poultry.csv
- Fresh and frozen fish
loss-adjusted-food-availability-clean/meat/fresh-and-frozen-fish.csv
- Fresh and frozen shellfish
loss-adjusted-food-availability-clean/meat/fresh-and-frozen-shellfish.csv
- Total Fresh and Frozen Fish
loss-adjusted-food-availability-clean/meat/total-fresh-and-frozen-fish-and-shellfish.csv
- Canned Salmon
loss-adjusted-food-availability-clean/meat/canned-salmon.csv
- Canned Sardines
loss-adjusted-food-availability-clean/meat/canned-s

In [73]:
dirname = 'food'
file = "asdf.csv"

path = os.path.join(dirname, file)
path

'food/asdf.csv'