In [1]:
import pandas as pd
import numpy as np
import os
from multiprocessing import Pool, cpu_count
from tqdm import tqdm_notebook as tqdm
from IPython.display import display

In [49]:
def nan_score(df):
    """Return the fraction of cells in ``df`` that hold a non-null value.

    The score is the mean of the per-column fill ratios; since every column
    has the same number of rows this equals (non-null cells) / (all cells).
    """
    filled = pd.notna(df)
    per_column_ratio = filled.mean()
    return per_column_ratio.mean()


def nan_sanitization(df, percentage=0.80, step=0.01):
    """Iteratively drop sparse columns and rows until the frame is dense enough.

    Iterations alternate between columns (even iterations) and rows (odd
    iterations). Each iteration drops the columns/rows whose ratio of non-null
    values is below the current threshold, then raises the threshold by 1%.
    The loop ends as soon as (non-null values) / (all values) >= ``percentage``
    or the frame has no cells left.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to sanitize. It is not modified; a reduced frame is returned.
    percentage : float, default 0.80
        Minimum overall ratio of non-null cells required to stop.
    step : float, default 0.01
        Initial per-column / per-row fill ratio below which an entry is dropped.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself when it is already dense enough (or empty), otherwise a
        new frame with the sparse columns/rows removed.
    """
    iteration = 0
    while True:
        filled = df.notna()
        # A frame without cells has no defined density, so stop there too.
        if filled.size == 0 or filled.values.mean() >= percentage:
            return df
        if iteration % 2 == 0:
            # The axis is spelled out on purpose: np.mean(DataFrame) forwards
            # axis=None, which pandas >= 2.0 reduces over BOTH axes to a single
            # scalar, so it cannot be used to build a per-column mask.
            sparse = filled.mean(axis=0).values < step
            df = df.drop(columns=df.columns[sparse])
        else:
            sparse = filled.mean(axis=1).values < step
            df = df.drop(index=df.index[sparse])
        iteration += 1
        step *= 1.01


def zero_sanitization(df):
    """Drop the columns in which every value is either null or numerically zero.

    A value counts as zero when ``np.isclose(value, 0)`` holds. The zero test
    is applied to every column with a NumPy numeric dtype (integers included,
    so an all-``0`` column parsed as ``int64`` is dropped as well). Columns of
    any other dtype (bool, object, datetime, extension types) are only dropped
    when they are entirely null.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame containing only the columns that carry some information.
    """
    is_zero = np.zeros(df.shape, dtype=bool)
    for position in range(df.shape[1]):
        # Positional access always yields a Series, even with repeated labels.
        column = df.iloc[:, position]
        dtype = column.dtype
        # np.bool_ is not an np.number, so boolean columns are left alone.
        if isinstance(dtype, np.dtype) and np.issubdtype(dtype, np.number):
            # NaN is never "close" to zero; nulls are handled by the isna mask.
            is_zero[:, position] = np.isclose(column.values, 0)
    uninformative = np.all(pd.isna(df).values | is_zero, axis=0)
    return df.loc[:, ~uninformative]


def vitamins_norm(column_name):
    """Expand every ``vit.`` abbreviation in ``column_name`` to ``vitamina ``."""
    pieces = column_name.split("vit.")
    return "vitamina ".join(pieces)


def single_space_norm(column_name):
    """Collapse every run of consecutive spaces in ``column_name`` to one space.

    A single ``replace("  ", " ")`` pass only halves each run, so runs of three
    or more spaces would still contain a double space; the replacement is
    therefore repeated until no double space remains.
    """
    while "  " in column_name:
        column_name = column_name.replace("  ", " ")
    return column_name


def apply_column_naming_norms(column_name):
    """Return ``column_name`` with all column naming rules applied.

    The vitamin expansion runs first and the space normalisation second, so
    any double space introduced by the expansion is cleaned up afterwards.
    """
    return single_space_norm(vitamins_norm(column_name))


def apply_index_naming_norms(column_name):
    """Return the index label ``column_name`` with the index naming rules applied.

    Only the space normalisation applies to index labels.
    """
    return single_space_norm(column_name)


def sanitize(args):
    """Clean one dataset and write it to disk as CSV.

    Parameters
    ----------
    args : tuple
        ``(path, df)`` pair, packed into one argument so the function can be
        mapped over a list with ``Pool.imap``. ``path`` is the output CSV path
        and ``df`` the frame to clean.

    The frame goes through ``zero_sanitization`` and ``nan_sanitization``; its
    index and column labels are then lower-cased and normalised, the index is
    named ``"name"``, and the result is written to ``path``. The directory of
    ``path`` is created when missing. Returns ``None``.
    """
    path, df = args
    df = nan_sanitization(zero_sanitization(df))
    df.index = [apply_index_naming_norms(label) for label in df.index.str.lower()]
    df.columns = [apply_column_naming_norms(label) for label in df.columns.str.lower()]
    df.index.name = "name"
    # to_csv does not create directories; make sure the target exists so a
    # fresh checkout can run the notebook. exist_ok keeps this safe when
    # several workers reach this line at the same time.
    directory = os.path.dirname(path)
    if directory:
        os.makedirs(directory, exist_ok=True)
    df.to_csv(path)

In [50]:
# Build one (output path, loaded frame) pair for every ".csv" file found
# anywhere below ../csv/, skipping files whose name contains "cibo360".
# NOTE: the output template only uses the file name, so files sharing a name
# in different sub-folders map to the same path in ../sanitized_csv/.
dataframes = [
    (
        "../sanitized_csv/{csv}".format(path=folder, csv=filename),
        pd.read_csv(
            "{path}/{csv}".format(path=folder, csv=filename),
            index_col="name",
        ),
    )
    for folder, filename in (
        (root, entry)
        for root, _subdirs, entries in os.walk("../csv/")
        for entry in entries
    )
    if filename.endswith(".csv") and "cibo360" not in filename
]

In [51]:
# Sanitize all loaded frames in parallel with one worker per CPU core.
# imap is lazy: list() drains it inside the with-block so every task has
# finished before the pool is torn down, while tqdm reports the progress.
with Pool(processes=cpu_count()) as p:
    pending = p.imap(sanitize, dataframes)
    list(tqdm(pending, total=len(dataframes)))

Index(['vitamina b6 | g'], dtype='object')
Index([], dtype='object')
Index([], dtype='object')


HBox(children=(IntProgress(value=0, max=9), HTML(value='')))

Index([], dtype='object')
Index(['aminoacido limitante | g', 'indice chimico | g'], dtype='object')
Index([], dtype='object')
Index([], dtype='object')
Index([], dtype='object')
Index([], dtype='object')


In [42]:
import math
def magnitude(value):
    """Return the base-10 order of magnitude of ``value``.

    This is ``floor(log10(|value|))`` as an ``int``; zero, whose logarithm is
    undefined, is mapped to 0.
    """
    exponent = 0
    if value != 0:
        exponent = int(math.floor(math.log10(abs(value))))
    return exponent

# Spot-check: 65 lies in the tens (10^1) and 230 in the hundreds (10^2).
magnitude(65), magnitude(230)

(1, 2)