In [5]:
import pandas as pd
import numpy as np
import os
from multiprocessing import Pool, cpu_count
from tqdm import tqdm_notebook as tqdm
from IPython.display import display

In [6]:
def nan_score(df):
    """Return the fraction of cells in ``df`` that are not null.

    The share of non-null cells is computed per column and then averaged
    across columns. The result is NaN when there is nothing to average
    (for example a frame without columns).
    """
    filled_share_per_column = pd.notna(df).mean()
    return filled_share_per_column.mean()


def nan_sanitization(df, percentage=0.70, step=0.01, growth=1.001):
    """Iteratively drop sparse columns and rows until ``df`` is dense enough.

    Columns are pruned on even iterations and rows on odd ones. A column or
    row is removed when its share of non-null cells is below the current
    ``step``; ``step`` is multiplied by ``growth`` after every iteration.
    The loop stops as soon as ``nan_score(df)``, the ratio
    (non-null values) / (all values), is no longer below ``percentage``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to prune. It is not modified; a pruned frame is returned
        (the very same object when no pruning was needed).
    percentage : float, default 0.70
        Target ratio of non-null cells.
    step : float, default 0.01
        Initial density threshold below which a column/row is dropped.
    growth : float, default 1.001
        Multiplicative increase applied to ``step`` after each iteration.

    Returns
    -------
    pandas.DataFrame

    Raises
    ------
    ValueError
        If ``step`` is not positive or ``growth`` is not greater than 1,
        because the threshold would then never rise and the loop might not
        terminate.
    """
    if step <= 0 or growth <= 1:
        raise ValueError("step must be > 0 and growth must be > 1")

    iteration = 0
    while nan_score(df) < percentage:
        filled = pd.notna(df)
        if iteration % 2 == 0:
            # The axis is spelled out: np.mean(DataFrame) forwards axis=None,
            # which pandas >= 2.0 reduces to a single scalar instead of one
            # value per column.
            too_sparse = (filled.mean(axis=0) < step).values
            # Positional boolean mask rather than drop-by-label, so that a
            # dense column sharing its label with a sparse one is kept.
            df = df.loc[:, ~too_sparse]
        else:
            too_sparse = (filled.mean(axis=1) < step).values
            df = df.loc[~too_sparse]
        iteration += 1
        step *= growth

    return df

def vitamins_norm(column_name):
    """Expand every ``vit.`` abbreviation in ``column_name`` to ``vitamina ``."""
    abbreviation, expansion = "vit.", "vitamina "
    # Splitting on the abbreviation and re-joining with the expansion
    # substitutes every occurrence.
    return expansion.join(column_name.split(abbreviation))

def single_space_norm(column_name):
    """Collapse every run of consecutive spaces in ``column_name`` to one space.

    A single ``replace("  ", " ")`` pass only halves each run, so three or
    more spaces in a row would still leave a double space behind. The
    replacement is therefore repeated until no double space remains.
    Leading/trailing spaces and other whitespace characters are left as is.
    """
    while "  " in column_name:
        column_name = column_name.replace("  ", " ")
    return column_name

def apply_column_naming_norms(column_name):
    """Run ``column_name`` through the column normalisers and return the result.

    The normalisers are applied in order: ``vitamins_norm`` first, then
    ``single_space_norm``.
    """
    normalized = column_name
    for normalize in (vitamins_norm, single_space_norm):
        normalized = normalize(normalized)
    return normalized

def apply_index_naming_norms(column_name):
    """Run a label through the index normalisers and return the result.

    Only ``single_space_norm`` is applied at the moment; the tuple keeps the
    same extensible shape as the column variant.
    """
    normalized = column_name
    for normalize in (single_space_norm,):
        normalized = normalize(normalized)
    return normalized
    
def sanitize(args):
    """Sanitize one dataframe and write it to ``path`` as CSV.

    Takes a single tuple so it can be mapped directly over a list with
    ``Pool.imap``.

    Parameters
    ----------
    args : tuple
        ``(path, df)`` where ``path`` is the destination CSV file and ``df``
        is the frame to clean. Index and column labels are expected to be
        strings, since they are lower-cased through the ``.str`` accessor.

    Returns
    -------
    None
        The cleaned frame is only written to disk.
    """
    path, df = args
    df = nan_sanitization(df)
    # Index labels go through the index norms and column labels through the
    # column norms (which include the vitamin expansion). These two calls
    # used to be swapped.
    df.index = [apply_index_naming_norms(label) for label in df.index.str.lower()]
    df.columns = [apply_column_naming_norms(label) for label in df.columns.str.lower()]
    df.index.name = "name"
    # Create the destination directory on demand; exist_ok keeps this safe
    # when several worker processes reach this point at the same time.
    target_dir = os.path.dirname(path)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)
    df.to_csv(path)

In [7]:
# Build the (destination path, dataframe) pairs consumed by `sanitize`:
# every non-hidden file found while walking ../csv/ is read with its "name"
# column as index.
# NOTE(review): the destination keeps only the file name, so two files with
# the same name in different sub-directories of ../csv/ would overwrite each
# other in ../sanitized_csv/.
dataframes = [
    (
        os.path.join("../sanitized_csv", filename),
        pd.read_csv(os.path.join(directory, filename), index_col="name"),
    )
    for directory, _subdirs, filenames in os.walk("../csv/")
    for filename in filenames
    if not filename.startswith(".")
]

In [8]:
# Sanitize all loaded frames in parallel, one worker process per CPU core;
# the imap iterator is drained through tqdm so a progress bar is shown.
with Pool(processes=cpu_count()) as pool:
    pending = pool.imap(sanitize, dataframes)
    for _ in tqdm(pending, total=len(dataframes)):
        pass

HBox(children=(IntProgress(value=0, max=9), HTML(value='')))


