In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# **Config**

In [2]:
# Apply seaborn's 'darkgrid' theme to every figure drawn after this cell.
sns.set_theme(style='darkgrid')
# Dataset label; presumably intended for plot titles -- not used in the cells shown here, confirm before removing.
title_data = 'Bulk'
# The "tab10" palette converted to a list of hex colour strings.
paleta = sns.color_palette("tab10").as_hex()

# **Functions**

In [3]:
def porcentaje_nulos_x_caracteristica(data:pd.DataFrame, **kwargs)->None:
  """Plot, for every column of ``data``, the proportion of null vs. non-null values.

  Each column becomes one horizontal bar, split by whether the value is null
  and normalised so the two parts fill the whole bar.

  Args:
    data: Frame whose missing values are inspected.
    **kwargs: Forwarded to the ``set`` method of the grid returned by
      ``sns.displot`` (for example ``title='...'``).

  Returns:
    None. The figure is produced as a side effect.
  """
  # Long format: one row per cell, with melt's default 'variable' (column name)
  # and 'value' (is-null flag) columns.
  null_flags = data.isnull().melt()
  grid = sns.displot(
      data=null_flags,
      y='variable',
      hue='value',
      multiple='fill',  # normalise every bar so it reads as a proportion
      aspect=2,
  )
  grid.set(**kwargs)
  
def null_features_per_record(data, figsize=(9,8), **kwargs):
  """Draw a heatmap of the null mask of ``data``, one row per feature.

  The boolean null mask is transposed before plotting, so the features
  (columns of ``data``) run along the y-axis and the records along the x-axis.

  Args:
    data: Frame whose missing values are inspected.
    figsize: Size passed to ``plt.figure`` for the new figure.
    **kwargs: Forwarded to the ``set`` method of the object returned by
      ``sns.heatmap`` (for example ``title='...'``).

  Returns:
    None. The figure is produced as a side effect.
  """
  # Open a fresh figure first; heatmap is called without an explicit ``ax``.
  plt.figure(figsize=figsize)
  null_mask_by_feature = data.isnull().transpose()
  heatmap_axes = sns.heatmap(data=null_mask_by_feature)
  heatmap_axes.set(**kwargs)

**Data**
---

In [4]:
# Google Drive "share" link of the bulk dataset CSV.
drive_share_link = 'https://drive.google.com/file/d/1WqFElWzLW9iGsnHfuDxZK9Wm9Y5ctK0I/view?usp=sharing'
# The file id is the second-to-last path segment of the share link.
drive_file_id = drive_share_link.split('/')[-2]
# Rebuild the link in the 'uc?id=' form that is handed to read_csv.
url_data_bulk = 'https://drive.google.com/uc?id=' + drive_file_id
data_bulk = pd.read_csv(url_data_bulk)

In [5]:
# Column names, non-null counts and dtypes: a first look at how sparse each column is.
data_bulk.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3129 entries, 0 to 3128
Data columns (total 16 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   key      3129 non-null   int64  
 1   Bulk 1   252 non-null    float64
 2   Bulk 2   22 non-null     float64
 3   Bulk 3   1298 non-null   float64
 4   Bulk 4   1014 non-null   float64
 5   Bulk 5   77 non-null     float64
 6   Bulk 6   576 non-null    float64
 7   Bulk 7   25 non-null     float64
 8   Bulk 8   1 non-null      float64
 9   Bulk 9   19 non-null     float64
 10  Bulk 10  176 non-null    float64
 11  Bulk 11  177 non-null    float64
 12  Bulk 12  2450 non-null   float64
 13  Bulk 13  18 non-null     float64
 14  Bulk 14  2806 non-null   float64
 15  Bulk 15  2248 non-null   float64
dtypes: float64(15), int64(1)
memory usage: 391.2 KB


In [6]:
# (number of rows, number of columns) of the loaded table.
data_bulk.shape

(3129, 16)

- -The bulk dataset contains 3129 batches (one row per batch)-

In [None]:
# Preview the first rows of the table.
data_bulk.head()

In [None]:
# Normalised bars: share of null vs. non-null values in each column.
porcentaje_nulos_x_caracteristica(data_bulk, title='Null values percentage')

- Hay un porcentaje grande de valores nulos en la mayoría de columnas, menos a los lotes. 
- -There is a large percentage of null values in most columns; only the batch key column ('key') has none-

- Falta la mayoría de los datos en las columnas Bulk 1, Bulk 2, Bulk 5, Bulk 7, Bulk 8, Bulk 9, Bulk 10, Bulk 11, Bulk 13, con un porcentaje superior al 90%.
- -Most of the data in columns Bulk 1, Bulk 2, Bulk 5, Bulk 7, Bulk 8, Bulk 9, Bulk 10, Bulk 11 and Bulk 13 are missing: more than 90% of their values are null.-

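- Faltan datos en las columnas Bulk 3, Bulk 4, Bulk 6 con un porcentaje entre 58% y 82% aproximadamente (Bulk 3 ≈ 58%, Bulk 4 ≈ 68%, Bulk 6 ≈ 82%).
- -Data are missing in columns Bulk 3, Bulk 4 and Bulk 6, with a percentage between roughly 58% and 82% (Bulk 3 ≈ 58%, Bulk 4 ≈ 68%, Bulk 6 ≈ 82%).-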

- Faltan datos en las columnas Bulk 12 y Bulk 15 con un porcentaje entre 21% y 28%.
- -There are missing data in columns Bulk 12 and Bulk 15 with a percentage between 21% - 28%-

- Bulk 14 tiene la menor cantidad de datos faltantes con un porcentaje del 10%.
- -Bulk 14 has the lowest amount of missing data with a percentage of 10%.-


In [None]:
# Heatmap of the null mask (features on the y-axis, records on the x-axis).
null_features_per_record(data_bulk, title='Null values for rows')

- Los lotes pueden tener más de un tipo de granel, no necesariamente los 15.
- -A batch can contain more than one type of bulk material, but not necessarily all 15-

In [None]:
# Count batch keys that repeat an earlier one; 0 means every 'key' is unique.
data_bulk['key'].duplicated().sum()

- -There are no duplicated batch keys (only the 'key' column was checked)-