In [1]:
import pandas as pd
import numpy as np

# Visualization
# ------------------------------------------------------------------------------
import matplotlib.pyplot as plt
import seaborn as sns

# Configuration
# -----------------------------------------------------------------------
pd.set_option('display.max_columns', None) # to be able to display all columns of the DataFrames

In [2]:
# Read data/coffee.csv into the DataFrame `df` used by the cells below.
df = pd.read_csv("data/coffee.csv")

In [3]:
# Preview the first 20 rows of the loaded data.
df.head(20)

Unnamed: 0,REC_ID,Species,Continent.of.Origin,Country.of.Origin,Harvest.Year,Expiration,Variety,Color,Processing.Method,Aroma,Flavor,Aftertaste,Acidity,Body,Balance,Uniformity,Clean.Cup,Sweetness,Moisture,Quakers,Category.One.Defects,Category.Two.Defects
0,0,Arabica,Africa,Ethiopia,2014.0,04/03/16,,Green,Washed / Wet,8.67,8.83,8.67,8.75,8.5,8.42,10.0,10.0,10.0,0.12,0,0,0
1,1,Arabica,Africa,Ethiopia,2014.0,04/03/16,Other,Green,Washed / Wet,8.75,8.67,8.5,8.58,8.42,8.42,10.0,10.0,10.0,0.12,0,0,1
2,2,Arabica,North America,Guatemala,,05/31/11,Bourbon,,,8.42,8.5,8.42,8.42,8.33,8.42,10.0,10.0,10.0,0.0,0,0,0
3,3,Arabica,Africa,Ethiopia,2014.0,03/25/16,,Green,Natural / Dry,8.17,8.58,8.42,8.42,8.5,8.25,10.0,10.0,10.0,0.11,0,0,2
4,4,Arabica,Africa,Ethiopia,2014.0,04/03/16,Other,Green,Washed / Wet,8.25,8.5,8.25,8.5,8.42,8.33,10.0,10.0,10.0,0.12,0,0,2
5,5,Arabica,South America,Brazil,2013.0,09/03/14,,Bluish-Green,Natural / Dry,8.58,8.42,8.42,8.5,8.25,8.33,10.0,10.0,10.0,0.11,0,0,1
6,6,Arabica,South America,Peru,2012.0,09/17/13,Other,Bluish-Green,Washed / Wet,8.42,8.5,8.33,8.5,8.25,8.25,10.0,10.0,10.0,0.11,0,0,0
7,7,Arabica,Africa,Ethiopia,2010.0,09/02/11,,,,8.25,8.33,8.5,8.42,8.33,8.5,10.0,10.0,9.33,0.03,0,0,0
8,8,Arabica,Africa,Ethiopia,2010.0,09/02/11,,,,8.67,8.67,8.58,8.42,8.33,8.42,9.33,10.0,9.33,0.03,0,0,0
9,9,Arabica,Africa,Ethiopia,2014.0,03/29/16,Other,Green,Natural / Dry,8.08,8.58,8.5,8.5,7.67,8.42,10.0,10.0,10.0,0.1,0,0,4


In [4]:
# Number of rows and columns:
print(f"El número de filas que tenemos es {df.shape[0]}, y el número de columnas es {df.shape[1]}.")

El número de filas que tenemos es 1339, y el número de columnas es 22.


In [5]:
# Function to view the complete Dataframe information:
def get_csv_info(csv):
    """Print a structural summary of a DataFrame.

    Writes the output of ``DataFrame.info()`` followed by the column index
    to standard output, each framed by separator lines.

    Args:
        csv: The pandas DataFrame to inspect. The parameter name is kept for
            backward compatibility with existing callers; it is a DataFrame,
            not a file path.

    Returns:
        None. The function only prints.
    """
    separator = "-----------------------------------------------------------------------"

    print("CSV Information:\n")
    print(separator)
    # Use the argument rather than a notebook-level global so the function
    # reports on whichever frame the caller passes in.
    csv.info()
    print(separator)

    print("\nColumn Names:")
    print(separator)
    print(csv.columns)
    print(separator)

    return

# Function to view duplicate and null values in the Dataframe.
def check_duplicates_and_nulls(csv):
    """Print duplicate-row and null-value counts for a DataFrame.

    Args:
        csv: The pandas DataFrame to inspect. The parameter name is kept for
            backward compatibility with existing callers; it is a DataFrame,
            not a file path.

    Returns:
        None. The function only prints.
    """
    separator = "-----------------------------------------------------------------------"

    # Rows that are exact copies of an earlier row.
    duplicate_count = csv.duplicated().sum()
    print(f"Number of duplicate rows: {duplicate_count}")
    print(separator)

    # Compute the per-column null counts once and reuse them for the total.
    null_counts = csv.isna().sum()
    print("\nNumber of null values per column:")
    print(separator)
    print(null_counts)
    print(separator)
    print(f"This makes a total of {null_counts.sum()} null values.")

    return

# Function to view the descriptions of the numerical and categorical columns of the Dataframe.
def describe_dataframe(csv):
    """Print transposed ``describe()`` tables for a DataFrame.

    Two tables are printed: the default ``describe()`` output, and the
    ``describe(include=['object'])`` output, each transposed so that every
    described column becomes a row.

    Args:
        csv: The pandas DataFrame to describe. The parameter name is kept for
            backward compatibility with existing callers; it is a DataFrame,
            not a file path.

    Returns:
        None. The function only prints.
    """
    separator = "-----------------------------------------------------------------------"

    # Default describe() call.
    print("Numerical Columns Description:\n")
    print(separator)
    print(csv.describe().T)
    print(separator)

    # Restrict describe() to object-dtype columns.
    print("\nCategorical Columns Description:\n")
    print(separator)
    print(csv.describe(include=['object']).T)
    print(separator)

    return

# Function to analyze the categorical columns of the dataframe.
def analyze_categorical_columns(csv):
    """Print unique values and value counts for each object-dtype column.

    Args:
        csv: The pandas DataFrame to analyze. The parameter name is kept for
            backward compatibility with existing callers; it is a DataFrame,
            not a file path. Column labels are upper-cased for the section
            headers, so they need to be strings.

    Returns:
        None. The function only prints.
    """
    # Keep only the object-dtype columns of the frame that was passed in.
    df_cat = csv.select_dtypes(include="object")

    categorical_columns = df_cat.columns
    print(f"The categorical columns in the DataFrame are:\n {categorical_columns}")

    # One section per column: distinct values first, then their frequencies.
    for column in categorical_columns:
        print(f"\n----------- ANALYZING THE COLUMN: '{column.upper()}' -----------\n")
        print(f"Unique values: {df_cat[column].unique()}\n")
        print(f"Frequencies of unique values:\n{df_cat[column].value_counts()}\n")
        print("-----------------------------------------------------------------------")

# Function to find the rows that have ALL nulls of the dataframe.
def find_all_null_rows(csv):
    """Print the rows of a DataFrame in which every column is null.

    Args:
        csv: The pandas DataFrame to inspect. The parameter name is kept for
            backward compatibility with existing callers; it is a DataFrame,
            not a file path.

    Returns:
        None. The function only prints the matching rows and their index
        labels.
    """
    separator = "-----------------------------------------------------------------------"

    # Boolean mask over rows: True only where every column is null.
    all_null_rows = csv[csv.isnull().all(axis=1)]

    print("Rows with all null values:\n")
    print(separator)
    print(all_null_rows)
    print(separator)

    # Index labels of those rows, as a plain list.
    all_null_indices = all_null_rows.index
    print("\nIndices of rows with all null values:\n", all_null_indices.tolist())

    return

# Function to find negative numbers in columns of the dataframe.
def find_negative_values(csv):
    """Print the numeric columns of a DataFrame that contain negative values.

    Args:
        csv: The pandas DataFrame to inspect. The parameter name is kept for
            backward compatibility with existing callers; it is a DataFrame,
            not a file path.

    Returns:
        None. The function only prints.
    """
    # Only numeric columns can be compared against zero.
    num_cols_df = csv.select_dtypes(include=["number"])

    # Names of the numeric columns holding at least one value below zero.
    negative_columns = [
        col for col in num_cols_df.columns if (num_cols_df[col] < 0).any()
    ]
    print("Columns with negative values:", negative_columns)

    # Hard-coded extra count for a column named 'adr', kept for backward
    # compatibility. NOTE(review): 'adr' is not among the columns of the
    # dataset shown in this notebook; presumably a leftover from another
    # analysis -- confirm whether it can be removed.
    if 'adr' in csv.columns:
        negative_count = (csv['adr'] < 0).sum()
        print(f"Number of negative values in 'adr': {negative_count}")

    return


def find_zero_values(csv):
    """Print the numeric columns of a DataFrame that contain zeros.

    For every such column the number of zeros and the column dtype are
    printed as well.

    Args:
        csv: The pandas DataFrame to inspect. The parameter name is kept for
            backward compatibility with existing callers; it is a DataFrame,
            not a file path.

    Returns:
        None. The function only prints.
    """
    separator = "-----------------------------------------------------------------------"

    # Only numeric columns are checked for zeros.
    num_cols_df = csv.select_dtypes(include=["number"])

    # Names of the numeric columns holding at least one zero.
    zero_columns = [
        col for col in num_cols_df.columns if (num_cols_df[col] == 0).any()
    ]
    print("Columns with zero values:", zero_columns)
    print(separator)

    # Per-column detail, read from the same numeric selection used above.
    for col in zero_columns:
        column_values = num_cols_df[col]
        dtype = column_values.dtype
        zero_count = (column_values == 0).sum()
        print(f"Number of zero values in '{col}':\n {zero_count}\n")
        print(f"The type of the value is: {dtype}")
        print(separator)

    return

In [7]:
# Print the DataFrame.info() summary and the column index.
get_csv_info(df)

CSV Information:

-----------------------------------------------------------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1339 entries, 0 to 1338
Data columns (total 22 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   REC_ID                1339 non-null   int64  
 1   Species               1339 non-null   object 
 2   Continent.of.Origin   1338 non-null   object 
 3   Country.of.Origin     1338 non-null   object 
 4   Harvest.Year          1279 non-null   float64
 5   Expiration            1339 non-null   object 
 6   Variety               1113 non-null   object 
 7   Color                 1069 non-null   object 
 8   Processing.Method     1169 non-null   object 
 9   Aroma                 1339 non-null   float64
 10  Flavor                1339 non-null   float64
 11  Aftertaste            1339 non-null   float64
 12  Acidity               1339 non-null   float64
 13  Body                  1339 non-nu

In [9]:
# Report the duplicate-row count and the null count per column.
# NOTE(review): the saved output of this cell does not list REC_ID, although the
# info() output above does -- df appears to have been changed by a cell that is
# not shown here. Confirm the notebook still works with Restart & Run All.
check_duplicates_and_nulls(df)

Number of duplicate rows: 1
-----------------------------------------------------------------------

Number of null values per column:
-----------------------------------------------------------------------
Species                   0
Continent.of.Origin       1
Country.of.Origin         1
Harvest.Year             60
Expiration                0
Variety                 226
Color                   270
Processing.Method       170
Aroma                     0
Flavor                    0
Aftertaste                0
Acidity                   0
Body                      0
Balance                   0
Uniformity                0
Clean.Cup                 0
Sweetness                 0
Moisture                  0
Quakers                   0
Category.One.Defects      0
Category.Two.Defects      0
dtype: int64
-----------------------------------------------------------------------
This makes a tota of 728 null values.


In [25]:
# Print the describe() tables (default and object-dtype columns).
describe_dataframe(df)

Numerical Columns Description:

-----------------------------------------------------------------------
                       count         mean       std     min      25%      50%  \
Harvest.Year          1279.0  2013.567631  1.808496  2009.0  2012.00  2013.00   
Aroma                 1339.0     7.566706  0.377560     0.0     7.42     7.58   
Flavor                1339.0     7.520426  0.398442     0.0     7.33     7.58   
Aftertaste            1339.0     7.401083  0.404463     0.0     7.25     7.42   
Acidity               1339.0     7.535706  0.379827     0.0     7.33     7.58   
Body                  1339.0     7.517498  0.370064     0.0     7.33     7.50   
Balance               1339.0     7.518013  0.408943     0.0     7.33     7.50   
Uniformity            1339.0     9.834877  0.554591     0.0    10.00    10.00   
Clean.Cup             1339.0     9.835108  0.763946     0.0    10.00    10.00   
Sweetness             1339.0     9.856692  0.616102     0.0    10.00    10.00   
Moist

In [26]:
# List unique values and their frequencies for each object-dtype column.
analyze_categorical_columns(df)

The categorical columns in the DataFrame are:
 Index(['Species', 'Continent.of.Origin', 'Country.of.Origin', 'Expiration',
       'Variety', 'Color', 'Processing.Method'],
      dtype='object')

----------- ANALYZING THE COLUMN: 'SPECIES' -----------

Unique values: ['Arabica' 'Robusta']

Frequencies of unique values:
Species
Arabica    1311
Robusta      28
Name: count, dtype: int64

-----------------------------------------------------------------------

----------- ANALYZING THE COLUMN: 'CONTINENT.OF.ORIGIN' -----------

Unique values: ['Africa' 'North America' 'South America' 'Asia' 'Oceania' nan]

Frequencies of unique values:
Continent.of.Origin
North America    665
South America    328
Asia             182
Africa           162
Oceania            1
Name: count, dtype: int64

-----------------------------------------------------------------------

----------- ANALYZING THE COLUMN: 'COUNTRY.OF.ORIGIN' -----------

Unique values: ['Ethiopia' 'Guatemala' 'Brazil' 'Peru' 'United States

In [27]:
# Look for rows in which every column is null.
find_all_null_rows(df)

Rows with all null values:

-----------------------------------------------------------------------
Empty DataFrame
Columns: [Species, Continent.of.Origin, Country.of.Origin, Harvest.Year, Expiration, Variety, Color, Processing.Method, Aroma, Flavor, Aftertaste, Acidity, Body, Balance, Uniformity, Clean.Cup, Sweetness, Moisture, Quakers, Category.One.Defects, Category.Two.Defects]
Index: []
-----------------------------------------------------------------------

Indices of rows with all null values:
 []


In [28]:
# List the numeric columns that contain any negative value.
find_negative_values(df)

Columns with negative values: []


In [29]:
# List the numeric columns that contain zeros, with a count per column.
find_zero_values(df)

Columns with zero values: ['Aroma', 'Flavor', 'Aftertaste', 'Acidity', 'Body', 'Balance', 'Uniformity', 'Clean.Cup', 'Sweetness', 'Moisture', 'Quakers', 'Category.One.Defects', 'Category.Two.Defects']
-----------------------------------------------------------------------
Number of zero values in 'Aroma':
 1

The type of the value is: float64
-----------------------------------------------------------------------
Number of zero values in 'Flavor':
 1

The type of the value is: float64
-----------------------------------------------------------------------
Number of zero values in 'Aftertaste':
 1

The type of the value is: float64
-----------------------------------------------------------------------
Number of zero values in 'Acidity':
 1

The type of the value is: float64
-----------------------------------------------------------------------
Number of zero values in 'Body':
 1

The type of the value is: float64
-----------------------------------------------------------------------
