# Module 6 EDA Project - Coffee Quality Database

In [1]:
import numpy as np
import pandas as pd
import matplotlib
import seaborn

In [2]:
# Read the raw Arabica and Robusta rating exports into two separate DataFrames
arabica_data_raw, robusta_data_raw = (
    pd.read_csv('data/arabica_ratings_raw.csv'),
    pd.read_csv('data/robusta_ratings_raw.csv'),
)

In [None]:
# Show only the first Arabica record as a quick look at the raw export
arabica_data_raw.head(n=1)

In [None]:
# Drop columns where all values are NaN (columns with at least one value are kept)
arabica_data_drop_nan = arabica_data_raw.dropna(axis=1, how='all')
robusta_data_drop_nan = robusta_data_raw.dropna(axis=1, how='all')

# A cell only renders its final expression, so a bare `.head(3)` on the Arabica
# frame followed by another expression is never shown. Display both previews
# explicitly; `display` is provided by the IPython kernel without an import.
display(arabica_data_drop_nan.head(3), robusta_data_drop_nan.head(3))

In [None]:
# List the column headers of the Arabica frame after the all-NaN columns were dropped
list(arabica_data_drop_nan.columns)

In [None]:
# List the column headers of the Robusta frame after the all-NaN columns were dropped
list(robusta_data_drop_nan.columns)

In [None]:
# Robusta cupping-score column names and the Arabica names they are renamed to
robusta_column_mapping = {
    'Fragrance / Aroma': 'Aroma',
    'Salt / Acid': 'Acidity',  # Rename Salt / Acid to Acidity
    'Uniform Cup': 'Uniformity',
    'Bitter / Sweet': 'Sweetness'
}

# Rename the columns and add an all-NaN 'Body' column in one step, so the
# Robusta frame exposes every column name the Arabica frame is selected by.
# NOTE(review): the Robusta 'Mouthfeel' score is not mapped to anything here;
# confirm whether it should populate 'Body' instead of leaving it NaN.
robusta_data_drop_nan = (robusta_data_drop_nan
                         .rename(columns=robusta_column_mapping)
                         .assign(Body=np.nan))

# Preview a few rows rather than rendering the whole DataFrame
robusta_data_drop_nan.head()

In [17]:
# Columns selected from both the Arabica and Robusta frames for the cleaned
# DataFrame. The order is kept exactly as is, since it sets the column order of
# the selection.
columns_to_keep = [
    # Overall score, species and origin / ownership details
    'quality_score', 'Species', 'Owner', 'Country of Origin', 'Farm Name',
    'Lot Number', 'Mill', 'Company', 'Altitude', 'Region', 'Producer',
    # Lot size, grading and processing details
    'Number of Bags', 'Bag Weight', 'In-Country Partner', 'Harvest Year',
    'Grading Date', 'Owner.1', 'Variety', 'Processing Method',
    # Cupping scores
    'Aroma', 'Flavor', 'Aftertaste', 'Acidity', 'Body', 'Balance',
    'Uniformity', 'Clean Cup', 'Sweetness', 'Cupper Points',
    'Total Cup Points',
    # Moisture, defects and colour
    'Moisture', 'Category One Defects', 'Quakers', 'Color',
    'Category Two Defects',
    # Certification
    'Expiration', 'Certification Body',
]

In [None]:
# Restrict each species' frame to the shared column list
arabica_selected = arabica_data_drop_nan.loc[:, columns_to_keep]
robusta_selected = robusta_data_drop_nan.loc[:, columns_to_keep]

# Stack Arabica on top of Robusta row-wise and renumber the index from 0
combined_coffee_data = pd.concat(
    [arabica_selected, robusta_selected],
    ignore_index=True,
)
combined_coffee_data.head()

In [None]:
# Filter the combined frame down to the Robusta rows and preview the first few
is_robusta = combined_coffee_data['Species'] == 'Robusta'
combined_coffee_data[is_robusta].head()

## Note:
While cleaning the column headers, I noticed that the Arabica and Robusta datasets use different names for their cupping-score columns.
The Arabica score columns were:
`Aroma`, `Flavor`, `Aftertaste`, `Acidity`, `Body`, `Balance`, `Uniformity`, `Clean Cup`, `Sweetness`, `Cupper Points`

and the Robusta score columns were:
`Fragrance / Aroma`, `Flavor`, `Aftertaste`, `Salt / Acid`, `Bitter / Sweet`, `Mouthfeel`, `Uniform Cup`, `Clean Cup`, `Balance`, `Cupper Points`

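So I decided to rename four of the Robusta score columns to match their Arabica counterparts (`Fragrance / Aroma` → `Aroma`, `Salt / Acid` → `Acidity`, `Uniform Cup` → `Uniformity`, `Bitter / Sweet` → `Sweetness`) and to add an empty `Body` column to the Robusta data so that both datasets share the same set of columns. The Robusta `Mouthfeel` column is not carried into the combined dataset.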

In [None]:
# Column layout for the combined frame, grouped by theme. The groups, and the
# columns inside each group, are listed in the order they should appear.
column_groups = {
    'Coffee Details': [
        'Species', 'Variety', 'Country of Origin', 'Region', 'Altitude',
        'Processing Method', 'Harvest Year', 'Expiration',
    ],
    'General Information': [
        'Farm Name', 'Lot Number', 'Number of Bags', 'Bag Weight',
        'Producer', 'Mill', 'Owner', 'Company', 'In-Country Partner',
    ],
    'Defects and Quality': [
        'Moisture', 'Category One Defects', 'Category Two Defects',
        'Quakers', 'Color',
    ],
    'Cupping Scores': [
        'Aroma', 'Flavor', 'Aftertaste', 'Acidity', 'Body', 'Balance',
        'Uniformity', 'Clean Cup', 'Sweetness', 'Cupper Points',
        'Total Cup Points', 'quality_score', 'Grading Date',
        'Certification Body',
    ],
}

# Flatten the groups into the single ordered list of column names
desired_order = [column for group in column_groups.values() for column in group]

# Select the columns in the desired order.
# NOTE(review): 'Owner.1' is part of the earlier column selection but is not
# listed here, so it is left out of the reordered frame; confirm that is intended.
combined_coffee_data_reordered = combined_coffee_data.loc[:, desired_order]

# Lift pandas' column display limit (None = show every column); this is a
# global option and stays in effect for later cells.
pd.set_option('display.max_columns', None)

# Preview the first 25 rows of the reordered frame
combined_coffee_data_reordered.head(25)