In [1]:
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Load the arabica dataset from CSV; the path is relative to the current working directory.
arabica_coffee = pd.read_csv('arabica_data_cleaned.csv')

In [None]:
def see_shape(df:pd.DataFrame)->tuple:
    """Summarise the layout of a dataframe.

    Returns a 2-tuple ``(shape, columns)`` where ``shape`` is ``df.shape``
    (number of rows, number of columns) and ``columns`` is ``df.columns``,
    the labels of the dataframe's columns.
    """
    return df.shape, df.columns

In [4]:
# Labels of the raw-data columns that are removed before analysis.
dropped_cols = [
    'Unnamed: 0',
    'Species',
    'Owner',
    'Lot.Number',
    'ICO.Number',
    'Company',
    'Altitude',
    'Producer',
    'Bag.Weight',
    'In.Country.Partner',
    'Grading.Date',
    'Owner.1',
    'Clean.Cup',
    'Uniformity',
    'Total.Cup.Points',
    'Moisture',
    'Category.One.Defects',
    'Quakers',
    'Color',
    'Category.Two.Defects',
    'Expiration',
    'Certification.Address',
    'Certification.Contact',
    'unit_of_measurement',
    'altitude_low_meters',
    'altitude_high_meters',
]


def drop_cols(df:pd.DataFrame, cols:list)->pd.DataFrame:
    """Return a new dataframe without the columns named in ``cols``.

    The input dataframe is not modified, so the caller keeps its original
    data and the function can be used safely inside a ``.pipe`` chain.

    Parameters
    ----------
    df : pd.DataFrame
        The dataframe to trim.
    cols : list
        Labels of the columns to remove.

    Returns
    -------
    pd.DataFrame
        A new dataframe holding only the remaining columns.

    Raises
    ------
    KeyError
        If any label in ``cols`` is not a column of ``df``.
    """
    # No ``inplace=True``: mutating the argument would silently change the
    # caller's frame as well as the returned one.
    return df.drop(columns=cols)

In [5]:
# Count the missing values in each column.
arabica_coffee.isna().sum()

Unnamed: 0                  0
Species                     0
Owner                       7
Country.of.Origin           1
Farm.Name                 356
Lot.Number               1041
Mill                      310
ICO.Number                146
Company                   209
Altitude                  223
Region                     57
Producer                  230
Number.of.Bags              0
Bag.Weight                  0
In.Country.Partner          0
Harvest.Year               47
Grading.Date                0
Owner.1                     7
Variety                   201
Processing.Method         152
Aroma                       0
Flavor                      0
Aftertaste                  0
Acidity                     0
Body                        0
Balance                     0
Uniformity                  0
Clean.Cup                   0
Sweetness                   0
Cupper.Points               0
Total.Cup.Points            0
Moisture                    0
Category.One.Defects        0
Quakers   

In [6]:
def drop_nulls(df:pd.DataFrame)->pd.DataFrame:
    """Return a new dataframe with every row containing a missing value removed.

    The input dataframe is not modified. Surviving rows keep their original
    index labels; the index is not reset.

    Parameters
    ----------
    df : pd.DataFrame
        The dataframe to filter.

    Returns
    -------
    pd.DataFrame
        A new dataframe free of missing values.
    """
    # No ``inplace=True``: the caller's frame must stay intact.
    return df.dropna(axis="index")

In [7]:
def rename_cols(df:pd.DataFrame)->pd.DataFrame:
    """Return a new dataframe whose column names are in snake_case form.

    Each column label is lower-cased and every ``.`` is replaced with ``_``
    (for example ``Country.of.Origin`` becomes ``country_of_origin``).
    The input dataframe is not modified.

    Parameters
    ----------
    df : pd.DataFrame
        The dataframe whose columns should be renamed; labels must be strings.

    Returns
    -------
    pd.DataFrame
        A new dataframe with the normalised column labels.
    """
    # ``rename`` builds a new frame; assigning to ``df.columns`` would
    # relabel the caller's dataframe as a side effect.
    return df.rename(columns=lambda name: name.lower().replace('.', '_'))

In [None]:
# Target dtype for each cleaned (lower-case, underscore-separated) column.
# Columns that are not listed keep the dtype pandas inferred on load.
type_dict = {
    # 'string' is the pandas string dtype; 'string_' is a NumPy alias for
    # bytes and is not accepted by NumPy 2.
    'country_of_origin': 'string',
    'farm_name': 'string',
    'mill': 'string',
    'region': 'string',
    'number_of_bags': 'int16',
    # NOTE(review): harvest_year is loaded as object, so it may hold
    # non-numeric text that 'int16' cannot parse -- confirm before casting.
    'harvest_year': 'int16',
    'variety': 'category',
    'processing_method': 'category',
    # NOTE(review): float16 keeps only about three significant decimal
    # digits -- confirm that is enough for the score columns.
    'aroma': 'float16',
}

def tweak_dtypes(df:pd.DataFrame, type_dict:dict)->pd.DataFrame:
    """Return a new dataframe with columns cast to the dtypes in ``type_dict``.

    Parameters
    ----------
    df : pd.DataFrame
        The dataframe to convert; it is not modified.
    type_dict : dict
        Mapping of column name to target dtype. Columns that are not
        listed keep their current dtype.

    Returns
    -------
    pd.DataFrame
        A new dataframe with the requested dtypes applied.

    Raises
    ------
    KeyError
        If a key of ``type_dict`` is not a column of ``df``.
    """
    # ``astype`` does not convert in place, so its result must be returned;
    # discarding it left callers with ``None``.
    return df.astype(type_dict)

In [8]:
# Clean the data in three chained steps: drop the unused columns, drop
# rows with missing values, then normalise the column names.
arabica_coffee = (
    arabica_coffee
    .pipe(drop_cols, dropped_cols)
    .pipe(drop_nulls)
    .pipe(rename_cols)
)

In [9]:
# Summarise the cleaned frame: number of entries, column dtypes and non-null counts.
arabica_coffee.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 743 entries, 1 to 1309
Data columns (total 18 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   country_of_origin     743 non-null    object 
 1   farm_name             743 non-null    object 
 2   mill                  743 non-null    object 
 3   region                743 non-null    object 
 4   number_of_bags        743 non-null    int64  
 5   harvest_year          743 non-null    object 
 6   variety               743 non-null    object 
 7   processing_method     743 non-null    object 
 8   aroma                 743 non-null    float64
 9   flavor                743 non-null    float64
 10  aftertaste            743 non-null    float64
 11  acidity               743 non-null    float64
 12  body                  743 non-null    float64
 13  balance               743 non-null    float64
 14  sweetness             743 non-null    float64
 15  cupper_points         