In [1]:
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")

In [2]:
## read the file
arabica_coffee = pd.read_csv('https://raw.githubusercontent.com/Leroywrld/Monitoring-Coffee-Quality/main/arabica_data_cleaned.csv')

In [3]:
def see_shape(df:pd.DataFrame)->tuple:
    """this function returns a tuple containing the shape of the dataframe and
    the list of columns contained in the dataframe"""
    shape = df.shape
    cols_list = df.columns
    return shape, cols_list
see_shape(arabica_coffee)  

((1311, 44),
 Index(['Unnamed: 0', 'Species', 'Owner', 'Country.of.Origin', 'Farm.Name',
        'Lot.Number', 'Mill', 'ICO.Number', 'Company', 'Altitude', 'Region',
        'Producer', 'Number.of.Bags', 'Bag.Weight', 'In.Country.Partner',
        'Harvest.Year', 'Grading.Date', 'Owner.1', 'Variety',
        'Processing.Method', 'Aroma', 'Flavor', 'Aftertaste', 'Acidity', 'Body',
        'Balance', 'Uniformity', 'Clean.Cup', 'Sweetness', 'Cupper.Points',
        'Total.Cup.Points', 'Moisture', 'Category.One.Defects', 'Quakers',
        'Color', 'Category.Two.Defects', 'Expiration', 'Certification.Body',
        'Certification.Address', 'Certification.Contact', 'unit_of_measurement',
        'altitude_low_meters', 'altitude_high_meters', 'altitude_mean_meters'],
       dtype='object'))

In [4]:
## list of columns to be dropped
dropped_cols = ['Unnamed: 0', 'Species', 'Owner', 'Lot.Number', 'ICO.Number', 'Company', 'Altitude',
                'Producer', 'Bag.Weight', 'In.Country.Partner', 'Grading.Date', 'Owner.1',
                'Clean.Cup', 'Uniformity', 'Total.Cup.Points', 'Moisture', 'Category.One.Defects',
                'Quakers', 'Color', 'Category.Two.Defects', 'Expiration', 'Certification.Address',
                'Certification.Contact', 'unit_of_measurement', 'altitude_low_meters', 'altitude_high_meters']


def drop_cols(df:pd.DataFrame, cols:list)->pd.DataFrame:
    """this function drops unwanted columns from a dataframe and returns
    a dataframe with the remaining columns"""
    df.drop(cols, axis='columns', inplace=True)
    return df

In [5]:
arabica_coffee.isnull().sum()

Unnamed: 0                  0
Species                     0
Owner                       7
Country.of.Origin           1
Farm.Name                 356
Lot.Number               1041
Mill                      310
ICO.Number                146
Company                   209
Altitude                  223
Region                     57
Producer                  230
Number.of.Bags              0
Bag.Weight                  0
In.Country.Partner          0
Harvest.Year               47
Grading.Date                0
Owner.1                     7
Variety                   201
Processing.Method         152
Aroma                       0
Flavor                      0
Aftertaste                  0
Acidity                     0
Body                        0
Balance                     0
Uniformity                  0
Clean.Cup                   0
Sweetness                   0
Cupper.Points               0
Total.Cup.Points            0
Moisture                    0
Category.One.Defects        0
Quakers   

In [6]:
def drop_nulls(df:pd.DataFrame)->pd.DataFrame:
    """this function drops missing along the rows of
    the dataframe. It returns a dataframe free of missing values"""
    df.dropna(axis='rows', inplace=True)
    return df

In [7]:
def rename_cols(df:pd.DataFrame)->pd.DataFrame:
    df.columns = [ name.lower().replace('.', '_') for name in df.columns]
    return df   

In [8]:
def alter_text(df:pd.DataFrame)->pd.DataFrame:
  return df.assign(farm_name = lambda df:df['farm_name'].str.capitalize().astype('string'),
                   region = lambda df:df['region'].str.capitalize().astype('string'),
                   mill = lambda df:df['mill'].str.capitalize().str.replace('.', ' ').astype('string'),
                   country_of_origin = lambda df:df['country_of_origin'].str.replace('Tanzania, United Republic Of', 'Tanzania').astype('string'),
                   harvest_year = lambda df:df['harvest_year'].str.split('/', 1, expand=True)[0].astype('int16'),
                   variety = lambda df:df['variety'].astype('string'),
                   certification_body = lambda df:df['certification_body'].astype('string'),
                   processing_method = lambda df:df['processing_method'].astype('category')
                   )

In [9]:
def alter_numeric(df:pd.DataFrame)->pd.DataFrame:
  return df.assign(number_of_bags = lambda df:df['number_of_bags'].astype('int16'),
                   aroma = lambda df:df['aroma'].astype('float16'),
                   flavor = lambda df:df['flavor'].astype('float16'),
                   aftertaste = lambda df:df['aftertaste'].astype('float16'),
                   acidity = lambda df:df['acidity'].astype('float16'),
                   body = lambda df:df['body'].astype('float16'),
                   balance = lambda df:df['balance'].astype('float16'),
                   sweetness = lambda df:df['sweetness'].astype('float16'),
                   cupper_points = lambda df:df['cupper_points'].astype('float16'),
                   altitude_mean_meters = lambda df:df['altitude_mean_meters'].astype('float16'))

In [None]:
def sort_reset()

In [10]:
arabica_coffee = arabica_coffee.pipe(drop_cols, dropped_cols).pipe(drop_nulls).pipe(rename_cols
                                                                                    ).pipe(alter_text).pipe(alter_numeric)

In [11]:
arabica_coffee.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 743 entries, 1 to 1309
Data columns (total 18 columns):
 #   Column                Non-Null Count  Dtype   
---  ------                --------------  -----   
 0   country_of_origin     743 non-null    string  
 1   farm_name             743 non-null    string  
 2   mill                  743 non-null    string  
 3   region                743 non-null    string  
 4   number_of_bags        743 non-null    int16   
 5   harvest_year          743 non-null    int16   
 6   variety               743 non-null    string  
 7   processing_method     743 non-null    category
 8   aroma                 743 non-null    float16 
 9   flavor                743 non-null    float16 
 10  aftertaste            743 non-null    float16 
 11  acidity               743 non-null    float16 
 12  body                  743 non-null    float16 
 13  balance               743 non-null    float16 
 14  sweetness             743 non-null    float16 
 15  cuppe

In [12]:
arabica_coffee

Unnamed: 0,country_of_origin,farm_name,mill,region,number_of_bags,harvest_year,variety,processing_method,aroma,flavor,aftertaste,acidity,body,balance,sweetness,cupper_points,certification_body,altitude_mean_meters
1,Ethiopia,Metad plc,Metad plc,Guji-hambela,300,2014,Other,Washed / Wet,8.750000,8.671875,8.500000,8.578125,8.421875,8.421875,10.000000,8.578125,METAD Agricultural Development plc,2076.0
4,Ethiopia,Metad plc,Metad plc,Guji-hambela,300,2014,Other,Washed / Wet,8.250000,8.500000,8.250000,8.500000,8.421875,8.328125,10.000000,8.578125,METAD Agricultural Development plc,2076.0
9,Ethiopia,Tulla coffee farm,Tulla coffee farm,"Snnp/kaffa zone,gimbowereda",50,2014,Other,Natural / Dry,8.078125,8.578125,8.500000,8.500000,7.671875,8.421875,10.000000,8.500000,METAD Agricultural Development plc,1822.0
18,China,Echo coffee,Echo coffee mill,Yunnan,3,2015,Catimor,Washed / Wet,8.421875,8.250000,8.078125,8.171875,7.921875,8.000000,10.000000,8.421875,Yunnan Coffee Exchange,1450.0
19,Ethiopia,Drima zede,Drima zede,Gedio,250,2014,Ethiopian Yirgacheffe,Natural / Dry,8.171875,8.171875,8.000000,8.171875,8.078125,8.328125,10.000000,8.328125,Blossom Valley International,1850.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1305,Honduras,Cerro bueno,Cadexsa,Marcala,275,2014,Catuai,Washed / Wet,7.000000,6.328125,6.171875,6.500000,6.671875,6.171875,8.000000,6.328125,Instituto Hondureño del Café,1450.0
1306,Mexico,El centenario,"La esperanza, municipio juchique de ferrer, ve...",Juchique de ferrer,12,2012,Bourbon,Washed / Wet,7.078125,6.828125,6.250000,7.421875,7.250000,6.750000,10.000000,6.750000,AMECAFE,900.0
1307,Haiti,200 farms,Coeb koperativ ekselsyo basen (350 members),"Department d'artibonite , haiti",1,2012,Typica,Natural / Dry,6.750000,6.578125,6.421875,6.671875,7.078125,6.671875,6.000000,6.421875,Specialty Coffee Association,350.0
1308,Nicaragua,Finca las marías,Beneficio atlantic condega,Jalapa,550,2016,Caturra,Other,7.250000,6.578125,6.328125,6.250000,6.421875,6.078125,6.000000,6.171875,Instituto Hondureño del Café,1100.0
