# Prepare the data for analysis

In [5]:
##This notebook includes acidity_structure column which reduces the number of rows 

In [6]:
# Import Modules
import pandas as pd
import numpy as np 
from pathlib import Path
import re
from datetime import date
from easy_exchange_rates import API

# Display all of the columns
pd.set_option('display.max_columns', None)

In [7]:
# Read the CSV file into a Pandas DataFrame
#read the CSV == source - https://www.kaggle.com/datasets/patkle/coffeereviewcom-over-7000-ratings-and-reviews
csv_data = "reviews_feb_2023.csv"
coffee_df = pd.read_csv(csv_data)
coffee_df.head()

Unnamed: 0,title,rating,acidity_structure,aftertaste,aroma,body,flavor,with_milk,agtron,blind_assessment,bottom_line,coffee_origin,est_price,notes,review_date,roast_level,roaster,roaster_location,url
0,Bolivia Manantial Gesha,93,9.0,8.0,9,8,9,,60/78,"Richly aromatic, floral-toned. Magnolia, cocoa...",This washed Boliva Gesha has all the aromatics...,"Caranavi, Bolivia",$30.00/12 ounces,"Produced by Angel Mamani Chambi, entirely of t...",Jan-23,Medium-Light,Red Rooster Coffee Roaster,"Floyd, Virginia",https://www.coffeereview.com/review/bolivia-ma...
1,Yellow Pacamara Carbonic Maceration Nanolot,92,8.0,8.0,9,8,9,,60/78,"Crisply sweet-savory. White peach, hop flowers...",A carbonic-macerated Yellow Pacamara grown in ...,"Matagalpa growing region, Nicaragua",$160.00/12 ounces,Produced by Benjamin Weiner at Finca Idealista...,Dec-22,Medium-Light,Eccentricity Coffee Co.,"Cleveland, Ohio",https://www.coffeereview.com/review/brix-break...
2,Ethiopia Gera Genji Challa,94,9.0,8.0,9,9,9,,59/77,"Delicately aromatic, complex. Lilac, cocoa nib...","A nuanced, multilayered washed Ethiopia cup wi...","Agaro Gera, Jimma Zone, Oromia State, Ethiopia",$28.00/12 ounces,Ethiopia coffees like this one are largely pro...,Dec-22,Medium-Light,Mostra Coffee,"San Diego, California",https://www.coffeereview.com/review/ethiopia-g...
3,Yirgacheffe Mengesha Natural,94,9.0,8.0,9,9,9,,60/77,"High-toned, fruit-driven. Boysenberry, pear, c...",A fruit medley in a cup — think boysenberry an...,"Yirgacheffe growing region, southern Ethiopia",$20.50/12 ounces,Produced at Mengesha Farm from selections of i...,Nov-22,Medium-Light,Regent Coffee,"Glendale, California",https://www.coffeereview.com/review/yirgacheff...
4,Tropical Summer Colombia La Sierra,93,9.0,8.0,9,8,9,,60/77,"Fruit-driven, crisply chocolaty. Goji berry, d...","An experimentally processed Colombia, sweetly ...","La Sierra, Cauca Department, Colombia",$18.99/8 ounces,Produced by smallholding farmers from trees of...,Nov-22,Medium-Light,Merge Coffee Company,"Harrisonburg, Virginia",https://www.coffeereview.com/review/tropical-s...


## Clean the dataset
- Drop duplicates, null values, and unnecassary columns
- Split the location column

In [8]:
#drop the unneccessary columns 'with_milk','agtron','roaster','url' and high null value cols which are 'acidity_structure', 'bottom_line'
coffee1_df = coffee_df.drop(columns=['with_milk','agtron','roaster','url','bottom_line', "review_date"])
coffee1_df.head()
coffee1_df.index  

RangeIndex(start=0, stop=7041, step=1)

In [9]:
#check for null values
coffee1_df.isnull().sum()

title                   0
rating                  0
acidity_structure    4875
aftertaste            872
aroma                  50
body                   11
flavor                 16
blind_assessment        1
coffee_origin         505
est_price            2039
notes                   8
roast_level           374
roaster_location        3
dtype: int64

In [10]:
#removed columns and null values to show 2125 rows × 13 columns with no null values
coffee2_df = coffee1_df.dropna(subset=['acidity_structure','aftertaste', 'aroma', 'body', 'flavor', 'blind_assessment',
                                       'coffee_origin', 'est_price', 'notes', 'roast_level', 'roaster_location'])
coffee2_df = coffee2_df.drop_duplicates()

coffee2_df.head()

Unnamed: 0,title,rating,acidity_structure,aftertaste,aroma,body,flavor,blind_assessment,coffee_origin,est_price,notes,roast_level,roaster_location
0,Bolivia Manantial Gesha,93,9.0,8.0,9,8,9,"Richly aromatic, floral-toned. Magnolia, cocoa...","Caranavi, Bolivia",$30.00/12 ounces,"Produced by Angel Mamani Chambi, entirely of t...",Medium-Light,"Floyd, Virginia"
1,Yellow Pacamara Carbonic Maceration Nanolot,92,8.0,8.0,9,8,9,"Crisply sweet-savory. White peach, hop flowers...","Matagalpa growing region, Nicaragua",$160.00/12 ounces,Produced by Benjamin Weiner at Finca Idealista...,Medium-Light,"Cleveland, Ohio"
2,Ethiopia Gera Genji Challa,94,9.0,8.0,9,9,9,"Delicately aromatic, complex. Lilac, cocoa nib...","Agaro Gera, Jimma Zone, Oromia State, Ethiopia",$28.00/12 ounces,Ethiopia coffees like this one are largely pro...,Medium-Light,"San Diego, California"
3,Yirgacheffe Mengesha Natural,94,9.0,8.0,9,9,9,"High-toned, fruit-driven. Boysenberry, pear, c...","Yirgacheffe growing region, southern Ethiopia",$20.50/12 ounces,Produced at Mengesha Farm from selections of i...,Medium-Light,"Glendale, California"
4,Tropical Summer Colombia La Sierra,93,9.0,8.0,9,8,9,"Fruit-driven, crisply chocolaty. Goji berry, d...","La Sierra, Cauca Department, Colombia",$18.99/8 ounces,Produced by smallholding farmers from trees of...,Medium-Light,"Harrisonburg, Virginia"


In [11]:
#split roaster location
coffee2_df[['city', 'state']] = coffee2_df['roaster_location'].str.split(',', n=1, expand=True)
coffee3_df = coffee2_df.drop(columns=['roaster_location'])


# coffee3_df.reset_index().head()
coffee3_df.head()


Unnamed: 0,title,rating,acidity_structure,aftertaste,aroma,body,flavor,blind_assessment,coffee_origin,est_price,notes,roast_level,city,state
0,Bolivia Manantial Gesha,93,9.0,8.0,9,8,9,"Richly aromatic, floral-toned. Magnolia, cocoa...","Caranavi, Bolivia",$30.00/12 ounces,"Produced by Angel Mamani Chambi, entirely of t...",Medium-Light,Floyd,Virginia
1,Yellow Pacamara Carbonic Maceration Nanolot,92,8.0,8.0,9,8,9,"Crisply sweet-savory. White peach, hop flowers...","Matagalpa growing region, Nicaragua",$160.00/12 ounces,Produced by Benjamin Weiner at Finca Idealista...,Medium-Light,Cleveland,Ohio
2,Ethiopia Gera Genji Challa,94,9.0,8.0,9,9,9,"Delicately aromatic, complex. Lilac, cocoa nib...","Agaro Gera, Jimma Zone, Oromia State, Ethiopia",$28.00/12 ounces,Ethiopia coffees like this one are largely pro...,Medium-Light,San Diego,California
3,Yirgacheffe Mengesha Natural,94,9.0,8.0,9,9,9,"High-toned, fruit-driven. Boysenberry, pear, c...","Yirgacheffe growing region, southern Ethiopia",$20.50/12 ounces,Produced at Mengesha Farm from selections of i...,Medium-Light,Glendale,California
4,Tropical Summer Colombia La Sierra,93,9.0,8.0,9,8,9,"Fruit-driven, crisply chocolaty. Goji berry, d...","La Sierra, Cauca Department, Colombia",$18.99/8 ounces,Produced by smallholding farmers from trees of...,Medium-Light,Harrisonburg,Virginia


In [12]:
#check for datatypes for any null values and datatypes
display(coffee3_df.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2125 entries, 0 to 2719
Data columns (total 14 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   title              2125 non-null   object 
 1   rating             2125 non-null   object 
 2   acidity_structure  2125 non-null   float64
 3   aftertaste         2125 non-null   float64
 4   aroma              2125 non-null   object 
 5   body               2125 non-null   object 
 6   flavor             2125 non-null   object 
 7   blind_assessment   2125 non-null   object 
 8   coffee_origin      2125 non-null   object 
 9   est_price          2125 non-null   object 
 10  notes              2125 non-null   object 
 11  roast_level        2125 non-null   object 
 12  city               2125 non-null   object 
 13  state              2123 non-null   object 
dtypes: float64(2), object(12)
memory usage: 249.0+ KB


None

In [13]:
# drop additional null values after split
coffee3_df = coffee3_df.dropna(subset=['state'])

#change the datatype 
coffee4_df = coffee3_df.astype({"rating":float, "acidity_structure": float, "aftertaste":float ,"aroma":float, "body":float, "flavor":float, "blind_assessment": "string", "coffee_origin": "string", 
                               "notes": "string", "roast_level": "string", "city": "string", "state": "string"})
#check for datatypes for any null values and datatypes
display(coffee4_df.info())


<class 'pandas.core.frame.DataFrame'>
Int64Index: 2123 entries, 0 to 2719
Data columns (total 14 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   title              2123 non-null   object 
 1   rating             2123 non-null   float64
 2   acidity_structure  2123 non-null   float64
 3   aftertaste         2123 non-null   float64
 4   aroma              2123 non-null   float64
 5   body               2123 non-null   float64
 6   flavor             2123 non-null   float64
 7   blind_assessment   2123 non-null   string 
 8   coffee_origin      2123 non-null   string 
 9   est_price          2123 non-null   object 
 10  notes              2123 non-null   string 
 11  roast_level        2123 non-null   string 
 12  city               2123 non-null   string 
 13  state              2123 non-null   string 
dtypes: float64(6), object(2), string(6)
memory usage: 248.8+ KB


None

## Prepare the est_price column for standardization

In [14]:
# clean up the currency columns 
money_df = coffee4_df.copy()
money_df.head()

Unnamed: 0,title,rating,acidity_structure,aftertaste,aroma,body,flavor,blind_assessment,coffee_origin,est_price,notes,roast_level,city,state
0,Bolivia Manantial Gesha,93.0,9.0,8.0,9.0,8.0,9.0,"Richly aromatic, floral-toned. Magnolia, cocoa...","Caranavi, Bolivia",$30.00/12 ounces,"Produced by Angel Mamani Chambi, entirely of t...",Medium-Light,Floyd,Virginia
1,Yellow Pacamara Carbonic Maceration Nanolot,92.0,8.0,8.0,9.0,8.0,9.0,"Crisply sweet-savory. White peach, hop flowers...","Matagalpa growing region, Nicaragua",$160.00/12 ounces,Produced by Benjamin Weiner at Finca Idealista...,Medium-Light,Cleveland,Ohio
2,Ethiopia Gera Genji Challa,94.0,9.0,8.0,9.0,9.0,9.0,"Delicately aromatic, complex. Lilac, cocoa nib...","Agaro Gera, Jimma Zone, Oromia State, Ethiopia",$28.00/12 ounces,Ethiopia coffees like this one are largely pro...,Medium-Light,San Diego,California
3,Yirgacheffe Mengesha Natural,94.0,9.0,8.0,9.0,9.0,9.0,"High-toned, fruit-driven. Boysenberry, pear, c...","Yirgacheffe growing region, southern Ethiopia",$20.50/12 ounces,Produced at Mengesha Farm from selections of i...,Medium-Light,Glendale,California
4,Tropical Summer Colombia La Sierra,93.0,9.0,8.0,9.0,8.0,9.0,"Fruit-driven, crisply chocolaty. Goji berry, d...","La Sierra, Cauca Department, Colombia",$18.99/8 ounces,Produced by smallholding farmers from trees of...,Medium-Light,Harrisonburg,Virginia


In [15]:
# split data by ; first to get the first price given
money_df[["dollars", "beans"]]  = money_df['est_price'].str.split(';', expand=True)

# drop excess prices
money_df = money_df.drop(columns = ["beans"])

money_df.head()

Unnamed: 0,title,rating,acidity_structure,aftertaste,aroma,body,flavor,blind_assessment,coffee_origin,est_price,notes,roast_level,city,state,dollars
0,Bolivia Manantial Gesha,93.0,9.0,8.0,9.0,8.0,9.0,"Richly aromatic, floral-toned. Magnolia, cocoa...","Caranavi, Bolivia",$30.00/12 ounces,"Produced by Angel Mamani Chambi, entirely of t...",Medium-Light,Floyd,Virginia,$30.00/12 ounces
1,Yellow Pacamara Carbonic Maceration Nanolot,92.0,8.0,8.0,9.0,8.0,9.0,"Crisply sweet-savory. White peach, hop flowers...","Matagalpa growing region, Nicaragua",$160.00/12 ounces,Produced by Benjamin Weiner at Finca Idealista...,Medium-Light,Cleveland,Ohio,$160.00/12 ounces
2,Ethiopia Gera Genji Challa,94.0,9.0,8.0,9.0,9.0,9.0,"Delicately aromatic, complex. Lilac, cocoa nib...","Agaro Gera, Jimma Zone, Oromia State, Ethiopia",$28.00/12 ounces,Ethiopia coffees like this one are largely pro...,Medium-Light,San Diego,California,$28.00/12 ounces
3,Yirgacheffe Mengesha Natural,94.0,9.0,8.0,9.0,9.0,9.0,"High-toned, fruit-driven. Boysenberry, pear, c...","Yirgacheffe growing region, southern Ethiopia",$20.50/12 ounces,Produced at Mengesha Farm from selections of i...,Medium-Light,Glendale,California,$20.50/12 ounces
4,Tropical Summer Colombia La Sierra,93.0,9.0,8.0,9.0,8.0,9.0,"Fruit-driven, crisply chocolaty. Goji berry, d...","La Sierra, Cauca Department, Colombia",$18.99/8 ounces,Produced by smallholding farmers from trees of...,Medium-Light,Harrisonburg,Virginia,$18.99/8 ounces


In [16]:
# split data by () next
money_df[["keep", "extra"]] = money_df["dollars"].str.split('(', expand=True)

# drop additional info provided in ()
money_df = money_df.drop(columns = ["extra"])
money_df.head()

Unnamed: 0,title,rating,acidity_structure,aftertaste,aroma,body,flavor,blind_assessment,coffee_origin,est_price,notes,roast_level,city,state,dollars,keep
0,Bolivia Manantial Gesha,93.0,9.0,8.0,9.0,8.0,9.0,"Richly aromatic, floral-toned. Magnolia, cocoa...","Caranavi, Bolivia",$30.00/12 ounces,"Produced by Angel Mamani Chambi, entirely of t...",Medium-Light,Floyd,Virginia,$30.00/12 ounces,$30.00/12 ounces
1,Yellow Pacamara Carbonic Maceration Nanolot,92.0,8.0,8.0,9.0,8.0,9.0,"Crisply sweet-savory. White peach, hop flowers...","Matagalpa growing region, Nicaragua",$160.00/12 ounces,Produced by Benjamin Weiner at Finca Idealista...,Medium-Light,Cleveland,Ohio,$160.00/12 ounces,$160.00/12 ounces
2,Ethiopia Gera Genji Challa,94.0,9.0,8.0,9.0,9.0,9.0,"Delicately aromatic, complex. Lilac, cocoa nib...","Agaro Gera, Jimma Zone, Oromia State, Ethiopia",$28.00/12 ounces,Ethiopia coffees like this one are largely pro...,Medium-Light,San Diego,California,$28.00/12 ounces,$28.00/12 ounces
3,Yirgacheffe Mengesha Natural,94.0,9.0,8.0,9.0,9.0,9.0,"High-toned, fruit-driven. Boysenberry, pear, c...","Yirgacheffe growing region, southern Ethiopia",$20.50/12 ounces,Produced at Mengesha Farm from selections of i...,Medium-Light,Glendale,California,$20.50/12 ounces,$20.50/12 ounces
4,Tropical Summer Colombia La Sierra,93.0,9.0,8.0,9.0,8.0,9.0,"Fruit-driven, crisply chocolaty. Goji berry, d...","La Sierra, Cauca Department, Colombia",$18.99/8 ounces,Produced by smallholding farmers from trees of...,Medium-Light,Harrisonburg,Virginia,$18.99/8 ounces,$18.99/8 ounces


In [17]:
# split data by / to seperate price and units
money_df[["price", "units"]] = money_df["keep"].str.split('/', expand=True)
money_df.head()

Unnamed: 0,title,rating,acidity_structure,aftertaste,aroma,body,flavor,blind_assessment,coffee_origin,est_price,notes,roast_level,city,state,dollars,keep,price,units
0,Bolivia Manantial Gesha,93.0,9.0,8.0,9.0,8.0,9.0,"Richly aromatic, floral-toned. Magnolia, cocoa...","Caranavi, Bolivia",$30.00/12 ounces,"Produced by Angel Mamani Chambi, entirely of t...",Medium-Light,Floyd,Virginia,$30.00/12 ounces,$30.00/12 ounces,$30.00,12 ounces
1,Yellow Pacamara Carbonic Maceration Nanolot,92.0,8.0,8.0,9.0,8.0,9.0,"Crisply sweet-savory. White peach, hop flowers...","Matagalpa growing region, Nicaragua",$160.00/12 ounces,Produced by Benjamin Weiner at Finca Idealista...,Medium-Light,Cleveland,Ohio,$160.00/12 ounces,$160.00/12 ounces,$160.00,12 ounces
2,Ethiopia Gera Genji Challa,94.0,9.0,8.0,9.0,9.0,9.0,"Delicately aromatic, complex. Lilac, cocoa nib...","Agaro Gera, Jimma Zone, Oromia State, Ethiopia",$28.00/12 ounces,Ethiopia coffees like this one are largely pro...,Medium-Light,San Diego,California,$28.00/12 ounces,$28.00/12 ounces,$28.00,12 ounces
3,Yirgacheffe Mengesha Natural,94.0,9.0,8.0,9.0,9.0,9.0,"High-toned, fruit-driven. Boysenberry, pear, c...","Yirgacheffe growing region, southern Ethiopia",$20.50/12 ounces,Produced at Mengesha Farm from selections of i...,Medium-Light,Glendale,California,$20.50/12 ounces,$20.50/12 ounces,$20.50,12 ounces
4,Tropical Summer Colombia La Sierra,93.0,9.0,8.0,9.0,8.0,9.0,"Fruit-driven, crisply chocolaty. Goji berry, d...","La Sierra, Cauca Department, Colombia",$18.99/8 ounces,Produced by smallholding farmers from trees of...,Medium-Light,Harrisonburg,Virginia,$18.99/8 ounces,$18.99/8 ounces,$18.99,8 ounces


In [18]:
# drop excess columns
money_df = money_df.drop(columns = ['est_price','dollars','keep'])

# rename columns to be more descriptive
money_df = money_df.rename(columns={"price":"currency"})

money_df.head()


Unnamed: 0,title,rating,acidity_structure,aftertaste,aroma,body,flavor,blind_assessment,coffee_origin,notes,roast_level,city,state,currency,units
0,Bolivia Manantial Gesha,93.0,9.0,8.0,9.0,8.0,9.0,"Richly aromatic, floral-toned. Magnolia, cocoa...","Caranavi, Bolivia","Produced by Angel Mamani Chambi, entirely of t...",Medium-Light,Floyd,Virginia,$30.00,12 ounces
1,Yellow Pacamara Carbonic Maceration Nanolot,92.0,8.0,8.0,9.0,8.0,9.0,"Crisply sweet-savory. White peach, hop flowers...","Matagalpa growing region, Nicaragua",Produced by Benjamin Weiner at Finca Idealista...,Medium-Light,Cleveland,Ohio,$160.00,12 ounces
2,Ethiopia Gera Genji Challa,94.0,9.0,8.0,9.0,9.0,9.0,"Delicately aromatic, complex. Lilac, cocoa nib...","Agaro Gera, Jimma Zone, Oromia State, Ethiopia",Ethiopia coffees like this one are largely pro...,Medium-Light,San Diego,California,$28.00,12 ounces
3,Yirgacheffe Mengesha Natural,94.0,9.0,8.0,9.0,9.0,9.0,"High-toned, fruit-driven. Boysenberry, pear, c...","Yirgacheffe growing region, southern Ethiopia",Produced at Mengesha Farm from selections of i...,Medium-Light,Glendale,California,$20.50,12 ounces
4,Tropical Summer Colombia La Sierra,93.0,9.0,8.0,9.0,8.0,9.0,"Fruit-driven, crisply chocolaty. Goji berry, d...","La Sierra, Cauca Department, Colombia",Produced by smallholding farmers from trees of...,Medium-Light,Harrisonburg,Virginia,$18.99,8 ounces


In [19]:
# reset index following elimination of rows
money_df = money_df.reset_index()

# drop old index column
money_df = money_df.drop(columns = ['index'])
money_df.head()


Unnamed: 0,title,rating,acidity_structure,aftertaste,aroma,body,flavor,blind_assessment,coffee_origin,notes,roast_level,city,state,currency,units
0,Bolivia Manantial Gesha,93.0,9.0,8.0,9.0,8.0,9.0,"Richly aromatic, floral-toned. Magnolia, cocoa...","Caranavi, Bolivia","Produced by Angel Mamani Chambi, entirely of t...",Medium-Light,Floyd,Virginia,$30.00,12 ounces
1,Yellow Pacamara Carbonic Maceration Nanolot,92.0,8.0,8.0,9.0,8.0,9.0,"Crisply sweet-savory. White peach, hop flowers...","Matagalpa growing region, Nicaragua",Produced by Benjamin Weiner at Finca Idealista...,Medium-Light,Cleveland,Ohio,$160.00,12 ounces
2,Ethiopia Gera Genji Challa,94.0,9.0,8.0,9.0,9.0,9.0,"Delicately aromatic, complex. Lilac, cocoa nib...","Agaro Gera, Jimma Zone, Oromia State, Ethiopia",Ethiopia coffees like this one are largely pro...,Medium-Light,San Diego,California,$28.00,12 ounces
3,Yirgacheffe Mengesha Natural,94.0,9.0,8.0,9.0,9.0,9.0,"High-toned, fruit-driven. Boysenberry, pear, c...","Yirgacheffe growing region, southern Ethiopia",Produced at Mengesha Farm from selections of i...,Medium-Light,Glendale,California,$20.50,12 ounces
4,Tropical Summer Colombia La Sierra,93.0,9.0,8.0,9.0,8.0,9.0,"Fruit-driven, crisply chocolaty. Goji berry, d...","La Sierra, Cauca Department, Colombia",Produced by smallholding farmers from trees of...,Medium-Light,Harrisonburg,Virginia,$18.99,8 ounces


In [20]:
# define a regex pattern to match the numeric value of the currency column
pattern = r"[\d,.]+"

# Use the regular expression pattern to extract the numeric value
money_df['numeric_cost'] = money_df['currency'].str.findall(pattern)

# update column name to be more descriptive
money_df = money_df.rename(columns={'cost':'currency'})
money_df.head()


Unnamed: 0,title,rating,acidity_structure,aftertaste,aroma,body,flavor,blind_assessment,coffee_origin,notes,roast_level,city,state,currency,units,numeric_cost
0,Bolivia Manantial Gesha,93.0,9.0,8.0,9.0,8.0,9.0,"Richly aromatic, floral-toned. Magnolia, cocoa...","Caranavi, Bolivia","Produced by Angel Mamani Chambi, entirely of t...",Medium-Light,Floyd,Virginia,$30.00,12 ounces,[30.00]
1,Yellow Pacamara Carbonic Maceration Nanolot,92.0,8.0,8.0,9.0,8.0,9.0,"Crisply sweet-savory. White peach, hop flowers...","Matagalpa growing region, Nicaragua",Produced by Benjamin Weiner at Finca Idealista...,Medium-Light,Cleveland,Ohio,$160.00,12 ounces,[160.00]
2,Ethiopia Gera Genji Challa,94.0,9.0,8.0,9.0,9.0,9.0,"Delicately aromatic, complex. Lilac, cocoa nib...","Agaro Gera, Jimma Zone, Oromia State, Ethiopia",Ethiopia coffees like this one are largely pro...,Medium-Light,San Diego,California,$28.00,12 ounces,[28.00]
3,Yirgacheffe Mengesha Natural,94.0,9.0,8.0,9.0,9.0,9.0,"High-toned, fruit-driven. Boysenberry, pear, c...","Yirgacheffe growing region, southern Ethiopia",Produced at Mengesha Farm from selections of i...,Medium-Light,Glendale,California,$20.50,12 ounces,[20.50]
4,Tropical Summer Colombia La Sierra,93.0,9.0,8.0,9.0,8.0,9.0,"Fruit-driven, crisply chocolaty. Goji berry, d...","La Sierra, Cauca Department, Colombia",Produced by smallholding farmers from trees of...,Medium-Light,Harrisonburg,Virginia,$18.99,8 ounces,[18.99]


In [21]:
# identify currency unit types in order to create consistent currency codes
money_df['currency'].unique()

array(['$30.00', '$160.00', '$28.00', '$20.50', '$18.99', 'NT $459',
       '$16.00', 'NT$500', 'NT$700', '$8.40', 'NT $520', '$22.00',
       '$26.00', '$17.00', '$24.00', '$17.50', '$25.00', 'NT $450',
       '$19.00', '$17.95', 'NT $550', 'NT $350', 'NT $630', 'NT $880',
       'NT $1250', '$14.00', 'NT $500', 'NT $600', '$20.00', '$21.00',
       '$19.95', '$15.00', '$20.95', '$60.00', '$50.00', '$18.00',
       '$8.00', '$18.50', '$21.50', '$16.99', 'NT $650', '$13.00',
       'NT $400', '$12.00', 'NT $429', 'NT $399', '$28.50', '$54.95',
       'NT $275', 'NT $888', '$18.95', '$17.99', '$80.00', '$39.50',
       '$16.70', 'NT $850', '$16.95', '$32.25', 'NT $300', '$25.50',
       'NT $390', '$49.00', '$29.00', 'NT $750', '$27.00', '$19.50',
       '$47.95', '$21.95', 'NT $680', '$23.50', '$18.05', 'CAD $30.00',
       '$12.50', 'NT $325', 'NT $1200', 'NT$399', 'NT$650', '$NT$520',
       '$NT$1000', 'NT$600', 'NT$275', 'NT$587', 'NT $460', '$23.00',
       'NT $800', '$45.99', '$

In [22]:
# define the regular expression pattern
USD_pattern = r'^\$[\d,.]+'
USD2_pattern = r'.*(US|USD).*' 
CAD_pattern = r'.*CAD.*'
NT_pattern = r'.*(NT|TWD).*' 
HKD_pattern = r'.*(HKD|HK).*' 
JPY_pattern = r'.*¥.*'
MXN_pattern = r'.*pesos.*'
LAK_pattern = r'.*LAK.*'
AUD_pattern = r'.*AUD.*'
MYR_pattern = r'.*MYR.*'
KRW_pattern = r'.*KRW.*'
GBP_pattern = r'.*£.*'
IDR_pattern = r'.*IDR.*'
EUR_pattern = r'.*(€|Euros|#).*' 
EUR2_pattern = r'^E.*' 
THB_pattern = r'.*THB.*'
AED_pattern = r'.*AED.*'
GTQ_pattern = r'.*GTQ.*'

# loop over each value in the 'price' column
for i in range(len(money_df)):
    # for i in money_df["currency"].values:
    # print(i)
    # check if the value matches the pattern and replace with consistent currency code
        if re.match(USD_pattern, money_df.loc[i, 'currency']):
            money_df.loc[i, 'currency'] = 'USD'
        elif re.match(USD2_pattern, money_df.loc[i, 'currency']):
            money_df.loc[i, 'currency'] = 'USD'
        elif re.match(CAD_pattern, money_df.loc[i, 'currency']):
            money_df.loc[i, 'currency'] = 'CAD'
        elif re.match(NT_pattern, money_df.loc[i, 'currency']):
            money_df.loc[i, 'currency'] = 'NTD'
        elif re.match(HKD_pattern, money_df.loc[i, 'currency']):
            money_df.loc[i, 'currency'] = 'HKD'
        elif re.match(JPY_pattern, money_df.loc[i, 'currency']):
            money_df.loc[i, 'currency'] = 'JPY'
        elif re.match(MXN_pattern, money_df.loc[i, 'currency']):
            money_df.loc[i, 'currency'] = 'MXN'
        elif re.match(LAK_pattern, money_df.loc[i, 'currency']):
            money_df.loc[i, 'currency'] = 'LAK'
        elif re.match(AUD_pattern, money_df.loc[i, 'currency']):
            money_df.loc[i, 'currency'] = 'AUD'
        elif re.match(MYR_pattern, money_df.loc[i, 'currency']):
            money_df.loc[i, 'currency'] = 'MYR'
        elif re.match(KRW_pattern, money_df.loc[i, 'currency']):
            money_df.loc[i, 'currency'] = 'KRW'
        elif re.match(GBP_pattern, money_df.loc[i, 'currency']):
            money_df.loc[i, 'currency'] = 'GBP'
        elif re.match(IDR_pattern, money_df.loc[i, 'currency']):
            money_df.loc[i, 'currency'] = 'IDR'
        elif re.match(EUR_pattern, money_df.loc[i, 'currency']):
            money_df.loc[i, 'currency'] = 'EUR'
        elif re.match(EUR2_pattern, money_df.loc[i, 'currency']):
            money_df.loc[i, 'currency'] = 'EUR'
        elif re.match(THB_pattern, money_df.loc[i, 'currency']):
            money_df.loc[i, 'currency'] = 'THB'
        elif re.match(AED_pattern, money_df.loc[i, 'currency']):
            money_df.loc[i, 'currency'] = 'AED'
        elif re.match(GTQ_pattern, money_df.loc[i, 'currency']):
            money_df.loc[i, 'currency'] = 'GTQ'
        else:
            money_df.drop(i, inplace=True)

In [23]:
# reset index following elimination of rows
money_df = money_df.reset_index()

# drop old index column
money_df = money_df.drop(columns = ['index'])

money_df.head()

Unnamed: 0,title,rating,acidity_structure,aftertaste,aroma,body,flavor,blind_assessment,coffee_origin,notes,roast_level,city,state,currency,units,numeric_cost
0,Bolivia Manantial Gesha,93.0,9.0,8.0,9.0,8.0,9.0,"Richly aromatic, floral-toned. Magnolia, cocoa...","Caranavi, Bolivia","Produced by Angel Mamani Chambi, entirely of t...",Medium-Light,Floyd,Virginia,USD,12 ounces,[30.00]
1,Yellow Pacamara Carbonic Maceration Nanolot,92.0,8.0,8.0,9.0,8.0,9.0,"Crisply sweet-savory. White peach, hop flowers...","Matagalpa growing region, Nicaragua",Produced by Benjamin Weiner at Finca Idealista...,Medium-Light,Cleveland,Ohio,USD,12 ounces,[160.00]
2,Ethiopia Gera Genji Challa,94.0,9.0,8.0,9.0,9.0,9.0,"Delicately aromatic, complex. Lilac, cocoa nib...","Agaro Gera, Jimma Zone, Oromia State, Ethiopia",Ethiopia coffees like this one are largely pro...,Medium-Light,San Diego,California,USD,12 ounces,[28.00]
3,Yirgacheffe Mengesha Natural,94.0,9.0,8.0,9.0,9.0,9.0,"High-toned, fruit-driven. Boysenberry, pear, c...","Yirgacheffe growing region, southern Ethiopia",Produced at Mengesha Farm from selections of i...,Medium-Light,Glendale,California,USD,12 ounces,[20.50]
4,Tropical Summer Colombia La Sierra,93.0,9.0,8.0,9.0,8.0,9.0,"Fruit-driven, crisply chocolaty. Goji berry, d...","La Sierra, Cauca Department, Colombia",Produced by smallholding farmers from trees of...,Medium-Light,Harrisonburg,Virginia,USD,8 ounces,[18.99]


In [24]:
# check there are no outlying currency codes
money_df['currency'].unique()

array(['USD', 'NTD', 'CAD', 'HKD', 'GBP', 'MXN', 'JPY', 'IDR', 'AED',
       'AUD', 'KRW', 'EUR'], dtype=object)

### Convert the units to ounces

In [25]:
# # strip the column to eliminate starting/ending spaces
money_df['units'] = money_df['units'].str.strip(' ')

# split units by ' ' first 
money_df[['A','B','C','D']] = money_df['units'].str.split(' ', expand=True)
money_df.head()

Unnamed: 0,title,rating,acidity_structure,aftertaste,aroma,body,flavor,blind_assessment,coffee_origin,notes,roast_level,city,state,currency,units,numeric_cost,A,B,C,D
0,Bolivia Manantial Gesha,93.0,9.0,8.0,9.0,8.0,9.0,"Richly aromatic, floral-toned. Magnolia, cocoa...","Caranavi, Bolivia","Produced by Angel Mamani Chambi, entirely of t...",Medium-Light,Floyd,Virginia,USD,12 ounces,[30.00],12,ounces,,
1,Yellow Pacamara Carbonic Maceration Nanolot,92.0,8.0,8.0,9.0,8.0,9.0,"Crisply sweet-savory. White peach, hop flowers...","Matagalpa growing region, Nicaragua",Produced by Benjamin Weiner at Finca Idealista...,Medium-Light,Cleveland,Ohio,USD,12 ounces,[160.00],12,ounces,,
2,Ethiopia Gera Genji Challa,94.0,9.0,8.0,9.0,9.0,9.0,"Delicately aromatic, complex. Lilac, cocoa nib...","Agaro Gera, Jimma Zone, Oromia State, Ethiopia",Ethiopia coffees like this one are largely pro...,Medium-Light,San Diego,California,USD,12 ounces,[28.00],12,ounces,,
3,Yirgacheffe Mengesha Natural,94.0,9.0,8.0,9.0,9.0,9.0,"High-toned, fruit-driven. Boysenberry, pear, c...","Yirgacheffe growing region, southern Ethiopia",Produced at Mengesha Farm from selections of i...,Medium-Light,Glendale,California,USD,12 ounces,[20.50],12,ounces,,
4,Tropical Summer Colombia La Sierra,93.0,9.0,8.0,9.0,8.0,9.0,"Fruit-driven, crisply chocolaty. Goji berry, d...","La Sierra, Cauca Department, Colombia",Produced by smallholding farmers from trees of...,Medium-Light,Harrisonburg,Virginia,USD,8 ounces,[18.99],8,ounces,,


In [26]:
# identify unique units of measurement in order to standardize
money_df['B'].unique()

array(['ounces', 'grams', None, 'packet', 'tin', '18-gram', 'pounds',
       'sticks', 'ounces*', '3.3', '5-gram', 'capsules'], dtype=object)

In [27]:
# create lists for similar units of measurement
ounce_list = ['ounces','ounces*','ounces.', 'onces', 'ouncues', 'ounce', 'oz.']
gram_list = ['grams','gram','g','g.']
kg_list = ['kilo','kilogram','kg.','Kilogram']
ml_list = ['ml.','ml']
lb_list = ['pounds','pound','lbs','lb']

# loop over each value to standardize units of measurement
for i in range(len(money_df)):
    if money_df.loc[i, 'B'] in ounce_list:
        money_df.loc[i, 'B'] = 'ounce'
    elif money_df.loc[i, 'B'] in gram_list:
        money_df.loc[i, 'B'] = 'gram'
    elif money_df.loc[i, 'B'] in kg_list:
        money_df.loc[i, 'B'] = 'kilogram'
    elif money_df.loc[i, 'B'] in ml_list:
        money_df.loc[i, 'B'] = 'milliliter'
    elif money_df.loc[i, 'B'] in lb_list:
        money_df.loc[i,'B'] = 'pound'
    else:
        money_df.drop(i, inplace=True)

In [28]:
# reset index following elimination of rows
money_df = money_df.reset_index()

# drop old index column
money_df = money_df.drop(columns = ['index'])

# drop unneeded columns
money_df = money_df.drop(columns=['units','C','D'])

# rename columns for clarity
money_df = money_df.rename(columns={'A':'unit_amt', 'B':'unit_measurement'})
money_df.head()

Unnamed: 0,title,rating,acidity_structure,aftertaste,aroma,body,flavor,blind_assessment,coffee_origin,notes,roast_level,city,state,currency,numeric_cost,unit_amt,unit_measurement
0,Bolivia Manantial Gesha,93.0,9.0,8.0,9.0,8.0,9.0,"Richly aromatic, floral-toned. Magnolia, cocoa...","Caranavi, Bolivia","Produced by Angel Mamani Chambi, entirely of t...",Medium-Light,Floyd,Virginia,USD,[30.00],12,ounce
1,Yellow Pacamara Carbonic Maceration Nanolot,92.0,8.0,8.0,9.0,8.0,9.0,"Crisply sweet-savory. White peach, hop flowers...","Matagalpa growing region, Nicaragua",Produced by Benjamin Weiner at Finca Idealista...,Medium-Light,Cleveland,Ohio,USD,[160.00],12,ounce
2,Ethiopia Gera Genji Challa,94.0,9.0,8.0,9.0,9.0,9.0,"Delicately aromatic, complex. Lilac, cocoa nib...","Agaro Gera, Jimma Zone, Oromia State, Ethiopia",Ethiopia coffees like this one are largely pro...,Medium-Light,San Diego,California,USD,[28.00],12,ounce
3,Yirgacheffe Mengesha Natural,94.0,9.0,8.0,9.0,9.0,9.0,"High-toned, fruit-driven. Boysenberry, pear, c...","Yirgacheffe growing region, southern Ethiopia",Produced at Mengesha Farm from selections of i...,Medium-Light,Glendale,California,USD,[20.50],12,ounce
4,Tropical Summer Colombia La Sierra,93.0,9.0,8.0,9.0,8.0,9.0,"Fruit-driven, crisply chocolaty. Goji berry, d...","La Sierra, Cauca Department, Colombia",Produced by smallholding farmers from trees of...,Medium-Light,Harrisonburg,Virginia,USD,[18.99],8,ounce


In [29]:
# convert unit_amt column to a float in order to complete conversion
money_df['unit_amt'] = money_df['unit_amt'].astype(float)

In [30]:
# conversion rates to gram
gram = 0.035
kg = 35.274
mL = 0.034
lb = 453.592

# convert all unit_amt to ounces
for i in range(len(money_df)):
    if money_df.loc[i,'unit_measurement'] == 'gram':
        money_df.loc[i, 'unit_amt'] = money_df.loc[i, 'unit_amt']* gram
        money_df.loc[i,'unit_measurement'] = 'ounce'
    if money_df.loc[i,'unit_measurement'] == 'kilogram':
        money_df.loc[i, 'unit_amt'] = money_df.loc[i, 'unit_amt']* kg
        money_df.loc[i,'unit_measurement'] = 'ounce'
    if money_df.loc[i,'unit_measurement'] == 'milliliter':
        money_df.loc[i, 'unit_amt'] = money_df.loc[i, 'unit_amt']* mL
        money_df.loc[i,'unit_measurement'] = 'ounce'
    if money_df.loc[i,'unit_measurement'] == 'pound':
        money_df.loc[i, 'unit_amt'] = money_df.loc[i, 'unit_amt']* lb
        money_df.loc[i,'unit_measurement'] = 'ounce'

In [31]:
# update/drop columns to reflect changes
money_df = money_df.rename(columns={'unit_amt': "ounces"})
money_df = money_df.drop(columns = ['unit_measurement'])



### Convert the cost to USD

In [32]:
# check the type of the numeric cost column to prep for conversions
type(money_df['numeric_cost'][0])

list

In [33]:
# change from a list to a str
money_df['numeric_cost'] = money_df['numeric_cost'].apply(''.join)

# check the conversion
type(money_df['numeric_cost'][0])

str

In [34]:
# remove all the commas from the price
money_df['numeric_cost'] = money_df['numeric_cost'].str.replace(',', '')


In [35]:
# change from str to float
money_df['numeric_cost'] = money_df['numeric_cost'].astype(float)

# check the conversion
type(money_df['numeric_cost'][0])

numpy.float64

In [36]:
# run this command on the first trial to install the exchange rate package
#%pip install easy-exchange-rates

In [37]:
# used easy-exchange-rates package for a more dynamic exchange rate and not rely on a specific date
today = date.today().strftime('%Y-%m-%d')

#Define api variable
api = API()

# Define a function to convert prices to USD
def GetScalingData(c1):
    time_series = api.get_exchange_rates(
    base_currency='USD', 
    start_date=today, 
    end_date=today, 
    targets=["USD",c1]
     )

    data_frame = api.to_dataframe(time_series)
    rate=data_frame[c1][0]
    return rate



In [38]:
# convert all price to USD
for i in range(len(money_df)):
    if money_df.loc[i,'currency'] == 'USD':
        money_df.loc[i, 'currency'] = 1
    if money_df.loc[i,'currency'] == 'CAD':
        money_df.loc[i, 'currency'] = GetScalingData('CAD')
    if money_df.loc[i,'currency'] == 'NTD':
        money_df.loc[i, 'currency'] = GetScalingData('TWD')
    if money_df.loc[i,'currency'] == 'HKD':
        money_df.loc[i, 'currency'] = GetScalingData('HKD')
    if money_df.loc[i,'currency'] == 'JPY':
        money_df.loc[i, 'currency'] = GetScalingData('JPY')
    if money_df.loc[i,'currency'] == 'MXN':
        money_df.loc[i, 'currency'] = GetScalingData('MXN')
    if money_df.loc[i,'currency'] == 'LAK':
        money_df.loc[i, 'currency'] = GetScalingData('LAK')
    if money_df.loc[i,'currency'] == 'AUD':
        money_df.loc[i, 'currency'] = GetScalingData('AUD')
    if money_df.loc[i,'currency'] == 'MYR':
        money_df.loc[i, 'currency'] = GetScalingData('MYR')
    if money_df.loc[i,'currency'] == 'KRW':
        money_df.loc[i, 'currency'] = GetScalingData('KRW')
    if money_df.loc[i,'currency'] == 'GBP':
        money_df.loc[i, 'currency'] = GetScalingData('GBP')
    if money_df.loc[i,'currency'] == 'IDR':
        money_df.loc[i, 'currency'] = GetScalingData('IDR')
    if money_df.loc[i,'currency'] == 'EUR':
        money_df.loc[i, 'currency'] = GetScalingData('EUR')
    if money_df.loc[i,'currency'] == 'THB':
        money_df.loc[i, 'currency'] = GetScalingData('THB')
    if money_df.loc[i,'currency'] == 'AED':
        money_df.loc[i, 'currency'] = GetScalingData('AED')
    if money_df.loc[i,'currency'] == 'GTQ':
        money_df.loc[i, 'currency'] = GetScalingData('GTQ')


# change the column to a float for conversion
money_df['currency'] = money_df['currency'].astype(float)


In [39]:
# check the conversion
money_df.head()

Unnamed: 0,title,rating,acidity_structure,aftertaste,aroma,body,flavor,blind_assessment,coffee_origin,notes,roast_level,city,state,currency,numeric_cost,ounces
0,Bolivia Manantial Gesha,93.0,9.0,8.0,9.0,8.0,9.0,"Richly aromatic, floral-toned. Magnolia, cocoa...","Caranavi, Bolivia","Produced by Angel Mamani Chambi, entirely of t...",Medium-Light,Floyd,Virginia,1.0,30.0,12.0
1,Yellow Pacamara Carbonic Maceration Nanolot,92.0,8.0,8.0,9.0,8.0,9.0,"Crisply sweet-savory. White peach, hop flowers...","Matagalpa growing region, Nicaragua",Produced by Benjamin Weiner at Finca Idealista...,Medium-Light,Cleveland,Ohio,1.0,160.0,12.0
2,Ethiopia Gera Genji Challa,94.0,9.0,8.0,9.0,9.0,9.0,"Delicately aromatic, complex. Lilac, cocoa nib...","Agaro Gera, Jimma Zone, Oromia State, Ethiopia",Ethiopia coffees like this one are largely pro...,Medium-Light,San Diego,California,1.0,28.0,12.0
3,Yirgacheffe Mengesha Natural,94.0,9.0,8.0,9.0,9.0,9.0,"High-toned, fruit-driven. Boysenberry, pear, c...","Yirgacheffe growing region, southern Ethiopia",Produced at Mengesha Farm from selections of i...,Medium-Light,Glendale,California,1.0,20.5,12.0
4,Tropical Summer Colombia La Sierra,93.0,9.0,8.0,9.0,8.0,9.0,"Fruit-driven, crisply chocolaty. Goji berry, d...","La Sierra, Cauca Department, Colombia",Produced by smallholding farmers from trees of...,Medium-Light,Harrisonburg,Virginia,1.0,18.99,8.0


### Calculate the price as USD/oz

In [40]:
# create a new column for the conversion
money_df['cost_usd'] = None

# multiply the numeric_cost and currency(conversion rate) columns to get the price in USD
for i in range(len(money_df)):
    money_df.loc[i,'cost_usd'] = money_df.loc[i,'numeric_cost'] / money_df.loc[i,'currency']



In [41]:
# create a new column for the standardized cost per ounce
money_df['usd_per_oz'] = None

# divide the cost_usd by ounces to get the price in USD per ounce
for i in range(len(money_df)):
    money_df.loc[i,'usd_per_oz'] = np.round(money_df.loc[i,'cost_usd'] / money_df.loc[i,'ounces'], 2)

In [42]:
# drop all other columns
money_df = money_df.drop(columns = ['numeric_cost', 'currency', 'ounces', 'cost_usd'])

In [43]:
#pd.set_option('display.max_rows', None)

money_df.head(10)

Unnamed: 0,title,rating,acidity_structure,aftertaste,aroma,body,flavor,blind_assessment,coffee_origin,notes,roast_level,city,state,usd_per_oz
0,Bolivia Manantial Gesha,93.0,9.0,8.0,9.0,8.0,9.0,"Richly aromatic, floral-toned. Magnolia, cocoa...","Caranavi, Bolivia","Produced by Angel Mamani Chambi, entirely of t...",Medium-Light,Floyd,Virginia,2.5
1,Yellow Pacamara Carbonic Maceration Nanolot,92.0,8.0,8.0,9.0,8.0,9.0,"Crisply sweet-savory. White peach, hop flowers...","Matagalpa growing region, Nicaragua",Produced by Benjamin Weiner at Finca Idealista...,Medium-Light,Cleveland,Ohio,13.33
2,Ethiopia Gera Genji Challa,94.0,9.0,8.0,9.0,9.0,9.0,"Delicately aromatic, complex. Lilac, cocoa nib...","Agaro Gera, Jimma Zone, Oromia State, Ethiopia",Ethiopia coffees like this one are largely pro...,Medium-Light,San Diego,California,2.33
3,Yirgacheffe Mengesha Natural,94.0,9.0,8.0,9.0,9.0,9.0,"High-toned, fruit-driven. Boysenberry, pear, c...","Yirgacheffe growing region, southern Ethiopia",Produced at Mengesha Farm from selections of i...,Medium-Light,Glendale,California,1.71
4,Tropical Summer Colombia La Sierra,93.0,9.0,8.0,9.0,8.0,9.0,"Fruit-driven, crisply chocolaty. Goji berry, d...","La Sierra, Cauca Department, Colombia",Produced by smallholding farmers from trees of...,Medium-Light,Harrisonburg,Virginia,2.37
5,The Glistening Orchard Blend,91.0,7.0,8.0,9.0,8.0,9.0,"Fruity and crisply chocolaty. Green banana, ca...",Colombia; Ethiopia,A blend of three coffees — a washed coffee fer...,Medium-Light,Taipei,Taiwan,1.88
6,Tinamit Tolimán,93.0,8.0,8.0,9.0,9.0,9.0,"Deeply sweet-tart, chocolate-toned. Dark choco...","San Lucas Tolimán, Lake Atitlán growing region...",Produced by smallholding farmers of Asociación...,Medium-Light,Antigua,Guatemala,1.33
7,Colombia La Esperanza 100% Geisha Hanashaku,94.0,9.0,8.0,9.0,9.0,9.0,"Delicate, elegant, sweetly bright. Bergamot, c...","Valle de Cauca, Colombia","Produced at Finca Cerro Azul, entirely of the ...",Light,Taoyuan,Taiwan,4.1
8,Colombia El Paraiso Floral Lychee,93.0,8.0,8.0,9.0,9.0,9.0,"Floral-toned, richly sweet-savory. Black cherr...","Cauca Department, Colombia",Produced by Diego Bermudez entirely of the Cas...,Light,Taipei,Taiwan,2.87
9,Ethiopia Bekele Heto Natural,93.0,8.0,8.0,9.0,9.0,9.0,"Berry-driven, invitingly sweet-tart. Dried mul...","Worka-Sakaro, Gedeb District, Gedeo Zone, Ethi...",Produced by Bekele Heto from largely indigenou...,Light,Oakland,California,2.1


In [44]:
money_df['usd_per_oz'] = money_df['usd_per_oz'].astype(float)

In [45]:
money_df.describe()

Unnamed: 0,rating,acidity_structure,aftertaste,aroma,body,flavor,usd_per_oz
count,2103.0,2103.0,2103.0,2103.0,2103.0,2103.0,2103.0
mean,93.055635,8.504993,8.099857,8.843081,8.634332,8.975273,3.084727
std,1.817623,0.590802,0.509206,0.453455,0.507696,0.388166,7.859612
min,63.0,1.0,3.0,2.0,5.0,2.0,0.01
25%,92.0,8.0,8.0,9.0,8.0,9.0,1.42
50%,93.0,9.0,8.0,9.0,9.0,9.0,1.67
75%,94.0,9.0,8.0,9.0,9.0,9.0,2.61
max,98.0,10.0,9.0,10.0,10.0,10.0,234.06


In [46]:
# convert to a CSV for analysis
money_df.to_csv("final_coffee_data.csv")