In [65]:
import pandas as pd
import os


In [66]:
# Load the raw chocolate-bar ratings; the input is read as a tab-separated file.
cacao_data = pd.read_csv(
    "./data/2024_flavors_of_cacoa.tsv",
    sep="\t",
)

In [67]:
# Turn percentage strings such as "76%" into fractions such as 0.76.
cacao_data['cocoa_percentage'] = (
    cacao_data['Cocoa Percent']
    .str.rstrip('%')
    .astype(float)
    .div(100)
)

In [68]:
# Display the frame to check the newly added `cocoa_percentage` column.
cacao_data

Unnamed: 0,REF,Company (Manufacturer),Company Location,Review Date,Country of Bean Origin,Specific Bean Origin or Bar Name,Cocoa Percent,Ingredients,Most Memorable Characteristics,Rating,cocoa_percentage
0,2454,5150,U.S.A.,2019,Madagascar,"Bejofo Estate, batch 1",76%,"3- B,S,C","cocoa, blackberry, full body",3.75,0.76
1,2458,5150,U.S.A.,2019,Dominican Republic,"Zorzal, batch 1",76%,"3- B,S,C","cocoa, vegetal, savory",3.50,0.76
2,2454,5150,U.S.A.,2019,Tanzania,"Kokoa Kamili, batch 1",76%,"3- B,S,C","rich cocoa, fatty, bready",3.25,0.76
3,2542,5150,U.S.A.,2021,India,"Anamalai, batch 1",68%,"3- B,S,C","milk brownie, macadamia,chewy",3.50,0.68
4,2546,5150,U.S.A.,2021,Uganda,"Semuliki Forest, batch 1",80%,"3- B,S,C","mildly bitter, basic cocoa, fatty",3.25,0.80
...,...,...,...,...,...,...,...,...,...,...,...
2784,1205,Zotter,Austria,2014,Blend,Raw,80%,"4- B,S*,C,Sa","waxy, cloying, vegetal",2.75,0.80
2785,1996,Zotter,Austria,2017,Colombia,"APROCAFA, Acandi",75%,"3- B,S,C","strong nutty, marshmallow",3.75,0.75
2786,2170,Zotter,Austria,2018,Belize,Maya Mtn,72%,"3- B,S,C","muted, roasty, accessible",3.50,0.72
2787,2170,Zotter,Austria,2018,Congo,Mountains of the Moon,70%,"3- B,S,C","fatty, mild nuts, mild fruit",3.25,0.70


In [69]:
# Build one 0/1 indicator column per flavour descriptor.
#
# The descriptors are comma-separated, but the spacing around the commas is
# not consistent (e.g. "milk brownie, macadamia,chewy"). Splitting on the
# literal ', ' would therefore fuse "macadamia,chewy" into a single bogus
# descriptor and undercount the real ones. Normalise every separator to a
# bare comma (and trim the ends) before splitting.
characteristics_dummies = (
    cacao_data['Most Memorable Characteristics']
    .str.strip()
    .str.replace(r'\s*,\s*', ',', regex=True)
    .str.get_dummies(sep=',')
)

# Number of rows mentioning each descriptor, most frequent first.
characteristic_counts = characteristics_dummies.sum().sort_values(ascending=False)

In [70]:
# Keep only the descriptors whose count is at least 20.
top_characteristics = characteristic_counts.loc[characteristic_counts.ge(20)].index

In [71]:
# Restrict the indicator matrix to the frequently used descriptors.
frequent_characteristics_data = characteristics_dummies.loc[:, top_characteristics]

In [72]:
# Append the descriptor indicator columns, then drop the two free-text columns.
cacao_data = (
    pd.concat([cacao_data, frequent_characteristics_data], axis=1)
    .drop(columns=['Ingredients', 'Most Memorable Characteristics'])
)

In [73]:
# Display the frame after the indicator columns were appended and the
# 'Ingredients' / 'Most Memorable Characteristics' columns were dropped.
cacao_data

Unnamed: 0,REF,Company (Manufacturer),Company Location,Review Date,Country of Bean Origin,Specific Bean Origin or Bar Name,Cocoa Percent,Rating,cocoa_percentage,sweet,...,black pepper,marshmallow,green,chemical,bland,tangy,nuts,raisins,licorice,orange
0,2454,5150,U.S.A.,2019,Madagascar,"Bejofo Estate, batch 1",76%,3.75,0.76,0,...,0,0,0,0,0,0,0,0,0,0
1,2458,5150,U.S.A.,2019,Dominican Republic,"Zorzal, batch 1",76%,3.50,0.76,0,...,0,0,0,0,0,0,0,0,0,0
2,2454,5150,U.S.A.,2019,Tanzania,"Kokoa Kamili, batch 1",76%,3.25,0.76,0,...,0,0,0,0,0,0,0,0,0,0
3,2542,5150,U.S.A.,2021,India,"Anamalai, batch 1",68%,3.50,0.68,0,...,0,0,0,0,0,0,0,0,0,0
4,2546,5150,U.S.A.,2021,Uganda,"Semuliki Forest, batch 1",80%,3.25,0.80,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2784,1205,Zotter,Austria,2014,Blend,Raw,80%,2.75,0.80,0,...,0,0,0,0,0,0,0,0,0,0
2785,1996,Zotter,Austria,2017,Colombia,"APROCAFA, Acandi",75%,3.75,0.75,0,...,0,1,0,0,0,0,0,0,0,0
2786,2170,Zotter,Austria,2018,Belize,Maya Mtn,72%,3.50,0.72,0,...,0,0,0,0,0,0,0,0,0,0
2787,2170,Zotter,Austria,2018,Congo,Mountains of the Moon,70%,3.25,0.70,0,...,0,0,0,0,0,0,0,0,0,0


In [74]:
# Make sure the output directory exists before any export cell runs.
# `exist_ok=True` makes this a no-op when the directory is already there and
# avoids the check-then-create race of testing `os.path.exists` first.
os.makedirs('data', exist_ok=True)

In [75]:
# Write the full cleaned frame as CSV, leaving out the row index.
cacao_data.to_csv(
    'data/cleaned_data_2025_flavors_of_cacao.csv',
    index=False,
)

In [76]:
# Narrow export: keep only the four columns listed below and write the
# result as both CSV and record-oriented JSON.
reduced_cacao_data = cacao_data.loc[
    :, ['Review Date', 'Country of Bean Origin', 'cocoa_percentage', 'Rating']
]
reduced_cacao_data.to_csv(
    'data/data_reduced_2025_flavors_of_cacao.csv',
    index=False,
)
reduced_cacao_data.to_json(
    'data/data_reduced_2025_flavors_of_cacao.json',
    orient='records',
)

In [77]:
# Select rows that satisfy every condition below (all bounds inclusive):
#   - rating of at least 3.25
#   - cocoa fraction between 0.65 and 0.75
#   - review year between 2018 and 2021
#   - at least one of the 'fatty', 'earthy' or 'roasty' indicators set
filtered_cacao_data = cacao_data.loc[
    cacao_data['Rating'].ge(3.25)
    & cacao_data['cocoa_percentage'].between(0.65, 0.75)
    & cacao_data['Review Date'].between(2018, 2021)
    & cacao_data[['fatty', 'earthy', 'roasty']].any(axis=1)
]

In [78]:
# Write the filtered subset as CSV (without the row index) and as
# record-oriented JSON.
filtered_cacao_data.to_csv(
    'data/filtered_data_2025_flavors_of_cacao.csv',
    index=False,
)
filtered_cacao_data.to_json(
    'data/filtered_data_2025_flavors_of_cacao.json',
    orient='records',
)