# In [None]:
import pandas as pd

def _tag_dummies(tags):
    """One-hot encode a Series of comma-separated tags.

    Whitespace around every tag is removed *before* encoding.  Encoding
    first and stripping the resulting column names afterwards produces
    duplicate columns (e.g. ' fatty' and 'fatty'), each of which only
    covers part of the rows.
    """
    normalized = tags.str.strip().str.replace(r'\s*,\s*', ',', regex=True)
    return normalized.str.get_dummies(sep=',')


def clean_chocolate_data(raw, min_characteristic_count=20):
    """Return a cleaned, one-hot encoded copy of the raw ratings table.

    Args:
        raw: DataFrame with at least the columns 'Cocoa Percent' (strings
            such as '70%'), 'Ingredients' (strings such as '3- B,S,C') and
            'Most Memorable Characteristics' (comma-separated tags).
        min_characteristic_count: a characteristic only gets its own
            column when it occurs in at least this many rows.

    Returns:
        A new DataFrame (``raw`` is not modified) in which
        'Cocoa Percent' is a fraction, 'ingredient_count' is a nullable
        integer, each ingredient code and each frequent characteristic
        has a 0/1 column, and the two source text columns are removed.
    """
    df = raw.copy()

    # Clean column names first so every later lookup uses the stripped name.
    df.columns = df.columns.str.strip()

    # Convert Cocoa Percent: '70%' -> 0.70
    df['Cocoa Percent'] = (
        df['Cocoa Percent'].str.replace('%', '', regex=False).astype(float) / 100
    )

    # Split Ingredients at the first '-' only.  partition() always yields
    # exactly three columns (left, separator, right), so extra hyphens or a
    # column without any hyphen cannot break the assignment.
    parts = df['Ingredients'].str.partition('-')
    counts = parts[0].str.strip()
    # Blank counts become missing values; mask() avoids Series.replace('', None),
    # whose meaning differs between pandas versions.
    df['ingredient_count'] = counts.mask(counts == '').astype(float).astype('Int64')
    ingredient_dummies = _tag_dummies(parts[2])

    # One-hot encode Characteristics, keeping only the frequent ones
    # (most frequent first).
    char_dummies = _tag_dummies(df['Most Memorable Characteristics'])
    char_counts = char_dummies.sum().sort_values(ascending=False)
    frequent_chars = char_counts[char_counts >= min_characteristic_count].index

    df = pd.concat(
        [df, ingredient_dummies, char_dummies.loc[:, frequent_chars]], axis=1
    )

    # Drop the source text columns that are now encoded.
    df = df.drop(columns=['Ingredients', 'Most Memorable Characteristics'])
    # If a tag name collides with an existing column, keep the first occurrence
    # so that column lookups return a Series rather than a DataFrame.
    return df.loc[:, ~df.columns.duplicated()]


def filter_final_selection(df):
    """Return the rows of a cleaned table that match the final criteria.

    A row is kept when Rating >= 3.25, Cocoa Percent is within
    [0.65, 0.75], Review Date is within [2018, 2021] and at least one of
    the 'fatty', 'earthy' or 'roasty' columns equals 1.

    Raises:
        KeyError: if any of the referenced columns is missing, e.g. a
            flavour that was not frequent enough to get its own column.
    """
    has_flavor = df[['fatty', 'earthy', 'roasty']].eq(1).any(axis=1)
    keep = (
        (df['Rating'] >= 3.25)
        & df['Cocoa Percent'].between(0.65, 0.75)
        & df['Review Date'].between(2018, 2021)
        & has_flavor
    )
    return df[keep]


if __name__ == "__main__":
    # Step 1: Load raw data
    # NOTE(review): the input name says "2024 ... cacoa" while the outputs say
    # "2025 ... cacao" - confirm the file names on disk are as intended.
    df = pd.read_csv('./data/2024_flavors_of_cacoa.tsv', sep='\t')

    # Steps 2-6: Convert, split, encode and drop columns
    df = clean_chocolate_data(df)

    # Step 7: Export cleaned full data
    df.to_csv('./data/cleaned_chocolate_data.csv', index=False)
    print("✅ Exported cleaned_chocolate_data.csv")

    # Step 8: Apply filters for final step
    filtered_df = filter_final_selection(df)

    # Step 9: Export filtered data
    filtered_df.to_csv('./data/data_filtered_2025_flavors_of_cacao.csv', index=False)
    filtered_df.to_json('./data/data_filtered_2025_flavors_of_cacao.json', orient='records')
    print("✅ Exported data_filtered_2025_flavors_of_cacao.csv and .json")

    # Final check
    print(f"✅ Total rows after filtering: {filtered_df.shape[0]}")
