In [2]:
import pandas as pd

# Step 1: Load raw chocolate dataset
df = pd.read_csv('./data/2024_flavors_of_cacoa.tsv', sep='\t')

# Step 2: Convert Cocoa Percent to float
df['Cocoa Percent'] = df['Cocoa Percent'].str.replace('%', '').astype(float) / 100

# Step 3: Split Ingredients
df[['ingredient_count', 'ingredient_codes']] = df['Ingredients'].str.split('-', expand=True)
df['ingredient_count'] = df['ingredient_count'].str.strip().replace('', None).astype(float).astype('Int64')
df['ingredient_codes'] = df['ingredient_codes'].str.strip()
ingredient_dummies = df['ingredient_codes'].str.get_dummies(sep=',')
df = pd.concat([df, ingredient_dummies], axis=1)

# Step 4: Clean column names
df.columns = df.columns.str.strip()

# Step 5: One-hot encode Most Memorable Characteristics
char_dummies = df['Most Memorable Characteristics'].str.get_dummies(sep=',')
char_dummies.columns = char_dummies.columns.str.strip()

# Step 6: Keep only characteristics that occur >= 20 times
char_counts = char_dummies.sum().sort_values(ascending=False)
frequent_chars = char_counts[char_counts >= 20].index
char_dummies_filtered = char_dummies[frequent_chars]
df = pd.concat([df, char_dummies_filtered], axis=1)

# Step 7: Drop unnecessary columns
df.drop(columns=['Ingredients', 'Most Memorable Characteristics', 'ingredient_codes'], inplace=True)

# Step 8: Remove any duplicate columns
df = df.loc[:, ~df.columns.duplicated()]

# Step 9: Final check
print("✅ Final column count:", df.shape[1])  # Should be 86
print("✅ Total rows:", df.shape[0])          # Should be 2789
print("✅ Characteristics kept:", len(frequent_chars))  # Should be 71

# Step 10: Export CSV
df.to_csv('./data/cleaned_chocolate_data.csv', index=False)
print("📦 Exported cleaned_chocolate_data.csv")


✅ Final column count: 72
✅ Total rows: 2789
✅ Characteristics kept: 77
📦 Exported cleaned_chocolate_data.csv
