In [None]:
import csv
import pandas as pd
import numpy as np
import ast

In [None]:
# Read raw_data.csv into df1 and display it.
df1 = pd.read_csv('raw_data.csv')
df1

In [None]:
# Read extra_raw_data.csv into df2 and display it.
df2 = pd.read_csv('extra_raw_data.csv')
df2

In [None]:
# Stack both frames row-wise. ignore_index=True renumbers the rows 0..n-1, so the
# index labels used by the later drop() calls are unique across both inputs.
df = pd.concat([df1,df2], ignore_index=True)

In [None]:
df

In [None]:
# Collect the rows in which any cell, rendered as text, contains "error"
# (case-insensitive). The scan runs once per column with the vectorised
# .str.contains instead of building a temporary Series for every row; the
# resulting boolean mask is the same, just much cheaper to compute.
error_mask = df.astype(str).apply(
    lambda column: column.str.contains('error', case=False, na=False)
).any(axis=1)
# Rows that are entirely NaN are never reported.
error_rows = df[error_mask & df.notna().any(axis=1)]
error_rows

In [None]:
len(error_rows)

In [None]:
# Remove the flagged rows by index label; surviving rows keep their labels.
df = df.drop(error_rows.index)

In [None]:
df

In [None]:
# Rows whose accords are empty. The accords cells are still text at this point
# (they are only parsed with ast.literal_eval further down), so a dict-only check
# can never match; accept the literal text '{}' as well as an already-parsed
# empty dict.
empty_accords = df[df['accords'].apply(
    lambda x: x == '{}' or (isinstance(x, dict) and not x)
)]

In [None]:
empty_accords

In [None]:
# Spot-check: parse a single accords cell from its text form.
# NOTE(review): index label 2 is hard-coded; this raises KeyError if that row
# was dropped by the error-row filter above.
ast.literal_eval(df.loc[2, 'accords'])

In [None]:
# Select rows whose accords cell is exactly the text '{}'.
empty_accords = df[df['accords'] == '{}']

In [None]:
empty_accords

In [None]:
# Drop the rows selected in empty_accords, by index label.
df = df.drop(empty_accords.index)

In [None]:
# Clean both rating columns the same way: render as text, remove commas, then
# convert to numbers (values that cannot be parsed become NaN).
for rating_col in ('ratingCount', 'ratingValue'):
    without_commas = df[rating_col].astype(str).str.replace(',', '', regex=True)
    df[rating_col] = pd.to_numeric(without_commas, errors='coerce')

In [None]:
# Summary statistics for the two numeric rating columns.
stats = df[['ratingCount', 'ratingValue']].describe()

In [None]:
stats

In [None]:
# 10th percentile of ratingCount; used below as the cutoff for dropping rows.
rating_count_cutoff = df['ratingCount'].quantile(0.10)

In [None]:
# 15th percentile of ratingValue; used below as the cutoff for dropping rows.
rating_value_cutoff = df['ratingValue'].quantile(0.15)

In [None]:
rating_count_cutoff, rating_value_cutoff

In [None]:
# Rows at or below EITHER cutoff (note the `|`). Rows whose rating is NaN
# compare False on both sides and are therefore not selected.
df_cutoff = df[(df['ratingCount'] <= rating_count_cutoff) | (df['ratingValue'] <= rating_value_cutoff)]

In [None]:
df_cutoff

In [None]:
from collections import Counter

In [None]:
# Number of to-be-dropped rows per brand value.
name_counts = Counter(df_cutoff['brand'])

In [None]:
name_counts

In [None]:
df_cutoff[df_cutoff['brand'].str.lower() == 'Bond No 9'.lower()]

In [None]:
# Remove every row selected in df_cutoff, by index label.
df = df.drop(df_cutoff.index)

In [None]:
df

In [None]:
def _parse_accords_cell(cell):
    """Return the literal encoded by a string starting with '{'; pass anything else through."""
    if isinstance(cell, str) and cell.startswith('{'):
        return ast.literal_eval(cell)
    return cell


df['accords'] = df['accords'].apply(_parse_accords_cell)

In [None]:
# Tally, for each accord key, the number of rows whose accords dict contains it.
accord_counter = Counter(
    accord_name
    for row_accords in df['accords']
    for accord_name in row_accords.keys()
)

In [None]:
# Plain-dict copy of the tally, displayed for inspection.
accord_counts = dict(accord_counter)
accord_counts

In [None]:
# Group name -> set of accord names that belong to it. The key order here also
# fixes the order of the per-group columns built from this mapping later on.
# Accord names not listed in any set contribute to no group in compute_tfidf.
accord_groups = {
    "Woody & Earthy": {"woody", "mossy", "patchouli", "earthy", "conifer"},
    "Smoky & Leathery": {"smoky", "leather", "animalic", "tobacco"},
    "Resinous & Balsamic": {"amber", "balsamic", "vanilla"},
    "Citrus & Fresh": {"citrus", "fresh", "marine", "ozonic", "aquatic"},
    "Green & Herbal": {"green", "herbal", "aromatic", "lavender"},
    "Warm & Spicy": {"warm spicy", "cinnamon", "soft spicy"},
    "Sweet & Gourmand": {"sweet", "honey", "caramel", "chocolate", "cacao", "coffee", "nutty", "almond"},
    "Floral": {"floral", "white floral", "yellow floral", "rose", "violet", "tuberose", "iris"},
    "Powdery & Soft": {"powdery", "musky", "soapy", "lactonic"},
    "Synthetic": {"metallic", "aldehydic", "mineral", "vinyl", "alcohol"},
    # NOTE(review): "Champagne" is the only capitalised name in this mapping and the
    # lookups are case-sensitive — confirm it matches the key as it appears in the data.
    "Uncommon": {"cannabis", "coca-cola", "Champagne", "whiskey", "vodka", "savory", "sand", "beeswax", "bitter", "sour", "terpenic"},
}

In [None]:
# Row count of df at this point; used as the document count in compute_tfidf.
total_fragrances = len(df)

In [None]:
def compute_tfidf(accords_dict, doc_freq=None, n_docs=None, groups=None):
    """Collapse one fragrance's accords into a TF-IDF style strength per accord group.

    Each accord of a group that is present in ``accords_dict`` contributes
    ``tf * idf``, where ``tf`` is the accord's value in ``accords_dict`` and
    ``idf = log((n_docs + 1) / (doc_freq[accord] + 1))``. Contributions are
    summed per group; accords that belong to no group are ignored.

    Parameters
    ----------
    accords_dict : dict
        Accord name -> strength for a single fragrance.
    doc_freq : mapping, optional
        Accord name -> number of fragrances whose accords contain it.
        Defaults to the notebook-level ``accord_counter``, which was built
        from ``df['accords']`` with exactly that meaning.
    n_docs : int, optional
        Total number of fragrances. Defaults to ``total_fragrances``.
    groups : mapping, optional
        Group name -> set of accord names. Defaults to ``accord_groups``.

    Returns
    -------
    dict
        Group name -> summed strength (0 when no accord of the group is present).
    """
    if groups is None:
        groups = accord_groups
    if doc_freq is None:
        # Reuse the per-accord tally instead of re-scanning every row of
        # df['accords'] for every accord of every fragrance (quadratic work).
        # Rebuild accord_counter if df is filtered again before calling this.
        doc_freq = accord_counter
    if n_docs is None:
        n_docs = total_fragrances

    grouped_strengths = {}
    for group, members in groups.items():
        group_strength = 0
        for accord in members:
            if accord in accords_dict:
                tf = accords_dict[accord]
                # +1 smoothing on both sides keeps the ratio finite for unseen accords.
                idf = np.log((n_docs + 1) / (doc_freq.get(accord, 0) + 1))
                group_strength += tf * idf
        grouped_strengths[group] = group_strength

    return grouped_strengths

In [None]:
# One dict of group strengths per row, stored temporarily in a single column.
df['grouped_accords'] = df['accords'].apply(compute_tfidf)

In [None]:
# Expand each per-row dict into one column per key; fill any gaps with 0.
grouped_df = df['grouped_accords'].apply(pd.Series).fillna(0)

In [None]:
# Swap the temporary dict column for the expanded columns (joined on the index).
df = df.drop(columns=['grouped_accords']).join(grouped_df)

In [None]:
df

In [None]:
# Weight per gender vote label: female-leaning labels are negative,
# male-leaning labels positive, "unisex" is 0.
gender_mapping = {
    "female": -2, "more female": -1, "unisex": 0,
    "more male": 1, "male": 2
}

# Weight per price/value vote label: "overpriced" labels are negative,
# "value" labels positive, "ok" is 0.
price_mapping = {
    "way overpriced": -2, "overpriced": -1, "ok": 0,
    "good value": 1, "great value": 2
}

In [None]:
def compute_weighted_score(vote_dict, mapping):
    """Return the vote-weighted average of the weights in ``mapping``.

    ``vote_dict`` maps a label to its vote count and may be given as a dict or
    as the text of a dict literal. Labels missing from ``mapping`` add nothing
    to the weighted sum, but their votes still count in the denominator.
    Returns 0 when the counts sum to zero.
    """
    votes = ast.literal_eval(vote_dict) if isinstance(vote_dict, str) else vote_dict

    denominator = sum(votes.values())
    if denominator == 0:
        return 0

    numerator = 0
    for label, count in votes.items():
        if label in mapping:
            numerator += mapping[label] * count
    return numerator / denominator

In [None]:
# Score the gender and price/value vote tallies with their respective weights;
# Series.apply forwards the extra keyword argument to compute_weighted_score.
df['gender_score'] = df['gender'].apply(compute_weighted_score, mapping=gender_mapping)
df['priceValue_score'] = df['priceValue'].apply(compute_weighted_score, mapping=price_mapping)

In [None]:
df

In [None]:
def compute_time(tod_dict):
    """Return the 'day' value minus the 'night' value of a time-of-day tally.

    ``tod_dict`` may be a dict or the text of a dict literal; a missing key
    counts as 0.
    """
    votes = ast.literal_eval(tod_dict) if isinstance(tod_dict, str) else tod_dict
    day_votes = votes.get('day', 0)
    night_votes = votes.get('night', 0)
    return day_votes - night_votes

In [None]:
# Positive values lean towards 'day', negative towards 'night'.
df['timeOfDay_score'] = df['timeOfDay'].apply(compute_time)

In [None]:
df

In [None]:
def compute_season_score(votes):
    """Return (summer + spring) minus (fall + winter) for a season tally.

    ``votes`` may be a dict or the text of a dict literal; a missing season
    counts as 0.
    """
    tally = ast.literal_eval(votes) if isinstance(votes, str) else votes
    warm_votes = tally.get('summer', 0) + tally.get('spring', 0)
    cold_votes = tally.get('fall', 0) + tally.get('winter', 0)
    return warm_votes - cold_votes
    

In [None]:
# Weight per season label: cold seasons are negative, warm seasons positive.
# Each season must appear exactly once — a repeated key in a dict literal
# silently overwrites the earlier entry.
# NOTE(review): compute_season_score does not read this mapping.
season_mapping = {
    "winter": -1, "fall": -0.5, "spring": 0.5, "summer": 1
}

In [None]:
# Positive values lean towards summer/spring, negative towards fall/winter.
df['season_score'] = df['seasons'].apply(compute_season_score)

In [None]:
df

In [None]:
df.columns.tolist()

In [None]:
# Output column order: identifiers and ratings, the four vote-based scores,
# one column per accord group, then the notes breakdown.
identity_columns = ['name', 'brand', 'ratingValue', 'ratingCount']
score_columns = ['gender_score', 'priceValue_score', 'timeOfDay_score', 'season_score']
selected = identity_columns + score_columns + list(accord_groups) + ['notesBreakdown']

In [None]:
selected

In [None]:
# Independent copy of the selected columns; later changes to df do not reach newdf.
newdf = df[selected].copy()

In [None]:
newdf

In [None]:
# Round the derived numeric columns to 5 decimal places: first the accord-group
# strengths, then the four vote-based scores.
score_columns = ['gender_score', 'priceValue_score', 'timeOfDay_score', 'season_score']
for column_batch in (list(accord_groups), score_columns):
    newdf.loc[:, column_batch] = newdf.loc[:, column_batch].round(5)

In [None]:
# Order rows by brand, then by name within each brand.
newdf = newdf.sort_values(by=["brand", "name"])

In [None]:
# Parse notesBreakdown from its text form. Values that are not strings (already
# parsed by an earlier run of this cell, or missing) are left untouched, so the
# cell can be re-run without ast.literal_eval raising on them.
# NOTE(review): newdf was copied from df before this cell, so this does not
# change newdf or the exported CSV — confirm whether that is intended.
df['notesBreakdown'] = df['notesBreakdown'].apply(
    lambda x: ast.literal_eval(x) if isinstance(x, str) else x
)

In [None]:
# Renumber rows 0..n-1 after sorting, discarding the old index labels.
newdf = newdf.reset_index(drop=True)

In [None]:
newdf

In [None]:
# Write the final table to fragrance_data.csv without the index column.
newdf.to_csv("fragrance_data.csv", index=False)

In [None]:
# Summary statistics of the exported table.
alldata = newdf.describe()

In [None]:
alldata

In [None]:
newdf.columns.tolist()