In [16]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder, MultiLabelBinarizer
from sklearn.model_selection import train_test_split

In [17]:
df = pd.read_excel('Scraping_Finished.xlsx')
df.head()

Unnamed: 0,product_name,collection_name,description,gender,product_standard_price,stock_status,ingredient,ingredient_number,size,size_number,note,note_number
0,Fleurs de l'Himalaya,THE ICONIC COLLECTION,"eau de parfum, 50 ml",Women,"€49,90",In stock,,,Regular - 50 ml,1.0,,
1,Fleurs de l'Himalaya,THE ICONIC COLLECTION,"eau de parfum, 50 ml",Women,"€49,90",In stock,,,Mini - 15 ml,2.0,,
2,Fleurs de l'Himalaya,THE ICONIC COLLECTION,"eau de parfum, 50 ml",Women,"€49,90",In stock,,,,,Wild Orchid,3.0
3,Fleurs de l'Himalaya,THE ICONIC COLLECTION,"eau de parfum, 50 ml",Women,"€49,90",In stock,,,,,Himalayan Peony,2.0
4,Fleurs de l'Himalaya,THE ICONIC COLLECTION,"eau de parfum, 50 ml",Women,"€49,90",In stock,,,,,Musk,1.0


In [18]:
df['product_id'] = (df['product_name'] != df['product_name'].shift()).cumsum()

columns = df.columns.tolist()
columns = ['product_id'] + [col for col in columns if col != 'product_id']
df = df[columns]
df.head()

Unnamed: 0,product_id,product_name,collection_name,description,gender,product_standard_price,stock_status,ingredient,ingredient_number,size,size_number,note,note_number
0,1,Fleurs de l'Himalaya,THE ICONIC COLLECTION,"eau de parfum, 50 ml",Women,"€49,90",In stock,,,Regular - 50 ml,1.0,,
1,1,Fleurs de l'Himalaya,THE ICONIC COLLECTION,"eau de parfum, 50 ml",Women,"€49,90",In stock,,,Mini - 15 ml,2.0,,
2,1,Fleurs de l'Himalaya,THE ICONIC COLLECTION,"eau de parfum, 50 ml",Women,"€49,90",In stock,,,,,Wild Orchid,3.0
3,1,Fleurs de l'Himalaya,THE ICONIC COLLECTION,"eau de parfum, 50 ml",Women,"€49,90",In stock,,,,,Himalayan Peony,2.0
4,1,Fleurs de l'Himalaya,THE ICONIC COLLECTION,"eau de parfum, 50 ml",Women,"€49,90",In stock,,,,,Musk,1.0


In [19]:
products = df.groupby('product_name').agg({
    'collection_name': 'first',
    'gender': 'first',
    'description': 'first',
    'stock_status': 'first'
}).reset_index()
products['product_id'] = range(1, len(products) + 1)
products = products[['product_id', 'product_name', 'collection_name', 'gender', 'description', 'stock_status']]

products.head()

Unnamed: 0,product_id,product_name,collection_name,gender,description,stock_status
0,1,Bleu Byzantin,THE ICONIC COLLECTION,Men,"eau de parfum, 50 ml",In stock
1,2,Bois Royal,THE ICONIC COLLECTION,Men,"eau de parfum, 50 ml",In stock
2,3,Ciel Rouge,THE ICONIC COLLECTION,Women,"eau de parfum, 50 ml",In stock
3,4,Eau de Parfum Oudh,THE RITUAL OF OUDH,Women,"Eau de Parfum Oudh, 50 ml",In stock
4,5,Fleurs de l'Himalaya,THE ICONIC COLLECTION,Women,"eau de parfum, 50 ml",In stock


In [20]:
# 2. Kollektionen-Tabelle
collections = df[['collection_name']].drop_duplicates()
collections['collection_id'] = range(1, len(collections) + 1)
collections = collections[['collection_id', 'collection_name']]
collections.head()

Unnamed: 0,collection_id,collection_name
0,1,THE ICONIC COLLECTION
218,2,THE RITUAL OF OUDH
304,3,HOMME


In [21]:
notes = df[df['note'].notna()][['product_name', 'note', 'note_number']]
notes['note_id'] = range(1, len(notes) + 1)
notes = notes.merge(products[['product_id', 'product_name']], on='product_name', how='left')
notes = notes[['note_id', 'product_id', 'note', 'note_number']]
notes.head()

Unnamed: 0,note_id,product_id,note,note_number
0,1,5,Wild Orchid,3.0
1,2,5,Himalayan Peony,2.0
2,3,5,Musk,1.0
3,4,7,Neroli,3.0
4,5,7,Mimosa,2.0


In [22]:
ingredients = df[df['ingredient'].notna()][['product_name', 'ingredient', 'ingredient_number']]
ingredients['ingredient_id'] = range(1, len(ingredients) + 1)
ingredients = ingredients.merge(products[['product_id', 'product_name']], on='product_name', how='left')
ingredients = ingredients[['ingredient_id', 'product_id', 'ingredient', 'ingredient_number']]

ingredients.head()

Unnamed: 0,ingredient_id,product_id,ingredient,ingredient_number
0,1,5,Alcohol Denat.,10.0
1,2,5,Parfum/Fragrance,9.0
2,3,5,Aqua/Water,8.0
3,4,5,Limonene,7.0
4,5,5,Linalool,6.0


In [23]:
prices = df[['product_name', 'product_standard_price']].drop_duplicates()
prices['price_id'] = range(1, len(prices) + 1)
prices = prices.merge(products[['product_id', 'product_name']], on='product_name', how='left')
prices = prices[['price_id', 'product_id', 'product_standard_price']]

prices.head()

Unnamed: 0,price_id,product_id,product_standard_price
0,1,5,"€49,90"
1,2,7,"€49,90"
2,3,12,"€49,90"
3,4,20,"€17,90"
4,5,16,"€17,90"


In [24]:
products = products.merge(collections, on='collection_name', how='left')
products = products[['product_id', 'product_name', 'collection_id', 'gender', 'description', 'stock_status']]

products.head()

Unnamed: 0,product_id,product_name,collection_id,gender,description,stock_status
0,1,Bleu Byzantin,1,Men,"eau de parfum, 50 ml",In stock
1,2,Bois Royal,1,Men,"eau de parfum, 50 ml",In stock
2,3,Ciel Rouge,1,Women,"eau de parfum, 50 ml",In stock
3,4,Eau de Parfum Oudh,2,Women,"Eau de Parfum Oudh, 50 ml",In stock
4,5,Fleurs de l'Himalaya,1,Women,"eau de parfum, 50 ml",In stock


In [25]:
class FeatureEncoders:
    def __init__(self):
        self.gender_encoder = LabelEncoder()
        self.collection_encoder = LabelEncoder()
        self.note_encoder = MultiLabelBinarizer()

In [26]:
def prepare_data():
    # Verbinden der Tabellen
    df = products.merge(collections, on='collection_id')
    df = df.merge(prices, on='product_id')
    
    # Aggregieren der Noten und Zutaten pro Produkt
    notes_agg = notes.groupby('product_id')['note'].apply(list).reset_index()
    ingredients_agg = ingredients.groupby('product_id')['ingredient'].apply(list).reset_index()
    
    df = df.merge(notes_agg, on='product_id', how='left')
    df = df.merge(ingredients_agg, on='product_id', how='left')
    
    # Konvertieren der Preisinformationen
    df['price'] = df['product_standard_price'].str.replace('€', '').str.replace(',', '.').astype(float)
    
    return df

In [27]:
def engineer_features(df, encoders=None):
    if encoders is None:
        encoders = FeatureEncoders()
        # Encoder mit allen möglichen Werten trainieren
        encoders.gender_encoder.fit(df['gender'])
        encoders.collection_encoder.fit(df['collection_name'])
        
        # Alle verfügbaren Noten für das Training des Encoders sammeln
        all_notes = [note for notes_list in df['note'].dropna() for note in notes_list]
        encoders.note_encoder.fit([set(all_notes)])

    # Kategorische Variablen encodieren
    df['gender_encoded'] = encoders.gender_encoder.transform(df['gender'])
    df['collection_encoded'] = encoders.collection_encoder.transform(df['collection_name'])
    
    # Noten in binäre Spalten umwandeln
    note_features = encoders.note_encoder.transform(df['note'].fillna('').apply(lambda x: x if isinstance(x, list) else []))
    note_columns = [f'note_{note}' for note in encoders.note_encoder.classes_]
    note_df = pd.DataFrame(note_features, columns=note_columns)
    
    # Zusammenführen der Features
    df = pd.concat([df, note_df], axis=1)
    
    return df, encoders


In [28]:
def train_model(X, y):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    model = DecisionTreeClassifier(random_state=42)
    model.fit(X_train, y_train)
    
    print(f"Model accuracy: {model.score(X_test, y_test)}")
    
    return model

In [29]:
def create_recommendation_system():
    df = prepare_data()
    df, encoders = engineer_features(df)
    
    # Features aktualisieren um alle Noten-Spalten einzuschließen
    note_features = [f'note_{note}' for note in encoders.note_encoder.classes_]
    features = ['price', 'gender_encoded', 'collection_encoded'] + note_features
    
    X = df[features]
    y = df['product_id']
    
    model = train_model(X, y)
    
    return model, df, features, encoders

In [None]:
def get_user_preferences(df, encoders):
    # Basisfeatures bestimmen
    note_features = [f'note_{note}' for note in encoders.note_encoder.classes_]
    all_features = ['price', 'gender_encoded'] + note_features
    
    preferences = pd.DataFrame(columns=all_features)
    
    # Preis sammeln
    price = float(input("Gewünschter Preis (in Euro): "))
    
    # Geschlecht sammeln
    while True:
        print("\nGeschlecht-Optionen:")
        print("1. Women")
        print("2. Men")
        print("3. Divers (zeigt Produkte für alle Geschlechter)")
        gender_choice = input("Bitte wählen Sie eine Option (1/2/3): ")
        
        gender_map = {"1": "Women", "2": "Men", "3": "Divers"}
        if gender_choice in gender_map:
            gender = gender_map[gender_choice]
            gender_encoded = encoders.gender_encoder.transform([gender])[0]
            break
        print("Ungültige Auswahl. Bitte wählen Sie 1, 2 oder 3.")
    
    # Noten sammeln
    print("\nVerfügbare Noten:")
    available_notes = list(encoders.note_encoder.classes_)
    for i, note in enumerate(available_notes):
        print(f"{note}", end=", " if i < len(available_notes)-1 else "\n")
    
    selected_notes = []
    while True:
        note = input("\nGeben Sie eine gewünschte Note ein (oder 'fertig' zum Beenden): ")
        if note.lower() == 'fertig':
            break
        if note in available_notes:
            selected_notes.append(note)
        else:
            print("Diese Note ist nicht verfügbar. Bitte wählen Sie aus der Liste.")
    
    # Preferences DataFrame erstellen
    preferences.loc[0] = 0  # Alle Spalten mit 0 initialisieren
    preferences.loc[0, 'price'] = price
    preferences.loc[0, 'gender_encoded'] = gender_encoded
    
    # Ausgewählte Noten auf 1 setzen
    for note in selected_notes:
        preferences.loc[0, f'note_{note}'] = 1
    
    return preferences, gender

In [31]:
def recommend_product(model, preferences, features, df):
    prediction = model.predict(preferences[features])
    recommended_product = df[df['product_id'] == prediction[0]].iloc[0]
    return recommended_product

In [34]:
model, df, features, encoders = create_recommendation_system()

print("\nVerfügbare Kollektionen:", ", ".join(encoders.collection_encoder.classes_))

# Benutzereinstellungen als DataFrame
user_prefs = get_user_preferences(df, encoders)

# Empfehlung generieren
recommended_product = recommend_product(model, user_prefs, features, df)

Model accuracy: 0.0

Verfügbare Kollektionen: HOMME, THE ICONIC COLLECTION, THE RITUAL OF OUDH
Gewünschter Preis (in Euro): 49.9
Geschlecht (Women/Men): Women
Bevorzugte Kollektion: THE ICONIC COLLECTION

Verfügbare Noten:
Ambrette, Arabian Amber, Bergamot, Blackcurrant, Blue Juniper Berry, Cardamom, Cedar Wood, Cherry Blossom, Cypress, Himalayan Peony, Leather, Lily of the Valley, Lotus Flower, Lychee, Mandarin, Mimosa, Musk, Nashi Pear, Neroli, Orris, Patchouli, Pine, Pink Pepper, Sage, Sandalwood, Tuberose, Vanilla, Violet, Water Mint, White Musk, White Pepper, Wild Orchid

Geben Sie eine gewünschte Note ein (oder 'fertig' zum Beenden): Wild Orchid	
Diese Note ist nicht verfügbar. Bitte wählen Sie aus der Liste.

Geben Sie eine gewünschte Note ein (oder 'fertig' zum Beenden): Himalayan Peony

Geben Sie eine gewünschte Note ein (oder 'fertig' zum Beenden): Musk

Geben Sie eine gewünschte Note ein (oder 'fertig' zum Beenden): fertig


In [35]:
print(f"\nEmpfohlenes Produkt: {recommended_product['product_name']}")
print(f"Beschreibung: {recommended_product['description']}")
print(f"Preis: {recommended_product['product_standard_price']}")
print(f"Kollektion: {recommended_product['collection_name']}")
print(f"Geschlecht: {recommended_product['gender']}")
print(f"Noten: {', '.join(recommended_product['note']) if isinstance(recommended_product['note'], list) else 'Keine'}")
print(f"Inhaltsstoffe: {', '.join(recommended_product['ingredient']) if isinstance(recommended_product['ingredient'], list) else 'Keine'}")


Empfohlenes Produkt: Fleurs de l'Himalaya
Beschreibung: eau de parfum, 50 ml
Preis: €49,90
Kollektion: THE ICONIC COLLECTION
Geschlecht: Women
Noten: Wild Orchid, Himalayan Peony, Musk
Inhaltsstoffe: Alcohol Denat., Parfum/Fragrance, Aqua/Water, Limonene, Linalool, Hexyl Cinnamal, Citral, Citronellol, Geraniol, Eugenol.
