In [16]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [3]:
df = pd.read_excel('Rituals_Product_Scraping.xlsx')
df.head()

Unnamed: 0,product_name,collection_name,note,note_number,ingredient,ingredient_number,gender,product_standard_price,description,stock_status
0,Ciel Rouge,THE ICONIC COLLECTION,Blackcurrant,3.0,,,Women,"€49,90","eau de parfum, 50 ml",Unknown stock status
1,Ciel Rouge,THE ICONIC COLLECTION,Tuberose,2.0,,,Women,"€49,90","eau de parfum, 50 ml",Unknown stock status
2,Ciel Rouge,THE ICONIC COLLECTION,Vanilla,1.0,,,Women,"€49,90","eau de parfum, 50 ml",Unknown stock status
3,Ciel Rouge,THE ICONIC COLLECTION,,,Alcohol Denat.,23.0,Women,"€49,90","eau de parfum, 50 ml",Unknown stock status
4,Ciel Rouge,THE ICONIC COLLECTION,,,Parfum/Fragrance,22.0,Women,"€49,90","eau de parfum, 50 ml",Unknown stock status


In [7]:
df['product_id'] = (df['product_name'] != df['product_name'].shift()).cumsum()

columns = df.columns.tolist()
columns = ['product_id'] + [col for col in columns if col != 'product_id']
df = df[columns]
df.head()

Unnamed: 0,product_id,product_name,collection_name,note,note_number,ingredient,ingredient_number,gender,product_standard_price,description,stock_status
0,1,Ciel Rouge,THE ICONIC COLLECTION,Blackcurrant,3.0,,,Women,"€49,90","eau de parfum, 50 ml",Unknown stock status
1,1,Ciel Rouge,THE ICONIC COLLECTION,Tuberose,2.0,,,Women,"€49,90","eau de parfum, 50 ml",Unknown stock status
2,1,Ciel Rouge,THE ICONIC COLLECTION,Vanilla,1.0,,,Women,"€49,90","eau de parfum, 50 ml",Unknown stock status
3,1,Ciel Rouge,THE ICONIC COLLECTION,,,Alcohol Denat.,23.0,Women,"€49,90","eau de parfum, 50 ml",Unknown stock status
4,1,Ciel Rouge,THE ICONIC COLLECTION,,,Parfum/Fragrance,22.0,Women,"€49,90","eau de parfum, 50 ml",Unknown stock status


In [9]:
products = df.groupby('product_name').agg({
    'collection_name': 'first',
    'gender': 'first',
    'description': 'first',
    'stock_status': 'first'
}).reset_index()
products['product_id'] = range(1, len(products) + 1)
products = products[['product_id', 'product_name', 'collection_name', 'gender', 'description', 'stock_status']]

products.head()

Unnamed: 0,product_id,product_name,collection_name,gender,description,stock_status
0,1,Bleu Byzantin,THE ICONIC COLLECTION,Men,"eau de parfum, 50 ml",Unknown stock status
1,2,Bois Royal,THE ICONIC COLLECTION,Men,"eau de parfum, 50 ml",Unknown stock status
2,3,Ciel Rouge,THE ICONIC COLLECTION,Women,"eau de parfum, 50 ml",Unknown stock status
3,4,Eau de Parfum Oudh,THE RITUAL OF OUDH,Women,"Eau de Parfum Oudh, 50 ml",Unknown stock status
4,5,Fleurs de l'Himalaya,THE ICONIC COLLECTION,Women,"eau de parfum, 50 ml",Unknown stock status


In [11]:
# 2. Kollektionen-Tabelle
collections = df[['collection_name']].drop_duplicates()
collections['collection_id'] = range(1, len(collections) + 1)
collections = collections[['collection_id', 'collection_name']]
collections.head()

Unnamed: 0,collection_id,collection_name
0,1,THE ICONIC COLLECTION
83,2,THE RITUAL OF OUDH
275,3,HOMME


In [12]:
notes = df[df['note'].notna()][['product_name', 'note', 'note_number']]
notes['note_id'] = range(1, len(notes) + 1)
notes = notes.merge(products[['product_id', 'product_name']], on='product_name', how='left')
notes = notes[['note_id', 'product_id', 'note', 'note_number']]
notes.head()

Unnamed: 0,note_id,product_id,note,note_number
0,1,3,Blackcurrant,3.0
1,2,3,Tuberose,2.0
2,3,3,Vanilla,1.0
3,4,23,Nashi Pear,3.0
4,5,23,Lily of the Valley,2.0


In [13]:
ingredients = df[df['ingredient'].notna()][['product_name', 'ingredient', 'ingredient_number']]
ingredients['ingredient_id'] = range(1, len(ingredients) + 1)
ingredients = ingredients.merge(products[['product_id', 'product_name']], on='product_name', how='left')
ingredients = ingredients[['ingredient_id', 'product_id', 'ingredient', 'ingredient_number']]

ingredients.head()

Unnamed: 0,ingredient_id,product_id,ingredient,ingredient_number
0,1,3,Alcohol Denat.,23.0
1,2,3,Parfum/Fragrance,22.0
2,3,3,Aqua/Water,21.0
3,4,3,Benzyl Salicylate,20.0
4,5,3,Limonene,19.0


In [14]:
prices = df[['product_name', 'product_standard_price']].drop_duplicates()
prices['price_id'] = range(1, len(prices) + 1)
prices = prices.merge(products[['product_id', 'product_name']], on='product_name', how='left')
prices = prices[['price_id', 'product_id', 'product_standard_price']]

prices.head()

Unnamed: 0,price_id,product_id,product_standard_price
0,1,3,"€49,90"
1,2,23,"€17,90"
2,3,5,"€49,90"
3,4,17,"€17,90"
4,5,25,"€17,90"


In [15]:
products = products.merge(collections, on='collection_name', how='left')
products = products[['product_id', 'product_name', 'collection_id', 'gender', 'description', 'stock_status']]

products.head()

Unnamed: 0,product_id,product_name,collection_id,gender,description,stock_status
0,1,Bleu Byzantin,1,Men,"eau de parfum, 50 ml",Unknown stock status
1,2,Bois Royal,1,Men,"eau de parfum, 50 ml",Unknown stock status
2,3,Ciel Rouge,1,Women,"eau de parfum, 50 ml",Unknown stock status
3,4,Eau de Parfum Oudh,2,Women,"Eau de Parfum Oudh, 50 ml",Unknown stock status
4,5,Fleurs de l'Himalaya,1,Women,"eau de parfum, 50 ml",Unknown stock status


In [17]:
def prepare_data():
    # Verbinden der Tabellen
    df = products.merge(collections, on='collection_id')
    df = df.merge(prices, on='product_id')
    
    # Aggregieren der Noten und Zutaten pro Produkt
    notes_agg = notes.groupby('product_id')['note'].apply(list).reset_index()
    ingredients_agg = ingredients.groupby('product_id')['ingredient'].apply(list).reset_index()
    
    df = df.merge(notes_agg, on='product_id', how='left')
    df = df.merge(ingredients_agg, on='product_id', how='left')
    
    # Konvertieren der Preisinformationen
    df['price'] = df['product_standard_price'].str.replace('€', '').str.replace(',', '.').astype(float)
    
    return df

In [18]:
def engineer_features(df):
    # Feature Engineering
    df['note_count'] = df['note'].apply(lambda x: len(x) if isinstance(x, list) else 0)
    df['ingredient_count'] = df['ingredient'].apply(lambda x: len(x) if isinstance(x, list) else 0)
    
    # Kategorische Variablen encodieren
    le = LabelEncoder()
    df['gender_encoded'] = le.fit_transform(df['gender'])
    df['collection_encoded'] = le.fit_transform(df['collection_name'])
    
    return df

In [19]:
def train_model(X, y):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    model = DecisionTreeClassifier(random_state=42)
    model.fit(X_train, y_train)
    
    # Modellgenauigkeit ausgeben
    print(f"Model accuracy: {model.score(X_test, y_test)}")
    
    return model

In [20]:
def create_recommendation_system():
    df = prepare_data()
    df = engineer_features(df)
    
    features = ['price', 'gender_encoded', 'collection_encoded', 'note_count', 'ingredient_count']
    X = df[features]
    y = df['product_id']
    
    model = train_model(X, y)
    
    return model, df, features

In [21]:
def get_user_preferences(df):
    preferences = {}
    preferences['price'] = float(input("Gewünschter Preis (in Euro): "))
    
    gender = input("Geschlecht (Women/Men): ")
    le = LabelEncoder()
    preferences['gender_encoded'] = le.fit_transform([gender])[0]
    
    collection = input("Bevorzugte Kollektion: ")
    le = LabelEncoder()
    le.fit(df['collection_name'])
    preferences['collection_encoded'] = le.transform([collection])[0]
    
    preferences['note_count'] = int(input("Gewünschte Anzahl an Noten: "))
    preferences['ingredient_count'] = int(input("Gewünschte Anzahl an Zutaten: "))
    
    return preferences

In [22]:
def recommend_product(model, preferences, features, df):
    user_input = [preferences[feature] for feature in features]
    prediction = model.predict([user_input])[0]
    recommended_product = df[df['product_id'] == prediction].iloc[0]
    return recommended_product

In [25]:
model, df, features = create_recommendation_system()

user_prefs = get_user_preferences(df)
recommended_product = recommend_product(model, user_prefs, features, df)

Model accuracy: 0.0
Gewünschter Preis (in Euro): 49.9
Geschlecht (Women/Men): Women
Bevorzugte Kollektion: THE ICONIC COLLECTION
Gewünschte Anzahl an Noten: 3
Gewünschte Anzahl an Zutaten: 23




In [26]:
print(f"\nEmpfohlenes Produkt: {recommended_product['product_name']}")
print(f"Beschreibung: {recommended_product['description']}")
print(f"Preis: {recommended_product['product_standard_price']}")
print(f"Kollektion: {recommended_product['collection_name']}")
print(f"Geschlecht: {recommended_product['gender']}")


Empfohlenes Produkt: Voyage d'Ombre
Beschreibung: eau de parfum, 50 ml
Preis: €49,90
Kollektion: THE ICONIC COLLECTION
Geschlecht: Men
