In [2]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from scipy.sparse import csr_matrix, hstack
from sklearn.cluster import DBSCAN
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler

In [5]:
# Load the raw export.
# NOTE(review): the stray output below this cell looks like the tail of a pandas
# DtypeWarning (mixed-type columns). low_memory=False lets pandas infer each
# column's dtype from the whole file at once; confirm, or pin `dtype=` explicitly.
source_data = pd.read_csv('view_food_clean.csv', low_memory=False)

# Keep only the rows whose `merged_to` is missing and whose `name` is present.
keep_mask = source_data['merged_to'].isna() & source_data['name'].notna()

# Copy AFTER filtering: the result is an independent frame, so the column
# assignments made in later cells cannot raise SettingWithCopyWarning, and the
# unfiltered `source_data` stays untouched for reference.
source_data_copy = source_data.loc[keep_mask].copy()

  source_data = pd.read_csv('view_food_clean.csv')


In [7]:
# Columns whose text is glued together into one document per row.
TEXT_COLS = [
    'name',
    'name_search',
    'synonyms',
    'brands',
    'brands_search',
    'bron',
    'categories',
]

# Columns used as numeric features by the scaling cell.
# NOTE(review): 'remarks_carbohydrates' reads like a free-text remarks field —
# confirm it is really numeric (the displayed frame also has a 'sugar' column
# that is not used here).
NUMERIC_COLS = [
    'energy',
    'remarks_carbohydrates',
    'protein',
    'fat',
    'saturated_fatty_acid',
    'carbohydrates',
]

# Missing text becomes '' so that every cell can take part in the join below.
source_data_copy[TEXT_COLS] = source_data_copy[TEXT_COLS].fillna('')

# One space-separated document per row, built from all text columns in order.
source_data_copy['concatenated_text'] = source_data_copy[TEXT_COLS].apply(' '.join, axis=1)

# TF-IDF encode the concatenated documents (IDF weighting requested explicitly).
tfidf_vectorizer = TfidfVectorizer(use_idf=True)
tfidf_vectors = tfidf_vectorizer.fit_transform(source_data_copy['concatenated_text'])

In [9]:
# Missing numeric values are replaced by 0 before scaling (written back into
# source_data_copy, so the frame no longer holds NaN in these columns afterwards).
source_data_copy[NUMERIC_COLS] = source_data_copy[NUMERIC_COLS].fillna(0)

# Fit a StandardScaler (default settings) on the numeric columns and scale them.
scaler = StandardScaler()
scaled_numeric_vectors = scaler.fit_transform(source_data_copy[NUMERIC_COLS])

# Append the scaled numeric columns to the right of the TF-IDF matrix; both have
# one row per row of source_data_copy. `csr_matrix` is imported in the top import
# cell together with `hstack`. format='csr' pins the result to CSR instead of
# relying on scipy's default output format, so rows can be sliced directly.
combined_vectors = hstack(
    [tfidf_vectors, csr_matrix(scaled_numeric_vectors)],
    format='csr',
)

In [6]:
# TF-IDF vectorizer for the product names only; any vectorizer options go here.
# IDF weighting is TfidfVectorizer's default behaviour.
# NOTE(review): this rebinds `tfidf_vectorizer`, which the text-feature cell also
# assigns (there it is fitted on 'concatenated_text') — whichever cell ran last wins.
tfidf_vectorizer = TfidfVectorizer()

# Fit on every name and keep the resulting matrix, one row per document.
tfidf_vectorizer_vectors = tfidf_vectorizer.fit_transform(source_data_copy.loc[:, 'name'])

# Row of the first document, kept separately for inspection.
first_vector_tfidfvectorizer = tfidf_vectorizer_vectors[0]

In [7]:
# Cluster the name TF-IDF rows with DBSCAN; cosine distance is used because it
# compares text vectors better than the default metric.
dbscan = DBSCAN(
    metric='cosine',
    eps=0.3,
    min_samples=3,
)

# Fit on the TF-IDF matrix (one row per product) and store each row's label in
# the 'cluster_id' column of source_data_copy.
labels = dbscan.fit_predict(tfidf_vectorizer_vectors)
source_data_copy['cluster_id'] = labels

# The label -1 marks noise points, so it is not counted as a cluster.
n_clusters = len(set(labels) - {-1})
nonly_noise = int(np.count_nonzero(labels == -1))

print("Estimated number of clusters: %d" % n_clusters)
print("Estimated number of noise points: %d" % nonly_noise)

Estimated number of clusters: 874
Estimated number of noise points: 10361


In [8]:
# Display the filtered frame (pandas truncates the middle rows/columns); it now
# includes the 'cluster_id' column written by the DBSCAN cell.
source_data_copy

Unnamed: 0,id,name,name_search,active,energy,protein,fat,saturated_fatty_acid,carbohydrates,sugar,...,bron,user_id,deleted,categories,barcode,merged_to,created,updated,app_ver,cluster_id
0,24615,Dolce Gusto Lungo,dolce gusto lungo,1,0.0,0.0,0.0,0.0,0.0,0.0,...,Nescafé,,,dranken,7613031794331;5400113609087;8445290872203,,2023-01-01 00:00:00,2024-12-12 16:40:08,,-1
1,24616,Dolce Gusto espresso intenso,dolce gusto espresso intenso,1,0.0,0.0,0.0,0.0,0.0,0.0,...,Nescafé,,,dranken,7613036828444,,2023-01-01 00:00:00,2024-12-12 16:40:08,,0
2,24617,Dolce Gusto grande intenso,dolce gusto grande intenso,1,0.0,0.0,0.0,0.0,0.0,0.0,...,Nescafé,,,dranken,,,2023-01-01 00:00:00,2024-12-12 16:40:08,,0
3,24618,Dolce Gusto cappuccino,dolce gusto cappuccino,1,36.0,1.6,1.6,1.0,3.5,3.5,...,Nescafé,,,dranken,7613036306294;7613036303477,,2023-01-01 00:00:00,2025-03-12 11:52:28,,-1
4,24619,Dolce Gusto cappuccino ice,dolce gusto cappuccino ice,1,48.0,1.8,1.2,0.8,7.3,6.8,...,Nescafé,,,dranken,,,2023-01-01 00:00:00,2024-12-12 16:40:08,,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17910,45480,Maiswafels Met Pizzasmaak En Kruiden,maiswafels met pizzasmaak en kruiden,1,415.0,7.4,8.7,0.8,75.0,2.8,...,,,,,8718907850094,,2025-09-27 22:56:05,2025-09-27 23:00:15,1.3.1,-1
17911,45481,Proteïne meerzadenbrood,proteine meerzadenbrood,1,270.0,21.3,13.1,1.5,10.3,1.4,...,,,,aardappelen en graanproducten,5410724425211;05410724425112,,2025-09-27 23:11:03,2025-09-27 23:15:13,1.3.1,-1
17912,45482,Veggie sea salade,veggie sea salade,0,318.0,5.0,32.0,2.5,2.3,1.2,...,,,,,5413345008504,,2025-09-28 09:48:11,2025-09-28 09:48:11,1.3.1,-1
17913,45483,Naam niet gevonden,naam niet gevonden,0,,,,,,,...,,,,,5715008111133,,2025-09-28 19:43:28,2025-09-28 19:43:28,1.3.1,749
