In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
%matplotlib inline
import plotly.express as px

#Libraries for preprocessing
from gensim.parsing.preprocessing import remove_stopwords
import string
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
import webcolors

#Download once if using NLTK for preprocessing
import nltk
nltk.download('punkt')

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.model_selection import GridSearchCV
from fuzzywuzzy import fuzz

#Libraries for clustering
from sklearn.cluster import KMeans

[nltk_data] Downloading package punkt to /Users/chris/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
df = pd.read_csv('view_food_clean.csv')

  df = pd.read_csv('view_food_clean.csv')


In [3]:
TEXT_COLS = [
    'name', 'name_search', 'remarks', 'synonyms', 'brands', 'brands_search', 'bron', 'categories'
]

In [4]:
# create cleaned dataframe df_2 from the columns listed in TEXT_COLS
translator_punct = str.maketrans('', '', string.punctuation)
translator_digits = str.maketrans('', '', string.digits)

def clean_text(x):
    if pd.isna(x):
        return ''
    s = str(x)
    s = remove_stopwords(s)
    s = s.translate(translator_punct)
    s = s.translate(translator_digits)
    return s.strip()

cols = [c for c in TEXT_COLS if c in df.columns]
df_2 = df[cols].applymap(clean_text)

df_2.head()

  df_2 = df[cols].applymap(clean_text)


Unnamed: 0,name,name_search,remarks,synonyms,brands,brands_search,bron,categories
0,Dolce Gusto Lungo,dolce gusto lungo,Dolce Gusto Lungo,koffie,NescafÃ©,nescafe,NescafÃ©,dranken
1,Dolce Gusto espresso intenso,dolce gusto espresso intenso,Dolce Gusto espresso intenso,espresso koffie,NescafÃ©,nescafe,NescafÃ©,dranken
2,Dolce Gusto grande intenso,dolce gusto grande intenso,Dolce Gusto grande intenso,koffie,NescafÃ©,nescafe,NescafÃ©,dranken
3,Dolce Gusto cappuccino,dolce gusto cappuccino,Dolce Gusto cappuccino,koffie koffie met melk,NescafÃ©,nescafe,NescafÃ©,dranken
4,Dolce Gusto cappuccino ice,dolce gusto cappuccino ice,Dolce Gusto cappuccino ice,ijskoffie koffie,NescafÃ©,nescafe,NescafÃ©,dranken


In [5]:
# create df_3 by stemming text columns from df_2
df_3 = df_2.copy()

porter = PorterStemmer()

def stemSentence(sentence):
    if pd.isna(sentence) or sentence == '':
        return ''
    s = str(sentence)
    token_words = word_tokenize(s)
    stem_sentence = [porter.stem(word) for word in token_words]
    return ' '.join(stem_sentence)

cols_to_stem = [c for c in TEXT_COLS if c in df_3.columns]
df_3[cols_to_stem] = df_3[cols_to_stem].applymap(stemSentence)

df_3.head()

  df_3[cols_to_stem] = df_3[cols_to_stem].applymap(stemSentence)


Unnamed: 0,name,name_search,remarks,synonyms,brands,brands_search,bron,categories
0,dolc gusto lungo,dolc gusto lungo,dolc gusto lungo,koffi,nescafã©,nescaf,nescafã©,dranken
1,dolc gusto espresso intenso,dolc gusto espresso intenso,dolc gusto espresso intenso,espresso koffi,nescafã©,nescaf,nescafã©,dranken
2,dolc gusto grand intenso,dolc gusto grand intenso,dolc gusto grand intenso,koffi,nescafã©,nescaf,nescafã©,dranken
3,dolc gusto cappuccino,dolc gusto cappuccino,dolc gusto cappuccino,koffi koffi met melk,nescafã©,nescaf,nescafã©,dranken
4,dolc gusto cappuccino ice,dolc gusto cappuccino ice,dolc gusto cappuccino ice,ijskoffi koffi,nescafã©,nescaf,nescafã©,dranken


In [12]:
import re
from IPython.display import display

# find rows (in original df and in stemmed df_3) that contain non-ASCII / special characters

cols = [c for c in TEXT_COLS if c in df.columns]
non_ascii_re = r'[^\x00-\x7F]'

mask_df = df[cols].apply(lambda s: s.astype(str).str.contains(non_ascii_re, regex=True, na=False)).any(axis=1)
df_nonascii = df.loc[mask_df, cols + (['id'] if 'id' in df.columns else [])]

mask_df3 = df_3[cols].apply(lambda s: s.astype(str).str.contains(non_ascii_re, regex=True, na=False)).any(axis=1)
df3_nonascii = df_3.loc[mask_df3, cols]

print(f"Rows in df with non-ASCII characters: {len(df_nonascii)}")
display(df_nonascii)

print(f"\nRows in df_3 with non-ASCII characters: {len(df3_nonascii)}")
display(df3_nonascii)

Rows in df with non-ASCII characters: 5584


Unnamed: 0,name,name_search,remarks,synonyms,brands,brands_search,bron,categories,id
0,Dolce Gusto Lungo,dolce gusto lungo,Dolce Gusto Lungo,koffie,NescafÃ©,nescafe,NescafÃ©,dranken,24615
1,Dolce Gusto espresso intenso,dolce gusto espresso intenso,Dolce Gusto espresso intenso,"espresso, koffie",NescafÃ©,nescafe,NescafÃ©,dranken,24616
2,Dolce Gusto grande intenso,dolce gusto grande intenso,Dolce Gusto grande intenso,koffie,NescafÃ©,nescafe,NescafÃ©,dranken,24617
3,Dolce Gusto cappuccino,dolce gusto cappuccino,Dolce Gusto cappuccino,"koffie, koffie met melk",NescafÃ©,nescafe,NescafÃ©,dranken,24618
4,Dolce Gusto cappuccino ice,dolce gusto cappuccino ice,Dolce Gusto cappuccino ice,"ijskoffie, koffie",NescafÃ©,nescafe,NescafÃ©,dranken,24619
...,...,...,...,...,...,...,...,...,...
17906,MaÃ¯swafels dun sour cream and union flavour (AH),maiswafels dun sour cream and union flavour (ah),Galettes de mais Ã la crÃ¨me fraÃ®che et Ã l...,"cracker, galettes de mais, maiswafel",Albert Heijn,albert heijn,,aardappelen en graanproducten,45476
17907,AÃ¯ki rice chicken teriyaki (cup),aiki rice chicken teriyaki (cup),AÃ¯ki rice chicken teriyaki (cup),,AÃ¯ki,aiki,,bereid gerecht,45477
17908,Aiki noodles Hot & Spicy XL size,aiki noodles hot & spicy xl size,Aiki noodles Hot & Spicy XL size,"noodle snack, noodles",AÃ¯ki,aiki,,bereid gerecht,45478
17909,Kaas geraspt Emmental,kaas geraspt emmental,Fromage Emmental rÃ¢pÃ©,"emmental, emmentaler, fromage rape, gemalen ka...",Spar,spar,,melkproducten en vervangers,45479



Rows in df_3 with non-ASCII characters: 5584


Unnamed: 0,name,name_search,remarks,synonyms,brands,brands_search,bron,categories
0,dolc gusto lungo,dolc gusto lungo,dolc gusto lungo,koffi,nescafã©,nescaf,nescafã©,dranken
1,dolc gusto espresso intenso,dolc gusto espresso intenso,dolc gusto espresso intenso,espresso koffi,nescafã©,nescaf,nescafã©,dranken
2,dolc gusto grand intenso,dolc gusto grand intenso,dolc gusto grand intenso,koffi,nescafã©,nescaf,nescafã©,dranken
3,dolc gusto cappuccino,dolc gusto cappuccino,dolc gusto cappuccino,koffi koffi met melk,nescafã©,nescaf,nescafã©,dranken
4,dolc gusto cappuccino ice,dolc gusto cappuccino ice,dolc gusto cappuccino ice,ijskoffi koffi,nescafã©,nescaf,nescafã©,dranken
...,...,...,...,...,...,...,...,...
17906,maã¯swafel dun sour cream union flavour ah,maiswafel dun sour cream union flavour ah,galett mai ã la crã¨me fraã®ch et ã loignon ah,cracker galett mai maiswafel,albert heijn,albert heijn,,aardappelen en graanproducten
17907,aã¯ki rice chicken teriyaki cup,aiki rice chicken teriyaki cup,aã¯ki rice chicken teriyaki cup,,aã¯ki,aiki,,bereid gerecht
17908,aiki noodl hot spici xl size,aiki noodl hot spici xl size,aiki noodl hot spici xl size,noodl snack noodl,aã¯ki,aiki,,bereid gerecht
17909,kaa geraspt emment,kaa geraspt emment,fromag emment rã¢pã©,emment emmental fromag rape gemalen kaa gerasp...,spar,spar,,melkproducten en vervang
