# explanation of dataset

labels: 
<br> 0: vegan
<br> 1: vegetarian
<br> 2: classic (neither vegetarian nor vegan label)

# settings

In [None]:
# categories to include; label ids map to names as 0 = 'vegan', 1 = 'vegetarian', 2 = 'classic'
considered_categories = [0, 1, 2]

# number of considered recipes (None = use the whole dataset)
number_of_recipes = None
# number of samples per class (None = use the maximum number of samples)
samples_per_class = 2000

# imports

In [None]:
# imports
import pandas as pd
import numpy as np
import os
import ast
import re

from difflib import SequenceMatcher

from sklearn.preprocessing import MultiLabelBinarizer

In [None]:
# read source data
# NOTE: `dir` shadows the builtin of the same name; the name is kept because
# the cells that write the csv files build their paths from it.
dir = os.getcwd()
# build the path with os.path.join so it does not depend on Windows-style '\' separators
df = pd.read_csv(os.path.join(dir, 'data', 'recipes_w_search_terms.csv'))

# preprocessing

## add labels 

In [None]:
# add labels by search terms in dataset:
# 0 if the search terms contain 'vegan', otherwise 1 if they contain
# 'vegetarian', otherwise 2
labels = []
for terms in df.search_terms:
    if 'vegan' in terms:
        labels.append(0)
    elif 'vegetarian' in terms:
        labels.append(1)
    else:
        labels.append(2)
df = df.assign(label=labels)

## save labelled dataset to file

In [None]:
# write the labelled dataset into the data folder; os.path.join keeps the
# path independent of the operating system's separator
df.to_csv(os.path.join(dir, 'data', 'recipes_w_search_terms_labelled.csv'), index=False)

## shorten and filter dataset

In [None]:
# shorten dataset to the manually set maximum length (None keeps all rows)
if number_of_recipes is not None:
    df = df[:number_of_recipes]

# filter for necessary columns
df = df[['ingredients', 'label']]

# filter labels (if not all categories shall be included)
df = df[df.label.isin(considered_categories)]

## eval distribution of classes

In [None]:
# create separate dataframes for the classes 0 (vegan), 1 (vegetarian) and 2 (classic)
vegan_df, vegetarian_df, classic_df = (df[df.label == class_id] for class_id in (0, 1, 2))

In [None]:
# check if a manual number of samples per class was set. If not, use the size
# of the smallest class. Classes without any rows (e.g. excluded through
# considered_categories) are ignored, because they would otherwise force the
# sample size to 0 and empty the whole dataset.
if samples_per_class is None:
    class_sizes = [
        len(class_df)
        for class_df in (vegan_df, vegetarian_df, classic_df)
        if len(class_df) > 0
    ]
    # default=0 covers the case that every class is empty
    samples_per_class = min_samples = min(class_sizes, default=0)

# shuffle each class and keep the first samples_per_class rows
vegan_df = vegan_df.sample(frac=1)[0:samples_per_class]
vegetarian_df = vegetarian_df.sample(frac=1)[0:samples_per_class]
classic_df = classic_df.sample(frac=1)[0:samples_per_class]

# concatenate dataframes
df = pd.concat([vegan_df, vegetarian_df, classic_df], ignore_index=True)

## preprocess ingredients names

In [None]:
# parse the string stored in each "ingredients" cell into a Python object (a list of ingredient names)
df = df.assign(ingredients=list(map(ast.literal_eval, df.ingredients)))

In [None]:
# remove all words in ingredients which include numbers
cleaned_ingredients = []
for recipe in df.ingredients:
    cleaned_recipe = []
    for ingredient in recipe:
        # keep only the words without any digit and re-join them with single spaces
        kept_words = [word for word in ingredient.split() if re.search(r'\d', word) is None]
        cleaned_recipe.append(' '.join(kept_words))
    cleaned_ingredients.append(cleaned_recipe)
df = df.assign(ingredients=cleaned_ingredients)

## one hot encoding and split labels and features
Labels and features are split before merging; without the split there is a risk that the 'label' column is deleted when columns with similar names are merged.

In [None]:
# one hot encoding FEATURES
# binarizer configured to return a sparse matrix
mlb = MultiLabelBinarizer(sparse_output=True)

# take the ingredient lists out of df and encode them
encoded = mlb.fit_transform(df.pop('ingredients'))
encoded_df = pd.DataFrame.sparse.from_spmatrix(
    encoded,
    index=df.index,
    columns=mlb.classes_,
)

# attach the encoded ingredient columns to the remaining columns of df
df = df.join(encoded_df)

# drop the first column (the labels) to get the features dataframe
df_features = df.drop(df.columns[[0]], axis=1)

In [None]:
# labels dataframe: one hot encoding of the 'label' column
df_labels = pd.get_dummies(df['label'])

## merge columns with similar names 

In [None]:
# show the first 100 feature names
df_features.columns[:100]

In [None]:
# save the number of feature columns before merging
before_merge = df_features.shape[1]

In [None]:
# if a column name has a similarity of more than 90% (SequenceMatcher ratio)
# to an earlier column name, the column is renamed to that earlier name so
# that the groupby below sums both into one column
similar_names = {}  # renamed column -> name it is renamed to
ingredients = []  # column names without a similar earlier name
old_ingredients = []  # add ingredients one after another to this list so that new ingredients are only compared to those for runtime optimization
for column in df_features.columns:
    for ing in old_ingredients:
        if SequenceMatcher(None, ing, column).ratio() > 0.9:
            similar_names[column] = ing
            # stop at the first match so that each column maps to exactly one
            # name and the else branch only runs for unmatched columns
            break
    else:
        # the loop finished without a match: the column keeps its own name
        ingredients.append(column)
    # every name is added, including renamed ones, so a later column can
    # also match a name that was itself renamed
    old_ingredients.append(column)

# rename all matched columns in one pass instead of once per match
df_features = df_features.rename(columns=similar_names)

# merge columns with same name
# NOTE(review): groupby(..., axis=1) is deprecated in newer pandas releases - confirm against the installed version
df_features = df_features.groupby(level = 0, axis = 1).sum()

# print similar names
similar_names

In [None]:
# print the number of feature columns before and after merging
print(
    'number of features before merge:', before_merge,
    '\nnumber of features after merge: ', df_features.shape[1],
)

# save preprocessed df to file

In [None]:
# write features and labels into the data folder; os.path.join keeps the
# paths independent of the operating system's separator
df_features.to_csv(os.path.join(dir, 'data', 'df_features.csv'), index=False)
df_labels.to_csv(os.path.join(dir, 'data', 'df_labels.csv'), index=False)