In [94]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import OneHotEncoder, LabelEncoder
import re

In [95]:
data = pd.read_json('data/cocktail_dataset.json')
data

Unnamed: 0,id,name,category,glass,tags,instructions,imageUrl,alcoholic,createdAt,updatedAt,ingredients
0,11000,Mojito,Cocktail,Highball glass,"[IBA, ContemporaryClassic, Alcoholic, USA, Asi...",Muddle mint leaves with sugar and lime juice. ...,https://cocktails.solvro.pl/images/ingredients...,1,2024-08-18T19:01:17.000+00:00,2024-08-18T19:06:16.000+00:00,"[{'id': 170, 'name': 'Soda water', 'descriptio..."
1,11001,Old Fashioned,Cocktail,Old-fashioned glass,"[IBA, Classic, Alcoholic, Expensive, Savory]",Place sugar cube in old fashioned glass and sa...,https://cocktails.solvro.pl/images/ingredients...,1,2024-08-18T19:01:58.000+00:00,2024-08-18T19:06:17.000+00:00,"[{'id': 513, 'name': 'Water', 'description': '..."
2,11002,Long Island Tea,Ordinary Drink,Highball glass,"[Strong, Asia, StrongFlavor, Brunch, Vegetaria...",Combine all ingredients (except cola) and pour...,https://cocktails.solvro.pl/images/ingredients...,1,2024-08-18T19:01:58.000+00:00,2024-08-18T19:06:17.000+00:00,"[{'id': 305, 'name': 'Light Rum', 'description..."
3,11003,Negroni,Ordinary Drink,Old-fashioned glass,"[IBA, Classic]","Stir into glass over ice, garnish and serve.",https://cocktails.solvro.pl/images/ingredients...,1,2024-08-18T19:01:58.000+00:00,2024-08-18T19:06:17.000+00:00,"[{'id': 482, 'name': 'Sweet Vermouth', 'descri..."
4,11004,Whiskey Sour,Ordinary Drink,Old-fashioned glass,"[IBA, Classic, Alcoholic, ContemporaryClassic]","Shake with ice. Strain into chilled glass, gar...",https://cocktails.solvro.pl/images/ingredients...,1,2024-08-18T19:01:59.000+00:00,2024-08-18T19:06:18.000+00:00,"[{'id': 409, 'name': 'Powdered Sugar', 'descri..."
...,...,...,...,...,...,...,...,...,...,...,...
129,11985,Quarter Deck Cocktail,Ordinary Drink,Cocktail glass,,"Stir all ingredients with ice, strain into a c...",https://cocktails.solvro.pl/images/ingredients...,1,2024-08-18T19:11:43.000+00:00,2024-08-18T19:11:43.000+00:00,"[{'id': 305, 'name': 'Light Rum', 'description..."
130,11987,Queen Bee,Ordinary Drink,Cocktail glass,,"Shake all ingredients with ice, strain into a ...",https://cocktails.solvro.pl/images/ingredients...,1,2024-08-18T19:11:44.000+00:00,2024-08-18T19:11:44.000+00:00,"[{'id': 137, 'name': 'Coffee Brandy', 'descrip..."
131,11989,Queen Charlotte,Ordinary Drink,Collins glass,,Pour red wine and grenadine into a collins gla...,https://cocktails.solvro.pl/images/ingredients...,1,2024-08-18T19:11:45.000+00:00,2024-08-18T19:11:45.000+00:00,"[{'id': 250, 'name': 'Grenadine', 'description..."
132,11991,Queen Elizabeth,Ordinary Drink,Cocktail glass,,"Stir all ingredients with ice, strain into a c...",https://cocktails.solvro.pl/images/ingredients...,1,2024-08-18T19:11:45.000+00:00,2024-08-18T19:11:45.000+00:00,"[{'id': 2, 'name': 'Gin', 'description': 'Gin ..."


# Dropping unnecessary columns

In [96]:
def drop_unecessary_columns(data: pd.DataFrame, columns: list) -> pd.DataFrame:
    """Return a new frame that lacks the given columns.

    Args:
        data: Source frame; it is left untouched.
        columns: Labels of the columns to remove. Every label must exist in
            ``data`` (pandas raises ``KeyError`` otherwise).

    Returns:
        A frame with the remaining columns in their original order.
    """
    trimmed = data.drop(labels=columns, axis="columns")
    return trimmed

In [97]:
data = drop_unecessary_columns(data, ['imageUrl', 'createdAt', 'updatedAt', 'id'])
data

Unnamed: 0,name,category,glass,tags,instructions,alcoholic,ingredients
0,Mojito,Cocktail,Highball glass,"[IBA, ContemporaryClassic, Alcoholic, USA, Asi...",Muddle mint leaves with sugar and lime juice. ...,1,"[{'id': 170, 'name': 'Soda water', 'descriptio..."
1,Old Fashioned,Cocktail,Old-fashioned glass,"[IBA, Classic, Alcoholic, Expensive, Savory]",Place sugar cube in old fashioned glass and sa...,1,"[{'id': 513, 'name': 'Water', 'description': '..."
2,Long Island Tea,Ordinary Drink,Highball glass,"[Strong, Asia, StrongFlavor, Brunch, Vegetaria...",Combine all ingredients (except cola) and pour...,1,"[{'id': 305, 'name': 'Light Rum', 'description..."
3,Negroni,Ordinary Drink,Old-fashioned glass,"[IBA, Classic]","Stir into glass over ice, garnish and serve.",1,"[{'id': 482, 'name': 'Sweet Vermouth', 'descri..."
4,Whiskey Sour,Ordinary Drink,Old-fashioned glass,"[IBA, Classic, Alcoholic, ContemporaryClassic]","Shake with ice. Strain into chilled glass, gar...",1,"[{'id': 409, 'name': 'Powdered Sugar', 'descri..."
...,...,...,...,...,...,...,...
129,Quarter Deck Cocktail,Ordinary Drink,Cocktail glass,,"Stir all ingredients with ice, strain into a c...",1,"[{'id': 305, 'name': 'Light Rum', 'description..."
130,Queen Bee,Ordinary Drink,Cocktail glass,,"Shake all ingredients with ice, strain into a ...",1,"[{'id': 137, 'name': 'Coffee Brandy', 'descrip..."
131,Queen Charlotte,Ordinary Drink,Collins glass,,Pour red wine and grenadine into a collins gla...,1,"[{'id': 250, 'name': 'Grenadine', 'description..."
132,Queen Elizabeth,Ordinary Drink,Cocktail glass,,"Stir all ingredients with ice, strain into a c...",1,"[{'id': 2, 'name': 'Gin', 'description': 'Gin ..."


# Cleaning specific columns

In [None]:
# Here I am going to encode these columns as vectors and then reduce each vector to its norm, to keep the number of dimensions low (the maximum should be around 13 rows).

## 'tags'

In [98]:
data[~data['tags'].isna()]

Unnamed: 0,name,category,glass,tags,instructions,alcoholic,ingredients
0,Mojito,Cocktail,Highball glass,"[IBA, ContemporaryClassic, Alcoholic, USA, Asi...",Muddle mint leaves with sugar and lime juice. ...,1,"[{'id': 170, 'name': 'Soda water', 'descriptio..."
1,Old Fashioned,Cocktail,Old-fashioned glass,"[IBA, Classic, Alcoholic, Expensive, Savory]",Place sugar cube in old fashioned glass and sa...,1,"[{'id': 513, 'name': 'Water', 'description': '..."
2,Long Island Tea,Ordinary Drink,Highball glass,"[Strong, Asia, StrongFlavor, Brunch, Vegetaria...",Combine all ingredients (except cola) and pour...,1,"[{'id': 305, 'name': 'Light Rum', 'description..."
3,Negroni,Ordinary Drink,Old-fashioned glass,"[IBA, Classic]","Stir into glass over ice, garnish and serve.",1,"[{'id': 482, 'name': 'Sweet Vermouth', 'descri..."
4,Whiskey Sour,Ordinary Drink,Old-fashioned glass,"[IBA, Classic, Alcoholic, ContemporaryClassic]","Shake with ice. Strain into chilled glass, gar...",1,"[{'id': 409, 'name': 'Powdered Sugar', 'descri..."
5,Dry Martini,Cocktail,Cocktail glass,"[IBA, Classic, Christmas, Alcoholic]",Straight: Pour all ingredients into mixing gla...,1,"[{'id': 189, 'name': 'Dry Vermouth', 'descript..."
6,Daiquiri,Ordinary Drink,Cocktail glass,"[IBA, Classic, Beach]",Pour all ingredients into shaker with ice cube...,1,"[{'id': 305, 'name': 'Light Rum', 'description..."
7,Margarita,Ordinary Drink,Cocktail glass,"[IBA, ContemporaryClassic]",Rub the rim of the glass with the lime slice t...,1,"[{'id': 4, 'name': 'Tequila', 'description': '..."
8,Manhattan,Cocktail,Cocktail glass,"[IBA, Classic, Alcoholic]","Stirred over ice, strained into a chilled glas...",1,"[{'id': 20, 'name': 'Angostura Bitters', 'desc..."
9,Moscow Mule,Punch / Party Drink,Copper Mug,"[IBA, ContemporaryClassic]",Combine vodka and ginger beer in a highball gl...,1,"[{'id': 1, 'name': 'Vodka', 'description': 'Vo..."


In [99]:
# Gather the distinct tags used anywhere in the 'tags' column (cells that are None are skipped)
tags = {tag for tag_list in data['tags'] if tag_list is not None for tag in tag_list}

tags

{'Alcoholic',
 'Asia',
 'Beach',
 'Breakfast',
 'Brunch',
 'Chilli',
 'Christmas',
 'Citrus',
 'Classic',
 'Cold',
 'ContemporaryClassic',
 'Dairy',
 'DinnerParty',
 'Expensive',
 'Fruity',
 'Hangover',
 'IBA',
 'Mild',
 'NewEra',
 'Nutty',
 'Savory',
 'Sour',
 'Strong',
 'StrongFlavor',
 'Summer',
 'USA',
 'Vegan',
 'Vegetarian'}

In [100]:
def add_tag_vector_column(df: pd.DataFrame, tag_column: str) -> pd.DataFrame:
    """Replace a column of tag collections with a binary ``tag_vector`` column.

    The vocabulary is every distinct tag found in ``df[tag_column]``, sorted so
    that position ``i`` of every vector refers to the same tag. Each row gets a
    list of 0/1 flags of that length; rows whose cell is missing (``None`` or
    ``NaN``) or empty get an all-zero vector.

    Args:
        df: Frame containing ``tag_column``. It is not modified, so the cell
            calling this function can be re-run safely.
        tag_column: Name of the column whose cells are lists of tags.

    Returns:
        A new frame without ``tag_column`` and with ``tag_vector`` appended as
        the last column.
    """
    collection_types = (list, tuple, set, frozenset)

    # Anything that is not a collection (None, float NaN, ...) counts as
    # "no tags". A plain truthiness test would let NaN through and then fail
    # on the membership check. Sets also make that check O(1).
    row_tags = [
        set(cell) if isinstance(cell, collection_types) else set()
        for cell in df[tag_column]
    ]

    # Sorted vocabulary gives a stable, reproducible vector layout.
    unique_tags = sorted(set().union(*row_tags))

    vectors = [
        [1 if tag in present else 0 for tag in unique_tags]
        for present in row_tags
    ]

    result = df.drop(columns=tag_column)
    # Wrap in an object Series so pandas stores one list per row.
    result['tag_vector'] = pd.Series(vectors, index=result.index, dtype=object)
    return result


data = add_tag_vector_column(data, 'tags')

data

Unnamed: 0,name,category,glass,instructions,alcoholic,ingredients,tag_vector
0,Mojito,Cocktail,Highball glass,Muddle mint leaves with sugar and lime juice. ...,1,"[{'id': 170, 'name': 'Soda water', 'descriptio...","[1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, ..."
1,Old Fashioned,Cocktail,Old-fashioned glass,Place sugar cube in old fashioned glass and sa...,1,"[{'id': 513, 'name': 'Water', 'description': '...","[1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, ..."
2,Long Island Tea,Ordinary Drink,Highball glass,Combine all ingredients (except cola) and pour...,1,"[{'id': 305, 'name': 'Light Rum', 'description...","[0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,Negroni,Ordinary Drink,Old-fashioned glass,"Stir into glass over ice, garnish and serve.",1,"[{'id': 482, 'name': 'Sweet Vermouth', 'descri...","[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ..."
4,Whiskey Sour,Ordinary Drink,Old-fashioned glass,"Shake with ice. Strain into chilled glass, gar...",1,"[{'id': 409, 'name': 'Powdered Sugar', 'descri...","[1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, ..."
...,...,...,...,...,...,...,...
129,Quarter Deck Cocktail,Ordinary Drink,Cocktail glass,"Stir all ingredients with ice, strain into a c...",1,"[{'id': 305, 'name': 'Light Rum', 'description...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
130,Queen Bee,Ordinary Drink,Cocktail glass,"Shake all ingredients with ice, strain into a ...",1,"[{'id': 137, 'name': 'Coffee Brandy', 'descrip...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
131,Queen Charlotte,Ordinary Drink,Collins glass,Pour red wine and grenadine into a collins gla...,1,"[{'id': 250, 'name': 'Grenadine', 'description...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
132,Queen Elizabeth,Ordinary Drink,Cocktail glass,"Stir all ingredients with ice, strain into a c...",1,"[{'id': 2, 'name': 'Gin', 'description': 'Gin ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


## 'ingredients'

In [101]:
data['ingredients'][0]

[{'id': 170,
  'name': 'Soda water',
  'description': None,
  'alcohol': 1,
  'type': None,
  'percentage': None,
  'imageUrl': None,
  'createdAt': '2024-08-18T19:01:57.000+00:00',
  'updatedAt': '2024-08-18T19:01:57.000+00:00'},
 {'id': 305,
  'name': 'Light Rum',
  'description': 'Light rums, also referred to as "silver" or "white" rums, in general, have very little flavor aside from a general sweetness. Light rums are sometimes filtered after aging to remove any colour. The majority of light rums come from Puerto Rico. Their milder flavors make them popular for use in mixed drinks, as opposed to drinking them straight. Light rums are included in some of the most popular cocktails including the Mojito and the Daiquiri.',
  'alcohol': 1,
  'type': 'Rum',
  'percentage': None,
  'imageUrl': 'https://cocktails.solvro.pl/images/ingredients/light-rum.png',
  'createdAt': '2024-08-18T19:02:37.000+00:00',
  'updatedAt': '2024-08-18T19:02:37.000+00:00',
  'measure': '2-3 oz '},
 {'id': 312,

In [102]:
def add_ingredient_vector_column(df: pd.DataFrame, ingredient_column: str) -> pd.DataFrame:
    """Replace a column of ingredient records with a binary ``ingredient_vector``.

    Each cell of ``df[ingredient_column]`` is expected to be a list of dicts
    that carry a ``'name'`` key. The vocabulary is the sorted set of all names
    seen in the column; every row gets a list of 0/1 flags of that length.
    Rows whose cell is missing (``None`` or ``NaN``) or empty get all zeros.

    Args:
        df: Frame containing ``ingredient_column``. It is not modified.
        ingredient_column: Name of the column holding the ingredient lists.

    Returns:
        A new frame without ``ingredient_column`` and with
        ``ingredient_vector`` appended as the last column.

    Raises:
        KeyError: If an ingredient dict has no ``'name'`` key.
    """
    # Non-list cells (None, float NaN, ...) are treated as "no ingredients";
    # a truthiness test alone would try to iterate over NaN and crash.
    row_names = [
        {ingredient['name'] for ingredient in cell}
        if isinstance(cell, (list, tuple)) else set()
        for cell in df[ingredient_column]
    ]

    # Sorted vocabulary keeps vector positions consistent between runs.
    unique_ingredients = sorted(set().union(*row_names))

    vectors = [
        [1 if name in present else 0 for name in unique_ingredients]
        for present in row_names
    ]

    result = df.drop(columns=ingredient_column)
    # Wrap in an object Series so pandas stores one list per row.
    result['ingredient_vector'] = pd.Series(vectors, index=result.index, dtype=object)
    return result


# Apply the function to add the ingredient vector column
data = add_ingredient_vector_column(data, 'ingredients')

data

Unnamed: 0,name,category,glass,instructions,alcoholic,tag_vector,ingredient_vector
0,Mojito,Cocktail,Highball glass,Muddle mint leaves with sugar and lime juice. ...,1,"[1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,Old Fashioned,Cocktail,Old-fashioned glass,Place sugar cube in old fashioned glass and sa...,1,"[1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, ...","[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ..."
2,Long Island Tea,Ordinary Drink,Highball glass,Combine all ingredients (except cola) and pour...,1,"[0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,Negroni,Ordinary Drink,Old-fashioned glass,"Stir into glass over ice, garnish and serve.",1,"[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,Whiskey Sour,Ordinary Drink,Old-fashioned glass,"Shake with ice. Strain into chilled glass, gar...",1,"[1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, ..."
...,...,...,...,...,...,...,...
129,Quarter Deck Cocktail,Ordinary Drink,Cocktail glass,"Stir all ingredients with ice, strain into a c...",1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
130,Queen Bee,Ordinary Drink,Cocktail glass,"Shake all ingredients with ice, strain into a ...",1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
131,Queen Charlotte,Ordinary Drink,Collins glass,Pour red wine and grenadine into a collins gla...,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
132,Queen Elizabeth,Ordinary Drink,Cocktail glass,"Stir all ingredients with ice, strain into a c...",1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ..."


## 'instructions'

In [103]:
def add_process_vector_column(df: pd.DataFrame, instruction_column: str) -> pd.DataFrame:
    """Replace free-text instructions with a binary ``process_vector`` column.

    Each instruction is searched (case-insensitively, whole words only) for a
    fixed list of bartending processes. The vector layout is the sorted list
    of processes that occur at least once anywhere in the column, so the
    vector length depends on the data. Because only whole words match,
    inflected forms such as "shaken" do not count as "shake".

    Args:
        df: Frame containing ``instruction_column``. It is not modified.
        instruction_column: Name of the column holding instruction text.
            Cells that are not strings (``None``, ``NaN``) yield no processes.

    Returns:
        A new frame without ``instruction_column`` and with ``process_vector``
        appended as the last column.
    """
    # domain-based processes in bartending
    common_cocktail_processes = [
        'shake', 'stir', 'muddle', 'strain', 'blend', 'pour', 'build', 'layer',
        'rim', 'garnish', 'fill', 'squeeze', 'top', 'mix', 'flame', 'crush',
        'dilute', 'press', 'double strain', 'dry shake', 'whip', 'float',
        'swizzle', 'infuse', 'zest'
    ]

    # Compile each pattern once instead of once per row; escaping keeps the
    # process names literal even if one ever contains a regex metacharacter.
    process_patterns = {
        process: re.compile(rf'\b{re.escape(process.lower())}\b')
        for process in common_cocktail_processes
    }

    def extract_processes_from_instructions(instruction) -> list:
        # Missing instructions would otherwise fail on ``.lower()``.
        if not isinstance(instruction, str):
            return []
        text = instruction.lower()
        return [process for process, pattern in process_patterns.items() if pattern.search(text)]

    processes_per_row = [
        extract_processes_from_instructions(instruction)
        for instruction in df[instruction_column]
    ]

    # Only processes that actually occur get a slot in the vector.
    unique_processes = sorted({process for found in processes_per_row for process in found})

    vectors = [
        [1 if process in found else 0 for process in unique_processes]
        for found in processes_per_row
    ]

    result = df.drop(columns=instruction_column)
    # Wrap in an object Series so pandas stores one list per row.
    result['process_vector'] = pd.Series(vectors, index=result.index, dtype=object)
    return result

# Apply the function to add the process vector column
data = add_process_vector_column(data, 'instructions')

data



Unnamed: 0,name,category,glass,alcoholic,tag_vector,ingredient_vector,process_vector
0,Mojito,Cocktail,Highball glass,1,"[1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1]"
1,Old Fashioned,Cocktail,Old-fashioned glass,1,"[1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, ...","[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ...","[0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0]"
2,Long Island Tea,Ordinary Drink,Highball glass,1,"[0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0]"
3,Negroni,Ordinary Drink,Old-fashioned glass,1,"[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0]"
4,Whiskey Sour,Ordinary Drink,Old-fashioned glass,1,"[1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, ...","[0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0]"
...,...,...,...,...,...,...,...
129,Quarter Deck Cocktail,Ordinary Drink,Cocktail glass,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0]"
130,Queen Bee,Ordinary Drink,Cocktail glass,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0]"
131,Queen Charlotte,Ordinary Drink,Collins glass,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0]"
132,Queen Elizabeth,Ordinary Drink,Cocktail glass,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0]"


## normalize

In [104]:
def normalize_vectors(df: pd.DataFrame, vector_columns: list) -> pd.DataFrame:
    """Replace each vector column with the Euclidean norm of its vectors.

    Despite the name, the vectors are not rescaled to unit length: every
    vector is collapsed to a single float (its magnitude as computed by
    ``np.linalg.norm``), stored in a new ``<column>_norm`` column, and the
    original vector column is removed.

    Args:
        df: Frame containing the vector columns. It is not modified.
        vector_columns: Names of columns whose cells are numeric sequences.

    Returns:
        A new frame with the vector columns dropped and one ``*_norm`` column
        appended per entry of ``vector_columns``, in the given order.
    """
    # Start from a frame without the vector columns, so the caller's frame
    # never receives the *_norm columns as a side effect.
    result = df.drop(columns=vector_columns)

    for col in vector_columns:
        norms = [np.linalg.norm(vector) for vector in df[col]]
        result[f'{col}_norm'] = pd.Series(norms, index=df.index, dtype=float)

    return result

In [105]:
data = normalize_vectors(data, ['ingredient_vector', 'tag_vector', 'process_vector'])

# Standard preprocessing of the remaining columns

In [106]:
data

Unnamed: 0,name,category,glass,alcoholic,ingredient_vector_norm,tag_vector_norm,process_vector_norm
0,Mojito,Cocktail,Highball glass,1,2.236068,3.162278,2.236068
1,Old Fashioned,Cocktail,Old-fashioned glass,1,2.000000,2.236068,1.732051
2,Long Island Tea,Ordinary Drink,Highball glass,1,2.449490,2.449490,1.000000
3,Negroni,Ordinary Drink,Old-fashioned glass,1,1.732051,1.414214,1.414214
4,Whiskey Sour,Ordinary Drink,Old-fashioned glass,1,2.000000,2.000000,1.732051
...,...,...,...,...,...,...,...
129,Quarter Deck Cocktail,Ordinary Drink,Cocktail glass,1,1.732051,0.000000,1.414214
130,Queen Bee,Ordinary Drink,Cocktail glass,1,1.732051,0.000000,1.414214
131,Queen Charlotte,Ordinary Drink,Collins glass,1,1.732051,0.000000,1.732051
132,Queen Elizabeth,Ordinary Drink,Cocktail glass,1,1.732051,0.000000,1.414214


## One Hot Encoding

In [107]:
def OHE_columns(df: pd.DataFrame, columns: list) -> pd.DataFrame:
    """One-hot encode the given columns and append the indicator columns.

    A fresh ``OneHotEncoder`` is fitted on ``df[columns]``. The encoder is
    configured with ``drop='if_binary'`` and dense output. The fitted encoder
    is discarded, so it cannot be reused to transform other data.

    Args:
        df: Frame containing ``columns``. It is not modified.
        columns: Names of the categorical columns to encode.

    Returns:
        A new frame with ``columns`` removed and the encoded columns (named by
        ``get_feature_names_out``) appended on the right.
    """
    # Initialize OneHotEncoder
    ohe = OneHotEncoder(sparse_output=False, drop='if_binary', handle_unknown='infrequent_if_exist')

    # Apply OHE to the specified columns
    ohe_encoded = ohe.fit_transform(df[columns])

    # Reuse df's index: pd.concat(axis=1) aligns on index labels, so a default
    # RangeIndex here would misalign rows (and introduce NaNs) whenever df has
    # been filtered, sorted or otherwise re-indexed.
    ohe_encoded_df = pd.DataFrame(
        ohe_encoded,
        columns=ohe.get_feature_names_out(columns),
        index=df.index,
    )

    # Drop the original columns that were encoded and concatenate the encoded columns
    remaining = df.drop(columns=columns)
    return pd.concat([remaining, ohe_encoded_df], axis=1)

# Apply the function to encode the columns
data = OHE_columns(data, ['category', 'alcoholic'])

data


Unnamed: 0,name,glass,ingredient_vector_norm,tag_vector_norm,process_vector_norm,category_Cocktail,category_Ordinary Drink,category_Punch / Party Drink,alcoholic_1
0,Mojito,Highball glass,2.236068,3.162278,2.236068,1.0,0.0,0.0,1.0
1,Old Fashioned,Old-fashioned glass,2.000000,2.236068,1.732051,1.0,0.0,0.0,1.0
2,Long Island Tea,Highball glass,2.449490,2.449490,1.000000,0.0,1.0,0.0,1.0
3,Negroni,Old-fashioned glass,1.732051,1.414214,1.414214,0.0,1.0,0.0,1.0
4,Whiskey Sour,Old-fashioned glass,2.000000,2.000000,1.732051,0.0,1.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...
129,Quarter Deck Cocktail,Cocktail glass,1.732051,0.000000,1.414214,0.0,1.0,0.0,1.0
130,Queen Bee,Cocktail glass,1.732051,0.000000,1.414214,0.0,1.0,0.0,1.0
131,Queen Charlotte,Collins glass,1.732051,0.000000,1.732051,0.0,1.0,0.0,1.0
132,Queen Elizabeth,Cocktail glass,1.732051,0.000000,1.414214,0.0,1.0,0.0,1.0


In [108]:
def LE_columns(df: pd.DataFrame, columns: list) -> pd.DataFrame:
    """Label-encode the given columns with integer codes.

    Each column is cast to ``str`` and passed through its own
    ``LabelEncoder``. The fitted encoders are not returned, so the codes
    cannot be mapped back to the original labels by the caller.

    Args:
        df: Frame containing ``columns``. It is not modified.
        columns: Names of the columns to encode.

    Returns:
        A copy of ``df`` in which every listed column holds the encoder's
        integer codes instead of the original values.
    """
    # Work on a copy so re-running the calling cell does not re-encode
    # already-encoded values in the caller's frame.
    encoded = df.copy()

    for col in columns:
        # Converting to string to handle non-string types
        encoded[col] = LabelEncoder().fit_transform(encoded[col].astype(str))

    return encoded

# Apply the function to label encode the columns
# (the helper defined in this notebook is named LE_columns; the previous
# name label_encode_columns is not defined and fails on a fresh kernel)
data = LE_columns(data, ['glass'])

data


Unnamed: 0,name,glass,ingredient_vector_norm,tag_vector_norm,process_vector_norm,category_Cocktail,category_Ordinary Drink,category_Punch / Party Drink,alcoholic_1
0,Mojito,5,2.236068,3.162278,2.236068,1.0,0.0,0.0,1.0
1,Old Fashioned,6,2.000000,2.236068,1.732051,1.0,0.0,0.0,1.0
2,Long Island Tea,5,2.449490,2.449490,1.000000,0.0,1.0,0.0,1.0
3,Negroni,6,1.732051,1.414214,1.414214,0.0,1.0,0.0,1.0
4,Whiskey Sour,6,2.000000,2.000000,1.732051,0.0,1.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...
129,Quarter Deck Cocktail,2,1.732051,0.000000,1.414214,0.0,1.0,0.0,1.0
130,Queen Bee,2,1.732051,0.000000,1.414214,0.0,1.0,0.0,1.0
131,Queen Charlotte,3,1.732051,0.000000,1.732051,0.0,1.0,0.0,1.0
132,Queen Elizabeth,2,1.732051,0.000000,1.414214,0.0,1.0,0.0,1.0
