In [30]:
import pandas as pd
from matplotlib import pyplot as plt
import numpy as np
#import seaborn as sns

# Show up to 200 columns when a DataFrame is displayed.
# The fully qualified key is required: the short form 'max_columns' also
# matches 'styler.render.max_columns' on newer pandas and raises OptionError.
pd.set_option('display.max_columns', 200)

# Load the cocktail dataset from the JSON file next to the notebook.
df = pd.read_json('cocktail_dataset.json')

In [33]:
# List the column labels of the loaded frame.
df.columns

Index(['id', 'name', 'category', 'glass', 'tags', 'instructions', 'imageUrl',
       'alcoholic', 'createdAt', 'updatedAt', 'ingredients'],
      dtype='object')

In [34]:
# Preview the first rows to see what each column contains.
df.head()

Unnamed: 0,id,name,category,glass,tags,instructions,imageUrl,alcoholic,createdAt,updatedAt,ingredients
0,11000,Mojito,Cocktail,Highball glass,"[IBA, ContemporaryClassic, Alcoholic, USA, Asi...",Muddle mint leaves with sugar and lime juice. ...,https://cocktails.solvro.pl/images/ingredients...,1,2024-08-18T19:01:17.000+00:00,2024-08-18T19:06:16.000+00:00,"[{'id': 170, 'name': 'Soda water', 'descriptio..."
1,11001,Old Fashioned,Cocktail,Old-fashioned glass,"[IBA, Classic, Alcoholic, Expensive, Savory]",Place sugar cube in old fashioned glass and sa...,https://cocktails.solvro.pl/images/ingredients...,1,2024-08-18T19:01:58.000+00:00,2024-08-18T19:06:17.000+00:00,"[{'id': 513, 'name': 'Water', 'description': '..."
2,11002,Long Island Tea,Ordinary Drink,Highball glass,"[Strong, Asia, StrongFlavor, Brunch, Vegetaria...",Combine all ingredients (except cola) and pour...,https://cocktails.solvro.pl/images/ingredients...,1,2024-08-18T19:01:58.000+00:00,2024-08-18T19:06:17.000+00:00,"[{'id': 305, 'name': 'Light Rum', 'description..."
3,11003,Negroni,Ordinary Drink,Old-fashioned glass,"[IBA, Classic]","Stir into glass over ice, garnish and serve.",https://cocktails.solvro.pl/images/ingredients...,1,2024-08-18T19:01:58.000+00:00,2024-08-18T19:06:17.000+00:00,"[{'id': 482, 'name': 'Sweet Vermouth', 'descri..."
4,11004,Whiskey Sour,Ordinary Drink,Old-fashioned glass,"[IBA, Classic, Alcoholic, ContemporaryClassic]","Shake with ice. Strain into chilled glass, gar...",https://cocktails.solvro.pl/images/ingredients...,1,2024-08-18T19:01:59.000+00:00,2024-08-18T19:06:18.000+00:00,"[{'id': 409, 'name': 'Powdered Sugar', 'descri..."


In [36]:
# Inspect the dtype pandas assigned to each column.
df.dtypes

id               int64
name            object
category        object
glass           object
tags            object
instructions    object
imageUrl        object
alcoholic        int64
createdAt       object
updatedAt       object
ingredients     object
dtype: object

In [37]:
# Summary statistics; by default only the numeric columns are described.
df.describe()

Unnamed: 0,id,alcoholic
count,134.0,134.0
mean,11344.955224,1.0
std,306.506605,0.0
min,11000.0,1.0
25%,11056.25,1.0
50%,11304.0,1.0
75%,11538.5,1.0
max,11993.0,1.0


### DATA PREPARATION

In [38]:
# Show the first two rows as a reference before cleaning starts.
df.head(2)

Unnamed: 0,id,name,category,glass,tags,instructions,imageUrl,alcoholic,createdAt,updatedAt,ingredients
0,11000,Mojito,Cocktail,Highball glass,"[IBA, ContemporaryClassic, Alcoholic, USA, Asi...",Muddle mint leaves with sugar and lime juice. ...,https://cocktails.solvro.pl/images/ingredients...,1,2024-08-18T19:01:17.000+00:00,2024-08-18T19:06:16.000+00:00,"[{'id': 170, 'name': 'Soda water', 'descriptio..."
1,11001,Old Fashioned,Cocktail,Old-fashioned glass,"[IBA, Classic, Alcoholic, Expensive, Savory]",Place sugar cube in old fashioned glass and sa...,https://cocktails.solvro.pl/images/ingredients...,1,2024-08-18T19:01:58.000+00:00,2024-08-18T19:06:17.000+00:00,"[{'id': 513, 'name': 'Water', 'description': '..."


In [44]:
# Fix column types: the two timestamp columns are loaded as plain objects,
# so parse each of them into datetime values.
for timestamp_column in ('createdAt', 'updatedAt'):
    df[timestamp_column] = pd.to_datetime(df[timestamp_column])

# Re-check the dtypes to confirm the conversion.
df.dtypes

id                            int64
name                         object
category                     object
glass                        object
tags                         object
instructions                 object
imageUrl                     object
alcoholic                     int64
createdAt       datetime64[ns, UTC]
updatedAt       datetime64[ns, UTC]
ingredients                  object
dtype: object

In [55]:
# Count the missing (NaN/None) values in each column.
df.isna().sum()

id               0
name             0
category         0
glass            0
tags            99
instructions     0
imageUrl         0
alcoholic        0
createdAt        0
updatedAt        0
ingredients      0
dtype: int64

In [56]:
# (rows, columns) of the frame, used to put the missing-value counts in context.
df.shape

(134, 11)

In [62]:
# 99 of 134 rows (~74%) have no value in `tags`, so the column is dropped.
# errors='ignore' keeps the cell idempotent: re-running it after `tags` is
# already gone is a no-op instead of raising KeyError.
df = df.drop(columns='tags', errors='ignore')

df.head(1)

Unnamed: 0,id,name,category,glass,instructions,imageUrl,alcoholic,createdAt,updatedAt,ingredients
0,11000,Mojito,Cocktail,Highball glass,Muddle mint leaves with sugar and lime juice. ...,https://cocktails.solvro.pl/images/ingredients...,1,2024-08-18 19:01:17+00:00,2024-08-18 19:06:16+00:00,"[{'id': 170, 'name': 'Soda water', 'descriptio..."


In [67]:
# Replace the 0/1 `alcoholic` indicator with real True/False values.
# The converted values must be assigned back to the frame; merely evaluating
# the conversion only displays it and leaves the column as int64.
df['alcoholic'] = df['alcoholic'].astype(bool)

# Display the converted column to verify the result.
df['alcoholic']

0      True
1      True
2      True
3      True
4      True
       ... 
129    True
130    True
131    True
132    True
133    True
Name: alcoholic, Length: 134, dtype: bool

In [69]:
# Total of the `alcoholic` flags; compare it with the row count to see
# whether every drink is marked alcoholic.
df['alcoholic'].sum()

134

In [None]:
# Every drink is alcoholic, so this column carries no information and is dropped.
# errors='ignore' keeps the cell idempotent: re-running it after `alcoholic`
# is already gone is a no-op instead of raising KeyError.
df = df.drop(columns='alcoholic', errors='ignore')

In [76]:
# Look at the raw `ingredients` column before unpacking it.
df['ingredients']

0      [{'id': 170, 'name': 'Soda water', 'descriptio...
1      [{'id': 513, 'name': 'Water', 'description': '...
2      [{'id': 305, 'name': 'Light Rum', 'description...
3      [{'id': 482, 'name': 'Sweet Vermouth', 'descri...
4      [{'id': 409, 'name': 'Powdered Sugar', 'descri...
                             ...                        
129    [{'id': 305, 'name': 'Light Rum', 'description...
130    [{'id': 137, 'name': 'Coffee Brandy', 'descrip...
131    [{'id': 250, 'name': 'Grenadine', 'description...
132    [{'id': 2, 'name': 'Gin', 'description': 'Gin ...
133    [{'id': 179, 'name': 'Dark Rum', 'description'...
Name: ingredients, Length: 134, dtype: object

In [81]:
# Each `ingredients` entry is a list of JSON-like records; turn every list
# into its own DataFrame so a cocktail's ingredients can be viewed as a table.
df.loc[:, 'ingredients'] = df['ingredients'].apply(lambda records: pd.DataFrame(records))

# Display the column to confirm every cell now holds a DataFrame.
df['ingredients']

0          id        name                            ...
1          id               name                     ...
2          id       name                             ...
3          id            name                        ...
4          id             name                       ...
                             ...                        
129        id       name                             ...
130        id           name                         ...
131        id             name                       ...
132        id          name                          ...
133        id         name                           ...
Name: ingredients, Length: 134, dtype: object

In [86]:
# Inspect the unpacked ingredient table of the cocktail at index label 10.
df['ingredients'].loc[10]

Unnamed: 0,id,name,description,alcohol,type,percentage,imageUrl,createdAt,updatedAt,measure
0,32,Apricot Brandy,,1,Brandy,24.0,https://cocktails.solvro.pl/images/ingredients...,2024-08-18T19:00:53.000+00:00,2024-08-18T19:01:18.000+00:00,1 oz
1,312,Lime,"A lime (from French lime, from Arabic līma, fr...",0,Fruit,,https://cocktails.solvro.pl/images/ingredients...,2024-08-18T19:02:40.000+00:00,2024-08-18T19:02:40.000+00:00,1
2,498,Triple Sec,"Triple sec, originally Curaçao triple sec, is ...",1,Liqueur,,https://cocktails.solvro.pl/images/ingredients...,2024-08-18T19:03:40.000+00:00,2024-08-18T19:03:40.000+00:00,1 oz


# At this point we need to start over because of the records found above...