### Necessary packages

In [2]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer, ENGLISH_STOP_WORDS
from sklearn.metrics.pairwise import linear_kernel
import pickle

### Dataset

In [3]:
# Read the cocktail catalogue from the CSV file in the working directory,
# then show its (rows, columns) shape as the cell output.
cocktails_df = pd.read_csv(filepath_or_buffer='final_cocktails.csv')
cocktails_df.shape

(1130, 5)

In [4]:
# Display the loaded frame; for a long table the notebook shows only the first and last rows.
cocktails_df

Unnamed: 0,Cocktail,Ingredients,Preparation,Bar/Company,Location
0,'57 Chevy with a White License Plate,"Creme De Cacao White, Vodka",1. Fill a rocks glass with ice 2.add white cre...,,
1,155 Belmont,"Light Rum, Orange Juice, Vodka, Dark Rum",Blend with ice. Serve in a wine glass. Garnish...,,
2,9 1/2 Weeks,"Orange Curacao, Strawberry Liqueur, Orange Jui...",Combine all ingredients in glass mixer. Chill ...,,
3,A Gilligan's Island,"Peach Schnapps, Cranberry Juice, Orange Juice,...","Shaken, not stirred!",,
4,A True Amaretto Sour,"Lemon, Maraschino Cherry, Amaretto",Rub the rim of an old fashioned glass with lem...,,
...,...,...,...,...,...
1125,Zima Blaster,"Chambord raspberry liqueur, Zima,","Fill glass with ice. Pour in Chambord, then fi...",,
1126,Zizi Coin-coin,"Lemon, Ice, Lemon juice, Cointreau,","Pour 5cl of Cointreau on ice, add 2cl of fresh...",,
1127,Zimadori Zinger,"Zima, Midori melon liqueur,",Pour Zima in a collins glass over ice and then...,,
1128,Ziemes Martini Apfelsaft,"Apple juice, Vermouth,",Serve without ice. At least the juice shold ha...,,


### Preprocessing

Check whether any cocktail name appears more than once (in the summary below, `unique` should equal `count`).

In [5]:
# Summarise the name column: when `unique` equals `count`, no cocktail name is repeated.
cocktails_df['Cocktail'].describe()

count                                     1130
unique                                    1130
top       '57 Chevy with a White License Plate
freq                                         1
Name: Cocktail, dtype: object

Replace missing values with empty strings.

In [6]:
# Replace every missing value with an empty string.
# The result is rebound instead of using `inplace=True`, so the cell states its
# output explicitly and can be re-run safely.
cocktails_df = cocktails_df.fillna('')

Add measurement units and leftover words that earlier cleaning did not remove to the stop-word list.

In [7]:
# Extra tokens that should never count as ingredient features.
additional_stop_words = frozenset({'oz', 'simple'})

# Combine them with scikit-learn's built-in English stop-word set.
cocktail_stop_words = ENGLISH_STOP_WORDS | additional_stop_words

### Vectorizing text data - TFIDF

In [8]:
vectorizer = TfidfVectorizer(
    # scikit-learn >= 1.2 validates `stop_words` and accepts only 'english',
    # a list or None, so the frozenset is converted to a list (sorted to keep
    # the argument deterministic between runs).
    stop_words=sorted(cocktail_stop_words),
    # A token is a run of two or more word characters that contains no digit
    # and is delimited by word boundaries, so quantities such as "2" or "5cl"
    # never become features.
    token_pattern=r'\b[^\d\W][^\d\W]+\b',
)

# Learn the vocabulary from the ingredient text and build the sparse
# cocktail-by-term TF-IDF matrix in one pass.
tfidf_matrix = vectorizer.fit_transform(cocktails_df['Ingredients'])

In [9]:
# Dense, labelled view of the TF-IDF weights:
# one row per cocktail (indexed by name), one column per vocabulary term.
cocktail_feature_df = pd.DataFrame(
    data=tfidf_matrix.toarray(),
    index=cocktails_df['Cocktail'],
    columns=vectorizer.get_feature_names_out(),
)

### Calculating similarities

In [10]:
# TfidfVectorizer L2-normalises each row by default, so the plain dot product
# between two rows is their cosine similarity.
# With the second argument omitted, the matrix is compared against itself.
similarity_matrix = linear_kernel(tfidf_matrix)

In [11]:
# Label both axes of the square similarity matrix with the cocktail names so
# that scores can be looked up by name.
similarity_df = pd.DataFrame(
    data=similarity_matrix,
    index=cocktail_feature_df.index,
    columns=cocktail_feature_df.index,
)

In [12]:
# Inspect the labelled similarity matrix; in the output each cocktail scores 1.0 against itself on the diagonal.
similarity_df

Cocktail,'57 Chevy with a White License Plate,155 Belmont,9 1/2 Weeks,A Gilligan's Island,A True Amaretto Sour,A.D.M. (After Dinner Mint),A1,Abbey Martini,Absolut Summertime,Absolutely Fabulous,...,Zoksel,Zombie,Zambeer,Zenmeister,Zipperhead,Zima Blaster,Zizi Coin-coin,Zimadori Zinger,Ziemes Martini Apfelsaft,Bloody Mary
Cocktail,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
'57 Chevy with a White License Plate,1.000000,0.110941,0.000000,0.107361,0.000000,0.338618,0.000000,0.000000,0.000000,0.141239,...,0.140053,0.000000,0.000000,0.0,0.109736,0.000000,0.000000,0.000000,0.000000,0.060743
155 Belmont,0.110941,1.000000,0.161213,0.238136,0.000000,0.062879,0.033318,0.148142,0.000000,0.160987,...,0.000000,0.389660,0.000000,0.0,0.092073,0.000000,0.038509,0.000000,0.057749,0.087506
9 1/2 Weeks,0.000000,0.161213,1.000000,0.181296,0.000000,0.000000,0.026090,0.199644,0.390334,0.033265,...,0.000000,0.018410,0.000000,0.0,0.042990,0.039977,0.030154,0.038257,0.045220,0.028612
A Gilligan's Island,0.107361,0.238136,0.181296,1.000000,0.000000,0.060850,0.064486,0.183351,0.000000,0.467562,...,0.000000,0.045505,0.000000,0.0,0.089101,0.000000,0.074533,0.000000,0.111771,0.120043
A True Amaretto Sour,0.000000,0.000000,0.000000,0.000000,1.000000,0.000000,0.072616,0.000000,0.054844,0.000000,...,0.044733,0.000000,0.000000,0.0,0.000000,0.000000,0.167858,0.000000,0.000000,0.039819
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Zima Blaster,0.000000,0.000000,0.039977,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.0,0.627219,1.000000,0.000000,0.443746,0.000000,0.000000
Zizi Coin-coin,0.000000,0.038509,0.030154,0.074533,0.167858,0.000000,0.152675,0.047689,0.086269,0.049026,...,0.070364,0.027133,0.203998,0.0,0.000000,0.000000,1.000000,0.000000,0.066646,0.104804
Zimadori Zinger,0.000000,0.000000,0.038257,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.0,0.048327,0.443746,0.000000,1.000000,0.000000,0.000000
Ziemes Martini Apfelsaft,0.000000,0.057749,0.045220,0.111771,0.000000,0.000000,0.057663,0.301417,0.000000,0.073521,...,0.000000,0.040690,0.000000,0.0,0.000000,0.000000,0.066646,0.000000,1.000000,0.063238


In [13]:
# Write both frames to pickle files in the working directory.
# NOTE(review): presumably these are loaded by another script or app — confirm.
# Pickle files can execute arbitrary code when loaded, so only unpickle them
# from a trusted location.
similarity_df.to_pickle("similarity_df.pkl")
cocktails_df.to_pickle("cocktails_df.pkl")

### Recommender Function

In [14]:
def cocktail_recommender(cocktail_name, num_recommendations=11, similarity_df=similarity_df, cocktails_df=cocktails_df):
    """Return the cocktails most similar to ``cocktail_name``.

    Parameters
    ----------
    cocktail_name : str
        Name of the reference drink. It must be a column label of
        ``similarity_df``.
    num_recommendations : int, default 11
        Size of the ranking *including* the reference drink itself, so
        ``num_recommendations - 1`` rows are returned (10 with the default).
    similarity_df : pandas.DataFrame
        Cocktail-by-cocktail similarity table labelled with cocktail names on
        both axes. The default is the frame that existed when this function
        was defined; re-run the definition after rebuilding that frame.
    cocktails_df : pandas.DataFrame
        Catalogue with a ``'Cocktail'`` column. Its other columns are attached
        to each recommendation. The default is bound at definition time, like
        ``similarity_df``.

    Returns
    -------
    pandas.DataFrame
        Indexed by cocktail name, holding the catalogue columns plus a
        ``'Similarity'`` column, sorted from most to least similar.

    Raises
    ------
    KeyError
        If ``cocktail_name`` is not a column of ``similarity_df``.
    """
    if cocktail_name not in similarity_df.columns:
        raise KeyError(f"Unknown cocktail {cocktail_name!r}: not present in the similarity table")

    # Historical contract: the requested count includes the reference drink.
    n_results = max(num_recommendations - 1, 0)

    # Remove the reference drink by label rather than by slicing off the first
    # sorted row. The positional slice breaks when another cocktail ties at
    # similarity 1.0 (identical ingredient text) or when the reference has an
    # all-zero TF-IDF row and therefore does not sort first.
    recommendations = (
        similarity_df[cocktail_name]
        .drop(labels=cocktail_name, errors='ignore')
        .sort_values(ascending=False)
        .iloc[:n_results]
    )
    recommendations.name = 'Similarity'

    # Catalogue rows of the recommended drinks, indexed by name so they align
    # with the similarity scores.
    is_recommended = cocktails_df['Cocktail'].isin(recommendations.index)
    cocktails_details = cocktails_df[is_recommended].set_index('Cocktail')

    # The column-wise concat aligns on the name index and may reorder rows,
    # so the ranking is restored by sorting on the score.
    recommendations_df = pd.concat([cocktails_details, recommendations], axis=1)
    return recommendations_df.sort_values(by='Similarity', ascending=False)

In [15]:
# Display settings: cap the text shown per cell at 80 characters and never
# hide columns of wide frames.
pd.set_option("display.max_colwidth", 80)
pd.set_option("display.max_columns", None)

#### Enter reference drink

In [16]:
# Drink to find neighbours for; the name must match the 'Cocktail' column exactly.
reference = 'Pina Colada'
mycocktail = cocktail_recommender(cocktail_name=reference)

# Show the catalogue row of the reference drink itself.
cocktails_df[cocktails_df['Cocktail'] == reference]

Unnamed: 0,Cocktail,Ingredients,Preparation,Bar/Company,Location
999,Pina Colada,"Pineapple, Coconut milk, Light rum,","Mix with crushed ice in blender until smooth. Pour into chilled glass, garni...",,


#### Output - the 10 most similar drinks

In [17]:
# Show only the ingredient text and the similarity score of each recommendation.
cols = ['Ingredients', 'Similarity']
mycocktail.loc[:, cols]

Unnamed: 0_level_0,Ingredients,Similarity
Cocktail,Unnamed: 1_level_1,Unnamed: 2_level_1
Havana Cocktail,"Lemon juice, Pineapple juice, Light rum,",0.589425
Frozen Pineapple Daiquiri,"Sugar, Lime juice, Pineapple, Light rum,",0.575486
Jamaica Kiss,"Milk, Ice, Light rum, Coffee liqueur,",0.525381
151 Florida Bushwacker,"Vanilla ice-cream, Coconut liqueur, Milk, Cointreau, Dark Creme de Cacao, ru...",0.510372
Pink Colada,"Pink Pigeon Rum, Pineapple Juice, Cream of Coconut",0.509014
Rum Milk Punch,"Nutmeg, Powdered sugar, Milk, Light rum,",0.474265
Rum Screwdriver,"Orange juice, Light rum,",0.472905
Coco Batida,"1 Coconut milk, 1 Condensed milk, Ypiòca, 1 Tbsp Sugar",0.449825
Arizona Twister,"Pineapple, Orange Juice, Cream Of Coconut, Grenadine, Malibu Rum, Pineapple ...",0.445534
Cherry Rum,"Light cream, Cherry brandy, Light rum,",0.437334


In [103]:
def _split_ingredients(raw):
    """Turn one comma-separated ingredient string into a list of clean names.

    Each name is stripped of surrounding whitespace and empty entries (from
    trailing or doubled commas, or a blank string) are dropped. A value that is
    already a list or tuple is cleaned the same way, so re-running the cell
    does not fail. Any other value yields an empty list.
    """
    if isinstance(raw, str):
        parts = raw.split(',')
    elif isinstance(raw, (list, tuple)):
        parts = [str(part) for part in raw]
    else:
        parts = []
    return [part.strip() for part in parts if part.strip()]


# NOTE: this overwrites the text column with lists, so cells that expect
# strings (such as the TF-IDF fit) must be run before this one.
cocktails_df['Ingredients'] = [_split_ingredients(x) for x in cocktails_df['Ingredients']]
cocktails_df['Ingredients']

0                                                        [Creme De Cacao White,  Vodka]
1                                         [Light Rum,  Orange Juice,  Vodka,  Dark Rum]
2                 [Orange Curacao,  Strawberry Liqueur,  Orange Juice,  Absolut Citron]
3                             [Peach Schnapps,  Cranberry Juice,  Orange Juice,  Vodka]
4                                                [Lemon,  Maraschino Cherry,  Amaretto]
                                             ...                                       
1125                                                [Chambord raspberry liqueur,  Zima]
1126                                            [Lemon,  Ice,  Lemon juice,  Cointreau]
1127                                                      [Zima,  Midori melon liqueur]
1128                                                           [Apple juice,  Vermouth]
1129    [Lime,  Tabasco sauce,  Worcestershire sauce,  Lemon juice,  Tomato juice,  ...
Name: Ingredients, Length: 1130,