In [None]:
import pandas as pd
import numpy as np
import re

In [None]:
# Notebook display settings: show up to 510 characters per cell and up to 500 columns.
pd.options.display.max_colwidth = 510
pd.options.display.max_columns = 500

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.neighbors import NearestNeighbors
from sklearn.model_selection import train_test_split

from sklearn.model_selection import GridSearchCV

In [None]:
from sklearn.metrics import roc_auc_score, roc_curve
from nltk.stem.snowball import EnglishStemmer

_______________________________

# Cocktail recommender:  
The user enters a description of the kind of cocktail they like, and the algorithm returns suggestions based on k-nearest neighbours (KNN).

## Import data

Clean data, create a working dataframe and a retriever dataframe with all the cocktail information.

In [None]:
# Read the raw cocktail table from the comma-separated file in ./CSVs.
df_ctl_info = pd.read_csv(
    filepath_or_buffer='./CSVs/cocktail_info.csv',
    sep=',',
)

In [None]:
# Preview the first rows of the freshly loaded table.
df_ctl_info.head()

In [None]:
# Keep only the rows whose 'Recipe' is not the "couldn't retrieve" placeholder sentence.
df_ctl_info = df_ctl_info.loc[
    df_ctl_info['Recipe'].ne("Woops... We couldn't retrieve the exact recipe... It's trial & error time! Just a little more fun before enjoying a nice drink!")
]

In [None]:
# Row count after removing the placeholder-recipe rows.
len(df_ctl_info)

In [None]:
# Keep the first row for every cocktail 'Name' and discard later repeats, modifying the frame in place.
df_ctl_info.drop_duplicates(subset='Name', keep='first', inplace=True)

In [None]:
# Row count after dropping duplicate names.
len(df_ctl_info)

In [None]:
# Display the filtered, de-duplicated table.
df_ctl_info

In [None]:
# Renumber the rows in place. Without drop=True the old row labels are kept
# as a new 'index' column, which the next cell removes.
df_ctl_info.reset_index(inplace=True)

In [None]:
# Remove the 'index' column left behind by reset_index() in the previous cell.
df_ctl_info.drop(columns='index', inplace=True)

In [None]:
# Snapshot of the cleaned cocktail table, used at the end to show the full
# cocktail details for the recommended names.
# .copy() matters here: a plain assignment only creates a second name for the
# same DataFrame, so any later in-place edit of df_ctl_info (text cleaning,
# column drops) would silently change return_info_df as well.
return_info_df = df_ctl_info.copy()

In [None]:
# Display the table kept for looking up full cocktail details.
return_info_df

### df_ctl_info ready for call-back of cocktail info after recommendation  
___________________________________________________________________  
## Fusing strings from 'Description', 'Recipe' & 'Ingredient' for vectorization and tokenizing

Create working dataframe.

In [None]:
# Working frame for the text cleaning below.
# Take an independent copy: the cells that follow overwrite columns and drop
# columns in place, and with a plain assignment those edits would also hit
# df_ctl_info and every other name bound to the same DataFrame.
df_ctl_dropnanrecipe = df_ctl_info.copy()

In [None]:
# Display the working frame before cleaning.
df_ctl_dropnanrecipe

Clean all the strings in concerned columns using regex

In [None]:
# Check which positional columns hold the text fields cleaned in the next cells.
# Only the value of the last expression is displayed by the notebook.
df_ctl_dropnanrecipe.iloc[:,1] # whole column 1 -> 'Description'
df_ctl_dropnanrecipe.iloc[0,2] # first row, column 2 -> 'Recipe'
df_ctl_dropnanrecipe.iloc[0,3] # first row, column 3 -> 'Ingredients'

In [None]:
def use_regex(input_text):
    """Normalise one free-text field before vectorization.

    The text is lower-cased, then the following are removed: stand-alone
    integers, decimal numbers, every character that is neither a word
    character nor whitespace, and the measurement units "oz" / "ml".

    Units are removed only when they are not glued to other letters, so words
    that merely contain those letters ("frozen", "dozen") stay intact.
    Lower-casing happens before the substitution so "OZ" / "ML" are caught too.

    Parameters
    ----------
    input_text : str
        Raw cell content. Anything that is not a string (for example a missing
        value read as NaN) is treated as empty text.

    Returns
    -------
    str
        The cleaned, lower-cased text. Whitespace is left untouched.
    """
    if not isinstance(input_text, str):
        return ''
    lowered = input_text.lower()
    return re.sub(
        r'\b\d+\b|\d+\.\d+|[^\w\s]|(?<![a-z])(?:oz|ml)(?![a-z])',
        '',
        lowered,
    )

In [None]:
# Run use_regex over every entry of positional column 1 and write the result back.
df_ctl_dropnanrecipe.iloc[:,1] = df_ctl_dropnanrecipe.iloc[:,1].apply(use_regex)

In [None]:
# Run use_regex over every entry of positional column 2 and write the result back.
df_ctl_dropnanrecipe.iloc[:,2] = df_ctl_dropnanrecipe.iloc[:,2].apply(use_regex)

In [None]:
# Run use_regex over every entry of positional column 3 and write the result back.
df_ctl_dropnanrecipe.iloc[:,3] = df_ctl_dropnanrecipe.iloc[:,3].apply(use_regex)

In [None]:
# Preview the three text columns after the regex clean-up.
df_ctl_dropnanrecipe.head()

Replace "unfortunately we have no description for this drink youll have to describe it yourself" with ""  (empty string)

In [None]:
# Swap the (already cleaned) "no description" placeholder sentence for an empty string.
# Only cells equal to the whole sentence are replaced.
df_ctl_dropnanrecipe.iloc[:,1] = df_ctl_dropnanrecipe.iloc[:,1].replace(
    to_replace="unfortunately we have no description for this drink youll have to describe it yourself",
    value="",
)

In [None]:
# Remove the two columns that are not used for the text model. The drop is done
# in place, so every name bound to this same DataFrame object loses them too.
df_ctl_dropnanrecipe.drop(columns=['Nutrition Facts','Video Link'], inplace=True)

In [None]:
# Preview the working frame after dropping the unused columns.
df_ctl_dropnanrecipe.head()

### Fuse the strings from the last three columns

In [None]:
# Build one document per cocktail: description, recipe and ingredients joined by single spaces.
df_ctl_dropnanrecipe['text'] = (
    df_ctl_dropnanrecipe['Description']
    .add(' ')
    .add(df_ctl_dropnanrecipe['Recipe'])
    .add(' ')
    .add(df_ctl_dropnanrecipe['Ingredients'])
)

In [None]:
# Keep only the cocktail name and its fused document.
df_ctl_text = df_ctl_dropnanrecipe.loc[:, ['Name', 'text']]

In [None]:
# Display the name/text frame that feeds the vectorizer.
df_ctl_text

In [None]:
# Sanity check: show the Python type of the first fused 'text' entry.
type(df_ctl_text.iloc[0,1])

## Vectorize text and input into matrix

In [None]:
from nltk.stem.snowball import EnglishStemmer
from sklearn.feature_extraction._stop_words import ENGLISH_STOP_WORDS

In [None]:
# Bag-of-words vectorizer using scikit-learn's built-in English stop-word list.
# The same fitted instance is reused later by preprocess_input for user queries.
vectorizer = CountVectorizer(stop_words='english')

Extract the list of strings from df_ctl_text['text'] and clean it

In [None]:
# One fused document per cocktail, as a plain Python list (same order as df_ctl_text).
text = df_ctl_text.loc[:, 'text'].to_list()

In [None]:
# Number of documents extracted.
len(text)

In [None]:
# Final clean-up of each fused document before vectorization:
#   1. drop every run of digits together with the word characters that follow it
#      (e.g. "2", "30ml");
#   2. collapse runs of whitespace, including the gaps left by step 1, into
#      single spaces.
# Each step works on the previous step's result. Previously the second
# substitution started again from the raw string `x`, so the whitespace
# normalisation was silently thrown away.
clean_text = []
for x in text:
    clean_str = re.sub(r'\d+\w*', '', x)
    clean_str = re.sub(r'\s+', ' ', clean_str)
    clean_text.append(clean_str)

In [None]:
# Number of cleaned documents (one is appended per entry of `text`).
len(clean_text)

In [None]:
# Display the cleaned documents.
clean_text

Vectorize

In [None]:
# Learn the vocabulary from the cleaned documents and count each word per cocktail.
# toarray() gives a regular 2-D ndarray; todense() returned the legacy np.matrix
# type, which NumPy no longer recommends. The DataFrame built from the result
# is the same either way.
vectors = vectorizer.fit_transform(clean_text).toarray()

In [None]:
# Feature (word) names learned by the vectorizer, used below as column labels.
vocabulary = vectorizer.get_feature_names_out()

In [None]:
# Cocktail names in document order; they become the row labels of the word matrix.
index = df_ctl_text.loc[:, 'Name'].to_list()

In [None]:
# Word-count matrix: one row per cocktail name, one column per vocabulary word.
word_matrix = pd.DataFrame(
    data=vectors,
    index=index,
    columns=vocabulary,
)

In [None]:
# Display the cocktail-by-word count matrix.
word_matrix

___________________________________________

## Train the model

In [None]:
# Move the cocktail names out of the row index into a regular 'index' column
# (removed in the next cell) so that X ends up with a plain positional row index.
X = word_matrix.reset_index()
X.head()

In [None]:
# Drop the cocktail-name column so X holds only the word counts.
X.drop(columns='index', inplace=True)

In [None]:
# Number of rows (cocktails) in the feature matrix.
len(X)

In [None]:
# Same reset as for X, with the cocktail-name column relabelled from 'index' to 'name'.
y = word_matrix.reset_index().rename(columns={'index':'name'})

In [None]:
# Keep only the cocktail names; row i of y names the cocktail in row i of X.
# NOTE(review): if the vocabulary itself contains the token 'name', this
# selection would return more than one column. Confirm against the data.
y = y['name']
y

## Split train/test

In [None]:
# Hold out 15 % of the rows; random_state fixes the shuffle so the split is repeatable.
# Rows that land in the test part are not available to a model fitted on X_train.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42)

In [None]:
# Number of cocktails in the training part.
len(X_train)

## Train model

In [None]:
# Neighbour index that returns 5 neighbours per query by default.
model = NearestNeighbors(n_neighbors=5)

In [None]:
# Fit on the bare NumPy values of the training rows (no column names attached).
model.fit(X_train.values, y_train.values)

_______________________________

## User input preprocessing

In [None]:
# Why does return_info_df look modified since its last definition?
# If it was bound by plain assignment (no .copy()), it is the same DataFrame
# object as the frame that was cleaned in place, so those edits show up here.
return_info_df

In [None]:
def preprocess_input(input):
    """Turn one raw user query into a row of the bag-of-words space.

    The query is lower-cased, every digit 0-9 is removed, and the result is
    handed as a one-element list to the module-level ``vectorizer``.

    Returns whatever ``vectorizer.transform`` produces for that single document.
    """
    digit_free_query = re.sub("[0-9]", "", input.lower())
    return vectorizer.transform([digit_free_query])

In [None]:
# Ask for a free-text query, echo it, vectorize it with preprocess_input and print the vector.
user_input = input('Please enter an ingredient')
print(user_input)
input_vector = preprocess_input(user_input)
print(input_vector)

In [None]:
# Display the vectorized query.
input_vector

## Finding similar ingredients

In [None]:
# Display the look-up table again before querying the model.
return_info_df

In [None]:
# Query the 5 nearest rows. `indices` holds row positions within the data
# that `model` was fitted on, not labels from the original table.
distances, indices = model.kneighbors(input_vector, n_neighbors=5)

In [None]:
# Translate neighbour positions into cocktail names. `model` was fitted on
# X_train, so the positions are looked up in the matching y_train.
similar_cocktails = y_train.iloc[indices[0]].values

In [None]:
# Display the recommended cocktail names.
similar_cocktails

In [None]:
# Print the recommended names, one per line, under a header.
recommended_ing = []
print("Recommended cocktails:")
for cocktail_name in similar_cocktails:
    print(cocktail_name)

## Full cocktail info

In [None]:
# Full details for the recommended cocktails. Rows come back in the table's
# own order, not in order of similarity.
results_df = return_info_df[return_info_df['Name'].isin(similar_cocktails)]
results_df

______________________________

## Testing the model & fine-tuning

In [None]:
# Candidate settings for the grid search: three values each for the search
# algorithm, the leaf size and the radius.
parameters_KNN = dict(
    algorithm=('ball_tree', 'kd_tree', 'brute'),
    leaf_size=(20, 30, 40),
    radius=(0.75, 0.9, 1),
)

In [None]:
# Grid search over the settings in parameters_KNN with 3-fold cross-validation,
# ranked by ROC AUC. The fit call in the next cell is commented out.
# NOTE(review): NearestNeighbors is an unsupervised neighbour index; confirm that
# 'roc_auc' can actually be computed for it before enabling that fit.
search_cv = GridSearchCV(model,
                         param_grid= parameters_KNN,
                         cv=3,
                         scoring='roc_auc')

In [None]:
# search_cv.fit(X_train, y_train)

In [None]:
# search_cv.best_params_

In [None]:
# search_cv.best_score_

__________________________________

## Final model

In [None]:
# Final neighbour index with explicitly chosen settings; returns 5 neighbours per query by default.
final_model = NearestNeighbors(n_neighbors=5, algorithm='auto' , leaf_size= 20 , radius= 1)

In [None]:
# Fit on the bare NumPy values, as is done for the first model. Fitting on the
# DataFrame made the estimator remember the column names, and every later query
# with the unnamed vectorizer output then triggered
# "UserWarning: X does not have valid feature names".
final_model.fit(X.values, y.values)

fresh summer drink white wine

In [None]:
# One cell algorithm: read a free-text request, vectorize it, look up the
# nearest cocktails with the final model and show their full details.

user_input = input('Please enter an ingredient')
print(user_input)
input_vector = preprocess_input(user_input)

distances, indices = final_model.kneighbors(input_vector)

# final_model was fitted on the full matrix X, so the returned positions index
# into y (all cocktail names). Looking them up in y_train (the shuffled 85 %
# training subset) returned unrelated cocktails and could run past its end.
similar_cocktails = y.iloc[indices[0]].values

# List the recommended names under the header (the header used to be printed alone).
print("Recommended cocktails:")
for cocktail_name in similar_cocktails:
    print(cocktail_name)

# Full details for the recommended cocktails, in the table's own row order.
results_df = return_info_df.loc[return_info_df['Name'].isin(similar_cocktails)]
results_df

* Why is the returned DataFrame (`return_info_df`) stripped of its original content? Should y be a cocktail id / index instead?  
* Why does the warning "UserWarning: X does not have valid feature names, but NearestNeighbors was fitted with feature names
  warnings.warn(" now appear when it didn't before?
* Why does the recommender return cocktails with ingredients other than the one entered (e.g. entering "rum" returns NO cocktails with rum)?

___________________________________

___________________________