# Import libraries

In [1]:
import pandas as pd
import numpy as np
import random
# Passing None removes the column limit, so wide DataFrames are displayed without truncation.
pd.set_option('display.max_columns', None)

In [80]:
import nltk
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
import pymysql
from sqlalchemy import create_engine
import getpass 
# Ask for the database password interactively instead of hard-coding it in the notebook.
password = getpass.getpass()

········


In [3]:
# Build the SQLAlchemy URL for the local `clean_winery` MySQL database (user `root`).
# The password is percent-encoded first: characters such as '@', '/', ':' or '%'
# would otherwise be read as URL delimiters and produce a broken connection string.
from urllib.parse import quote_plus

connection_string = 'mysql+pymysql://root:' + quote_plus(password) + '@localhost/clean_winery'
engine = create_engine(connection_string)

# Import dataframes

In [179]:
# Load the clustered wines (`clustered c`) together with their name and winery
# (from `wine`) and bottle volume and price (from `purchase`), all joined on wine_id.
# NOTE(review): `style s` is joined but no column is selected from it. If `style` can
# hold more than one row per wine_id this join duplicates wines — confirm it is needed.
query = '''SELECT c.wine_id, w.wine_name, w.winery, c.region, c.country, c.varietal_name, c.type, c.year, c.style_id, c.body, c.acidity_1, c.acidity_description, c.acidity_2, c.fizziness, c.intensity, c.sweetness, c.tannin, c.clusters, p.vol_ml, p.price_eur, c.num_ratings, c.rate FROM clustered c
LEFT JOIN wine w USING (wine_id)
LEFT JOIN purchase p USING (wine_id)
LEFT JOIN style s USING (wine_id)'''
clustered_df = pd.read_sql(query,engine)

In [151]:
# Quick check of (rows, columns) for the joined clustered table.
clustered_df.shape

(18285, 22)

In [152]:
# Load the descriptive, price and rating columns of `all_data`; this frame is used
# for the variety-based recommendations below.
# Note: this reuses (overwrites) the `query` name from the previous load cell.
query = '''SELECT wine_id, wine_name, winery, region, country, varietal_name, type, year, vol_ml, price_eur, num_ratings, rate FROM  all_data'''
varietal_df = pd.read_sql(query,engine)

In [153]:
# Quick check of (rows, columns) for the `all_data` table.
varietal_df.shape

(21121, 12)

# Case-folded copies for case-insensitive matching

In [26]:
def case_folding(df,cols):
    """Return a copy of ``df`` whose text columns are case-folded.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame; it is not modified.
    cols : iterable of str
        Names of the columns to case-fold.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with the same index in which every string value of the
        listed columns has been passed through ``str.casefold``.

    Notes
    -----
    Only ``str`` values are folded. Missing values (``None``/``NaN``, e.g. SQL
    NULLs) and any other non-string value are kept unchanged instead of raising
    ``AttributeError``.
    """
    df_folded = df.copy()
    for col in cols:
        df_folded[col] = df[col].map(
            lambda value: value.casefold() if isinstance(value, str) else value
        )
    return df_folded

In [182]:
# Text columns that user input is compared against; both frames get a case-folded
# twin so lookups are case-insensitive while the originals keep their display casing.
cols = ['wine_name','winery','region','country','varietal_name','type']
clustered_folded, varietal_folded = (
    case_folding(frame, cols) for frame in (clustered_df, varietal_df)
)

# Checking first steps of my recommender

In [73]:
def basic_recommender():
    """Ask for a grape variety and print a random wine of that variety.

    The answer is case-folded and compared with ``varietal_folded['varietal_name']``;
    the wine name that is printed comes from ``varietal_df``. An apology is printed
    when the variety is unknown. Returns ``None``.
    """
    text = input("what variety you like?   ").casefold()
    folded_varieties = varietal_folded['varietal_name']

    # Guard clause: nothing to suggest for a variety we have never seen.
    if text not in folded_varieties.values:
        print("Sorry I don't have any suggestions for that")
        return

    candidates = varietal_df.loc[folded_varieties == text, 'wine_name'].values
    recommended = random.choice(candidates)
    print("This wine is from the same variety:  " + recommended)
    return

In [76]:
basic_recommender()

what variety you like?   cote beaune
This wine is from the same variety:  Bonneau du Martray Corton-Charlemagne Grand Cru 


# Recommender by wine name

In [100]:
# Tokenise every case-folded wine name once and keep the token lists next to the
# names so the name recommender can reuse them.
tokenized = [word_tokenize(wine_name) for wine_name in clustered_folded['wine_name']]

clustered_folded['tokenized'] = tokenized

In [139]:
def name_recommender():
    """Ask for a (partial) wine name and recommend a wine from the same cluster.

    Steps:
      1. Read a wine name from the user and tokenise its case-folded form.
      2. Score every wine in ``clustered_folded`` by how many of its name tokens
         belong to the vocabulary built from the user's words.
      3. Ask the user to confirm the best-scoring wine (first one wins on ties).
      4. On confirmation, print a random wine from the same ``clusters`` value.

    If the input yields no usable words, or no wine shares a word with it, an
    apology is printed instead of proposing an arbitrary wine. Returns ``None``.
    """
    retry_message = "Sorry, I didn't get the name right, please, try again!"

    text = input("Write a wine name and I will recommend a similar one:   ").casefold()
    print(' ')
    print('Checking my database...')
    print(' ')

    tokens = word_tokenize(text)

    # The vocabulary is just the words the user typed. CountVectorizer raises
    # ValueError when nothing usable is left (empty input, or only tokens shorter
    # than two characters under its default token pattern).
    bow_vect = CountVectorizer()
    try:
        bow_vect.fit(tokens)
    except ValueError:
        print(retry_message)
        return

    # Score all wines with a single transform call. Joining a wine's tokens with
    # spaces gives the same word counts as transforming the tokens one by one,
    # because the vectorizer's word pattern never matches across whitespace.
    documents = [' '.join(name_tokens) for name_tokens in clustered_folded['tokenized']]
    scores = np.asarray(bow_vect.transform(documents).sum(axis=1)).ravel()

    # No wine shares a single word with the input: do not propose a random first row.
    if scores.size == 0 or scores.max() == 0:
        print(retry_message)
        return

    # argmax returns the first best match; it is a row position, hence .iloc below.
    index = int(scores.argmax())

    wine_name = clustered_df.iloc[index]['wine_name']

    confirm = input('Is ' + wine_name + ' the one you wrote?    Y/N    ').casefold()
    print(' ')

    if confirm == 'y':
        cluster = clustered_folded.iloc[index]['clusters']
        recommended = random.choice(clustered_df[clustered_df['clusters'] == cluster]['wine_name'].values)
        print('My recommendation is:    ' + recommended)
    else:
        print(retry_message)
    return

In [141]:
name_recommender()

Write a wine name and I will recommend a similar one:   domaine ponsot morey saint
 
Checking my database...
 
Is Domaine Ponsot Morey-Saint-Denis Premier Cru Cuvée des Alouettes  the one you wrote?    Y/N    y
 
My recommendation is:    Domaine de La Romanée-Conti Romanée-Conti Grand Cru 


# Recommender by selection red_white

In [77]:
def red_white_recommender():
    """Collect the user's wine-type and country preferences (unfinished stub).

    Asks for the wine type (R/W/A) and whether the user wants to pick a country;
    the country itself is only requested when the answer is "Y". All answers are
    case-folded. Returns ``None``.

    TODO: the collected answers are not used yet — no recommendation is produced.
    """
    # `wine_type` rather than `type`, so the builtin is not shadowed.
    wine_type = input('What type of wine would you prefer: R = Red, W = White, A = Any?    ').casefold()

    text = input('Would you like to choose a country?  Y/N   ').casefold()

    # Only ask for a country when the user actually wants to choose one.
    country = None
    if text == 'y':
        country = input('Which country do you want your wine from?   ').casefold()

# Recommender by multiple choices

In [71]:
def my_recommender():
    """Interactive recommender driven by a short sequence of questions.

    Flow:
      * Ask for the wine type (R/W/A); an invalid answer is asked again until it
        is one of the three options.
      * ``W`` and ``A`` are placeholders that only print the chosen option.
      * ``R``: ask whether the user wants to choose a variety.
          - "N": ask for a place. It is matched against the case-folded
            ``country`` column first and then ``region`` of ``clustered_folded``;
            a random cluster among the matching wines is chosen and a random
            wine of that cluster is printed.
          - anything else: ask for a variety and print a random wine of that
            variety from ``varietal_df``.
        An apology is printed whenever nothing matches.

    Returns ``None``.
    """
    option1 = input('What type of wine would you prefer: R = Red, W = White, A = Any?    ').casefold()

    # Keep asking until the answer is usable; the retry is case-folded as well.
    while option1 not in ('r', 'w', 'a'):
        option1 = input("Sorry, we don't have that option, try with: R = Red, W = White, A = Any     ").casefold()

    if option1 == 'w':
        print('White')
        return

    if option1 == 'a':
        print('All')
        return

    option2 = input('Would you like to choose the variety? Y/N     ').casefold()

    if option2 == 'n':
        option3 = input('Would you like to choose a country?:   ').casefold()

        # The prompt asks for a country, so try that column first; region names
        # are still accepted as a fallback.
        for column in ('country', 'region'):
            matches = clustered_folded[column] == option3
            if matches.any():
                random_cluster = random.choice(clustered_df[matches]['clusters'].values)
                print("This wine will be similar " + random.choice(clustered_df[clustered_df['clusters']==random_cluster]['wine_name'].values))
                return

        print('Sorry')
    else:
        option3 = input('Variety you like:   ').casefold()

        matches = varietal_folded['varietal_name'] == option3
        if matches.any():
            recommended = random.choice(varietal_df[matches]['wine_name'].values)
            print("This wine is from the same variety:  " + recommended)
        else:
            # Tell the user instead of ending silently.
            print("Sorry I don't have any suggestions for that")

    return

In [72]:
my_recommender()

What type of wine would you prefer: R = Red, W = White, A = Any?    r
Varietal name? Y/N     y
Variety you like:   tempranillo
This wine is from the same variety:  Artadi Pagos Viejos 


# Recommender by features 

### (Known issue: this recommender sometimes fails)

In [201]:
def wine_feature_recommender():
    """Recommend a random wine that matches the requested type, body and acidity.

    Asks three questions (all answers are case-folded):
      * red or white — ``r`` selects red, any other answer selects white;
      * body — an integer, compared with the ``body`` column;
      * acidity — ``h`` is High, ``m`` is Medium, any other answer is Low,
        compared with the ``acidity_description`` column.

    Prints the name of a random wine matching all three filters. A message is
    printed instead of raising when the body is not a whole number or when no
    wine matches the combination. Returns ``None``.
    """
    red_white = input('Are you looking for a Red or White Wine?   R/W      ').casefold()
    body = input('How do you like the body of your wine?  from 1-5 (Low-High)      ').casefold()
    acidity = input('How do you like the acidity?   High(H) - Medium(M) - Low(L)    ').casefold()

    # Validate the numeric answer up front so bad input gives a message, not a traceback.
    try:
        body_level = int(body)
    except ValueError:
        print('Sorry, the body must be a whole number from 1 to 5')
        return

    if red_white == 'r':
        red_white = 'red'
    else:
        red_white = 'white'

    if acidity == 'h':
        acidity = 'High'
    elif acidity == 'm':
        acidity = 'Medium'
    else:
        acidity = 'Low'

    matches = (
        (clustered_folded['body'] == body_level)
        & (clustered_folded['type'] == red_white)
        & (clustered_folded['acidity_description'] == acidity)
    )
    candidates = clustered_df.loc[matches]['wine_name'].values

    # random.choice raises IndexError on an empty sequence; some combinations of
    # features simply have no wine in the table.
    if len(candidates) == 0:
        print("Sorry I don't have any suggestions for that")
        return

    recomendation = random.choice(candidates)
    print('I recommend you to try:    '+ recomendation)
    return

In [207]:
wine_feature_recommender()

Are you looking for a Red or White Wine?   R/W      w
How do you like the body of your wine?  from 1-5 (Low-High)      3
How do you like the acidity?   High(H) - Medium(M) - Low(L)    m
I recommend you to try:    Bodegas Etchart Gran Linaje Torrontes 
