# Import libraries

In [1]:
import pandas as pd
import numpy as np
import random
pd.set_option('display.max_columns', None)

import warnings
warnings.filterwarnings('ignore')

In [2]:
import nltk
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer

In [3]:
from sklearn import cluster, datasets
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
from sklearn.cluster import KMeans

In [4]:
# import pymysql
# from sqlalchemy import create_engine
# import getpass 
# password = getpass.getpass()

In [5]:
# connection_string = 'mysql+pymysql://root:' + password + '@localhost/clean_winery'
# engine = create_engine(connection_string)

# Import dataframes

In [39]:
clustered_df = pd.read_csv('clustered.csv')

In [40]:
clustered_df.shape

(18389, 35)

In [41]:
all_data = pd.read_csv('all_data.csv')

In [42]:
all_data.shape

(21294, 34)

In [43]:
varietal_df = all_data[['wine_id','type','wine_name','country','region','varietal_name','winery','year','grapes_1_name','grapes_2_name','grapes_3_name']]

In [44]:
varietal_df.isna().sum()

wine_id             0
type                0
wine_name           0
country             0
region              0
varietal_name       0
winery              0
year                0
grapes_1_name    2829
grapes_2_name    2829
grapes_3_name    2829
dtype: int64

In [45]:
varietal_df = varietal_df.fillna('x')

In [46]:
wine_names = varietal_df[['wine_id','wine_name','winery']]

In [47]:
clustered_df = pd.merge(left = wine_names,
        right = clustered_df,
        how = 'inner', 
        left_on = "wine_id", 
        right_on= "wine_id")

In [48]:
varietal_df['varietal_name'] = np.where(varietal_df['varietal_name'].isin(['Grenache.']),'Garnacha',varietal_df['varietal_name'])
clustered_df['varietal_name'] = np.where(clustered_df['varietal_name'].isin(['Grenache.']),'Garnacha',clustered_df['varietal_name'])

# Copies to not be case sensitive

In [49]:
def case_folding(df,cols):
    df_folded = df.copy()
    for col in cols:
        df_folded[col] = [value.casefold() for value in df[col].values]
    return df_folded

In [50]:
cols = ['wine_name','winery','region','country','varietal_name','type','grapes_1_name','grapes_2_name','grapes_3_name']
clustered_folded = case_folding(clustered_df, cols)
varietal_folded = case_folding(varietal_df, cols)

# Tokenizing columns for later

In [51]:
def tokenizing_col(df,col):
    tokenized = []

    for row in df[col]:
        token = word_tokenize(row)
        tokenized.append(token)

    df['tokenized'] = tokenized
    
    return df

In [52]:
clustered_folded = tokenizing_col(clustered_folded,'wine_name')
varietal_folded = tokenizing_col(varietal_folded,'wine_name')

# Variety Recommender

In [86]:
def basic_recommender():
    text = input("what variety you like?   ").casefold()
    print(' ')

    if text in varietal_folded['varietal_name'].values:
        
        df = varietal_folded[varietal_folded['varietal_name'] == text]
                    
        recommended = random.choice(df['wine_id'].values)
        
        wine = varietal_df[varietal_df['wine_id']==recommended]['wine_name'].values
        region = varietal_df[varietal_df['wine_id']==recommended]['region'].values
        country = varietal_df[varietal_df['wine_id']==recommended]['country'].values
        
        print("If you like " + text + ", " + wine + ' from ' + region + ' in ' + country + ' is from the same variety.')
    else:
        print("Sorry I don't have any suggestions for that")
    return

In [87]:
basic_recommender()

what variety you like?   treixadura
 
['If you like treixadura, Antonio Montero Alejandrvs Barrica Treixadura  from Ribeiro in spain is from the same variety']


# Grape Recommender

In [74]:
def grape_recommender():
    text = input("What variety of grape do you like?   ").casefold()
    print(' ')

    if text in varietal_folded['grapes_1_name'].values:
        yn = 'y'
    elif text in varietal_folded['grapes_2_name'].values:
        yn = 'y'
    elif text in varietal_folded['grapes_3_name'].values:
        yn = 'y'
    else:
        yn = 'n'

    if yn == 'y':
        df = varietal_folded[(varietal_folded['grapes_1_name']==text)|(varietal_folded['grapes_2_name']==text)|(varietal_folded['grapes_3_name']==text)]
        
        recommended = random.choice(df['wine_id'].values)
        
        wine = varietal_df[varietal_df['wine_id']==recommended]['wine_name'].values
        region = varietal_df[varietal_df['wine_id']==recommended]['region'].values
        country = varietal_df[varietal_df['wine_id']==recommended]['country'].values

        print("Maybe you would like to try " + wine + ' from ' + region + ' in ' + country)
    else:
        print("Sorry I don't have any suggestions for that.")
    return

In [75]:
grape_recommender()

What variety of grape do you like?   pinot noir
 
['Maybe you would like to try Louis Jadot Griotte-Chambertin Grand Cru  from Griotte-Chambertin Grand Cru in france']


# Recommender by wine name

In [68]:
def name_recommender():
    text = input("Write a wine name and I will recommend a similar one:   ").casefold()
    print(' ')
    print('Checking my database...')
    print(' ')
    
    tokens = word_tokenize(text)
    
    bow_vect = CountVectorizer()
    bow_vect.fit(tokens)
    
    values = []
    for i in range(len(clustered_folded['tokenized'])):
        values.append(np.sum(bow_vect.transform(clustered_folded['tokenized'][i]).toarray()))
    
    indexes = []
    for i in range(len(values)):
        if values[i]==max(values):
            indexes.append(i)
    
    wine_name = clustered_df.loc[indexes[0]]['wine_name']
    
    confirm = input('Is ' + wine_name + ' the one you wrote?    Y/N    ').casefold()
    print(' ')
    
    i = 1
    while confirm == 'n':
        
        wine_name = clustered_df.loc[indexes[i]]['wine_name']
        confirm = input('Is ' + wine_name + ' the one you wrote?    Y/N    ').casefold()
        print(' ')
        
        if confirm == 'y':
            break
        
        i = i+1
        
        
        
    if confirm == 'y':
        
        cluster = clustered_folded.loc[indexes[i]]['clusters']
        df = clustered_df[clustered_df['clusters'] == cluster]
        
        recommended = random.choice(df['wine_id'].values)
        
        wine = df[df['wine_id']==recommended]['wine_name'].values
        region = df[df['wine_id']==recommended]['region'].values
        country = df[df['wine_id']==recommended]['country'].values
        
        print("My recommendation is " + wine + ' from ' + region + ' in ' + country)
    

    return

In [69]:
name_recommender()

Write a wine name and I will recommend a similar one:   paco garcia tempranillo blanco
 
Checking my database...
 
Is Paco Garcia Tempranillo Blanco  the one you wrote?    Y/N    y
 
['My recommendation is Vie di Romans Dolée  from Friuli Isonzo in italy']


# Recommender by features 

### (it has some problems, sometimes it doesn't work)

In [66]:
def wine_feature_recommender():
    red_white = input('Are you looking for a Red or White Wine?   R/W      ').casefold()
    print(' ')
    body = input('How do you like the body of your wine?  from 1-5 (Low-High)      ').casefold()
    print(' ')
    acidity = input('How do you like the acidity?   High(H) - Medium(M) - Low(L)    ').casefold()
    print(' ')
    
    if red_white == 'r':
        red_white = 'red'
    else:
        red_white = 'white'
        
        
    if acidity == 'h':
        acidity = 'High'
    elif acidity == 'm':
        acidity = 'Medium'
    else:
        acidity = 'Low'
    
    values_list = clustered_folded.loc[(clustered_folded['body']==int(body))&(clustered_folded['type']==red_white)&(clustered_folded['acidity_description']==acidity)]['wine_id'].values
    
    if len(values_list) > 0:
        recomendation = random.choice(values_list)
        wine = clustered_df[clustered_df['wine_id']==recomendation]['wine_name'].values
        region = clustered_df[clustered_df['wine_id']==recomendation]['region'].values
        country = clustered_df[clustered_df['wine_id']==recomendation]['country'].values

        return print('I recommend you to try:    '+ wine+ ' from '+region+ ' in '+country)
    else: 
        return print("Sorry, I don't have recomendations for those characteristics.")
    

In [67]:
wine_feature_recommender()

Are you looking for a Red or White Wine?   R/W      r
 
How do you like the body of your wine?  from 1-5 (Low-High)      4
 
How do you like the acidity?   High(H) - Medium(M) - Low(L)    h
 
['I recommend you to try:    Podere Sapaio Sapaio Bolgheri Volpolo  from Bolgheri in italy']


# Recommender with clustering inside

#### work in progress

In [88]:
def new_recommender():
    red_white = input('Red or White?   ').casefold()
    print(' ')
    country = input('Which country?    ').casefold()
    print(' ')
    
    df = clustered_folded[(clustered_folded['type']==red_white)&(clustered_folded['country']==country)].reset_index(drop=True)
    df_num = df.select_dtypes(np.number)
    df_num = df_num.drop(['wine_id','clusters','style_id'],axis=1)
    
    #fit and transform scaling numericals MinMax
    X_num = MinMaxScaler().fit_transform(df_num) 
    # convert to dataframe again
    X_num = pd.DataFrame(X_num,columns=df_num.columns).reset_index(drop=True) 
    
    #building clusters with default 12
    kmeans = KMeans(n_clusters=12)
    kmeans.fit(X_num)
    
    # Assigning the clusters:
    new_clusters = kmeans.predict(X_num)
    df['new_clusters'] = new_clusters
    
    # choosing one similar
    wine = input('One similar that you like?    ').casefold()
    print(' ')
    
    tokens = word_tokenize(wine)
    
    bow_vect = CountVectorizer()
    bow_vect.fit(tokens)
    
    values = []
    for i in range(len(df['tokenized'])):
        values.append(np.sum(bow_vect.transform(list(df['tokenized'])[i]).toarray()))

   
    indexes = []
    for i in range(len(values)):
        if values[i]==max(values):
            indexes.append(i)
    
    wine_name = df.loc[indexes[0]]['wine_name']
    
    confirm = input('Is ' + wine_name + ' the one you wrote?    Y/N    ').casefold()
    print(' ')

    
    i = 1
    while confirm == 'n':
        
        wine_name = df.loc[indexes[i]]['wine_name']
        confirm = input('Is ' + wine_name + ' the one you wrote?    Y/N    ').casefold()
        print(' ')
        
        if confirm == 'y':
            break
        
        i = i+1
        if i >len(indexes):
            print("Sorry, I didn't get the name right. Please, try again!")
            break
        
    
    if confirm == 'y':
        
        df2 = clustered_df[(clustered_folded['type']==red_white)&(clustered_folded['country']==country)].reset_index(drop=True)
        df2['new_clusters'] = new_clusters
        
        cluster = df2.loc[indexes[i]]['new_clusters']
        recommended = random.choice(df2[df2['new_clusters'] == cluster]['wine_name'].values)
        print('My recommendation is:    ' + recommended)

    return

In [89]:
new_recommender()

Red or White?   red
 
Which country?    spain
 
One similar that you like?    rioja
 
Is la rioja alta rioja gran reserva 904  the one you wrote?    Y/N    y
 


IndexError: list index out of range

# Recommender with more questions

#### work in progress

In [39]:
def new_recommender_2():
    red_white = input('Red or White?   ').casefold()
    print(' ')

    country = input('Which country?    ').casefold()
    print(' ')
    
    region = input('Which region?    ').casefold()
    print(' ')
    
    
    variety = input('Which variety?    ').casefold()
    print(' ')
    
    
    grape = input('Which grape?    ').casefold()
    print(' ')
    
    
    body = input('How do you like the body of your wine?  High(H) - Medium(M) - Low(L) - Very low (V)    ').casefold()
    print(' ')
    
    
    acidity = input('How do you like the acidity?   High(H) - Medium(M) - Low(L)    ').casefold()
    print(' ')
    
    
        
    if red_white in clustered_folded['type'].values:
        df = clustered_folded[clustered_folded['type']==red_white]
    else:
        df = clusterd_folded
      
    
    if country in df['country'].values:
        df = df[df['country']==country]
    else:
        pass
        
        
    if region in df['region'].values:
        df = df[df['region']==region]
    else:
        pass
    
    if variety in df['varietal_name'].values:
        df = df[df['varietal_name']==variety]
    else:
        pass
    
#     if grape in df['region'].values:
#         df = df[df['region']==region]
#     else:
#         pass

    if body in df['body_description'].values:
        df = df[df['body_description']==body]
    else:
        pass
    
    if acidity in df['acidity_description'].values:
        df = df[df['acidity_description']==acidity]
    else:
        pass
    


    df_num = df[['body','acidity_1','acidity_2','fizziness','intensity','sweetness','tannin','num_ratings','rate']]
    
    #fit and transform scaling numericals MinMax
    transformer = MinMaxScaler().fit(df_num) 
    X_num = transformer.transform(df_num)
    # convert to dataframe again
    X_num = pd.DataFrame(X_num,columns=df_num.columns).reset_index(drop=True) 
    
   
    # HOW MANY CLUSTERS IN RELATION WITH HOW MANY VARIABLES¿?
    
    #building clusters with default 12
    kmeans = KMeans(n_clusters=12)
    kmeans.fit(X_num)
    
    # Assigning the clusters:
    new_clusters = kmeans.predict(X_num)
    df['new_clusters'] = new_clusters
    
    
    
    #CHOOSING SIMILARITY or NOT?
    similar_wine = input('Tell me a wine you like:      ').casefold()
    similar_token = word_tokenize(similar_wine)
    print(' ')
#     different_wine = input("Tell me a wine you DON'T like:      ").casefold()
#     different_token = word_tokenized(different_wine)
#     print(' ')
    
    
    bow_vect = CountVectorizer()
    
    # For SIMILAR wine
    bow_vect.fit(similar_token)
    
    values = []
    for i in range(len(clustered_folded['tokenized'])):
        values.append(np.sum(bow_vect.transform(list(clustered_folded['tokenized'])[i]).toarray()))

   
    indexes = []
    for i in range(len(values)):
        if values[i]==max(values):
            indexes.append(i)
    
    wine_name = clustered_df.loc[indexes[0]]['wine_name']
    region_name = clustered_df.loc[indexes[0]]['region']
    
    confirm = input('Is ' + wine_name + ' from '+ region_name +' the one you wrote?    Y/N    ').casefold()
    print(' ')

    
    i = 1
    while confirm == 'n':
        
        wine_name = clustered_df.loc[indexes[i]]['wine_name']
        region_name = clustered_df.loc[indexes[i]]['region']
        
        confirm = input('Is ' + wine_name + ' from '+ region_name + ' the one you wrote?    Y/N    ').casefold()
        print(' ')
        
        if confirm == 'y':
            break
        
        i = i+1
        if i >len(indexes):
            print("Sorry, I didn't get the name right. Please, try again!")
            break
        
    
    if confirm == 'y':
        
        df2 = clustered_df.loc[indexes[i]]
        df2_num = pd.DataFrame(df2[['body','acidity_1','acidity_2','fizziness','intensity','sweetness','tannin','num_ratings','rate']]).T
        
        X2_num = transformer.transform(df2_num)
        # convert to dataframe again
        X2_num = pd.DataFrame(X2_num,columns=df2_num.columns).reset_index(drop=True) 
        display(X2_num)

        #predict cluster
        X2_cluster = kmeans.predict(X2_num)
        
        
        recommended = random.choice(clustered_df[df['new_clusters'] == X2_cluster]['wine_name'].values)
        region_from = clustered_df[clustered_df['wine_name']==recommended]['region']
        print('My recommendation is:    ' + recommended +' from ' + region_from)

    return

In [40]:
new_recommender_2()

Red or White?   white
 
Which country?    spain
 
Which region?    ribeiro
 
Which variety?    ribeiro
 
Which grape?    godello
 
How do you like the body of your wine?  High(H) - Medium(M) - Low(L) - Very low (V)    m
 
How do you like the acidity?   High(H) - Medium(M) - Low(L)    m
 


ValueError: n_samples=2 should be >= n_clusters=12.

In [34]:
pd.DataFrame(clustered_df.loc[3215][['body','acidity_1','acidity_2','fizziness','intensity','sweetness','tannin','num_ratings','rate']]).T

Unnamed: 0,body,acidity_1,acidity_2,fizziness,intensity,sweetness,tannin,num_ratings,rate
3215,4.0,3.0,3.17,0.0,4.28,1.84,3.13,1168,4.0


In [27]:
clustered_df.loc[3215]['region']

'Alentejano'

In [None]:
    var1 = ['red','white']
    red_white = input('Red or White?   ').casefold()
    print(' ')
    
    if red_white in var1:
        print(red_white)
    else:
        print('Sorry,')

I'm thinking about the questions I want to answer and the variables I can use, there are too many, but now I can use:
* region
* country
* red/white
* variety¿?
* grape
* clusters based on wine characteristics (by name?)
* features (body, acidity)
* something similar to or different from
