<a href="https://colab.research.google.com/github/Fitzpatrique/15-wine-tasting/blob/master/Wine_Tasting.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Mount your Google Drive folder on Colab so its files are reachable under
# /content/gdrive (the dataset is later read from this mount point).
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)

Mounted at /content/gdrive


In [2]:
import pandas as pd
import nltk
import numpy as np
import re
from nltk.stem import wordnet # to perform lemmitization
#from sklearn.feature_extraction.text import CountVectorizer # to perform bow
from sklearn.feature_extraction.text import TfidfVectorizer # to perform tfidf
from nltk import pos_tag # for parts of speech
from sklearn.metrics import pairwise_distances # to perform cosine similarity
from nltk import word_tokenize #  to create tokens
#from nltk.corpus import stopwords # for stop words

In [3]:
def text_normalization(text):
    """Lower-case, clean, tokenize and lemmatize ``text``.

    Steps:
      1. ``text`` is converted with ``str()`` and lower-cased.
      2. Every character outside ``a-z`` (digits, punctuation, accented
         letters, ...) is replaced by a space.
      3. The result is tokenized and part-of-speech tagged.
      4. Each token is lemmatized using a WordNet POS derived from the first
         letter of its tag: ``V`` -> verb, ``J`` -> adjective, ``R`` -> adverb,
         anything else -> noun.

    Returns:
        str: the lemmatized tokens joined by single spaces.
    """
    cleaned = re.sub(r'[^a-z]', ' ', str(text).lower())
    words = nltk.word_tokenize(cleaned)
    lemmatizer = wordnet.WordNetLemmatizer()
    tagged_words = pos_tag(words, tagset=None)

    # First letter of the tagger's tag -> WordNet POS code; noun is the fallback.
    prefix_to_pos = {'V': 'v', 'J': 'a', 'R': 'r'}

    return " ".join(
        lemmatizer.lemmatize(word, prefix_to_pos.get(tag[:1], 'n'))
        for word, tag in tagged_words
    )

In [4]:
# Fetch the NLTK 'punkt' data package (presumably the tokenizer data that
# nltk.word_tokenize in text_normalization relies on -- confirm).
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [5]:
# Fetch the NLTK 'averaged_perceptron_tagger' data package (presumably the
# model behind pos_tag used in text_normalization -- confirm).
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [6]:
# Fetch the NLTK 'wordnet' data package (presumably the corpus behind
# WordNetLemmatizer used in text_normalization -- confirm).
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [7]:
# Load the wine-tasting dataset from the mounted Google Drive folder.
# NOTE(review): the path is hard-coded to one Drive layout and file name;
# adjust it if the CSV lives elsewhere.
wt_df = pd.read_csv('/content/gdrive/My Drive/wine_tasting_ds(3).csv')

In [8]:
# Preview the first rows of the loaded dataset.
wt_df.head()

Unnamed: 0.1,Unnamed: 0,description,variety,winery,Location,lemmatized_text
0,0,This tremendous 100% varietal wine hails from ...,Cabernet Sauvignon,Heitz,Napa California Valley US,this tremendous varietal wine hail from oakvil...
1,1,"Ripe aromas of fig, blackberry and cassis are ...",Tinta de Toro,Bodega Carmen Rodríguez,Northern Toro Spain Spain,ripe aroma of fig blackberry and cassis be sof...
2,2,Mac Watson honors the memory of a wine once ma...,Sauvignon Blanc,Macauley,California Sonoma Knights Valley US,mac watson honor the memory of a wine once mak...
3,3,"This spent 20 months in 30% new French oak, an...",Pinot Noir,Ponzi,Willamette Valley Oregon US,this spent month in new french oak and incorpo...
4,4,"This is the top wine from La Bégude, named aft...",Provence red blend,Domaine de la Bégude,Provence Bandol France,this be the top wine from la b gude name after...


In [9]:
# Summarise the columns, their non-null counts and dtypes.
wt_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 410744 entries, 0 to 410743
Data columns (total 6 columns):
 #   Column           Non-Null Count   Dtype 
---  ------           --------------   ----- 
 0   Unnamed: 0       410744 non-null  int64 
 1   description      410744 non-null  object
 2   variety          410744 non-null  object
 3   winery           410744 non-null  object
 4   Location         410744 non-null  object
 5   lemmatized_text  410744 non-null  object
dtypes: int64(1), object(5)
memory usage: 18.8+ MB


In [10]:
# Fetch the NLTK 'stopwords' data package (presumably required before
# stopwords.words('english') can be read -- confirm).
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [11]:
from nltk.corpus import stopwords
# English stop words; used further down to filter the lemmatized descriptions.
stop = stopwords.words('english')

In [12]:
def string(text):
  """Return ``text`` converted to ``str``."""
  as_text = str(text)
  return as_text

In [13]:
# Coerce every entry of the lemmatized column to str so the string
# operations applied afterwards never meet a non-string value.
wt_df['lemmatized_text'] = wt_df['lemmatized_text'].map(string)

In [14]:
# Drop English stop words from every lemmatized description.
# Membership is checked against a set built once, rather than scanning the
# `stop` list for each word of each row; the resulting text is unchanged.
stop_set = set(stop)
wt_df['lemmatized_text'] = wt_df['lemmatized_text'].apply(
    lambda x: ' '.join(word for word in x.split() if word not in stop_set)
)

In [15]:
# Preview the rows again to check the effect of the stop-word removal.
wt_df.head()

Unnamed: 0.1,Unnamed: 0,description,variety,winery,Location,lemmatized_text
0,0,This tremendous 100% varietal wine hails from ...,Cabernet Sauvignon,Heitz,Napa California Valley US,tremendous varietal wine hail oakville age thr...
1,1,"Ripe aromas of fig, blackberry and cassis are ...",Tinta de Toro,Bodega Carmen Rodríguez,Northern Toro Spain Spain,ripe aroma fig blackberry cassis soften sweete...
2,2,Mac Watson honors the memory of a wine once ma...,Sauvignon Blanc,Macauley,California Sonoma Knights Valley US,mac watson honor memory wine make mother treme...
3,3,"This spent 20 months in 30% new French oak, an...",Pinot Noir,Ponzi,Willamette Valley Oregon US,spent month new french oak incorporate fruit p...
4,4,"This is the top wine from La Bégude, named aft...",Provence red blend,Domaine de la Bégude,Provence Bandol France,top wine la b gude name high point vineyard fo...


In [16]:
# TF-IDF features for the first 25,000 lemmatized descriptions

tfidf = TfidfVectorizer()  # kept around: user queries are transformed with this fitted vectorizer
x_tfidf = tfidf.fit_transform(wt_df['lemmatized_text'].iloc[:25000]).toarray()  # dense array, one row per description

In [17]:
# Wrap the dense TF-IDF matrix in a DataFrame with one column per vocabulary term.
# TfidfVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2, so prefer get_feature_names_out() and fall back to the old
# method only on versions that do not have it.
if hasattr(tfidf, 'get_feature_names_out'):
    feature_names = tfidf.get_feature_names_out()
else:
    feature_names = tfidf.get_feature_names()
df_tfidf = pd.DataFrame(x_tfidf, columns=feature_names)
#df_tfidf.head(5)

In [18]:
def description(text):
    """Print the wine whose description is most similar to ``text``.

    The query is normalized with ``text_normalization``, vectorized with the
    fitted ``tfidf`` vectorizer and compared (cosine similarity) against every
    row of ``df_tfidf``. Rows of ``df_tfidf`` line up positionally with the
    first rows of ``wt_df``, so the best row is looked up by position.

    If the query shares no vocabulary with the corpus, every similarity is 0;
    in that case a "no match" message is printed instead of reporting the
    first row as if it were a match.

    Args:
        text: free-text tasting description entered by the user.

    Returns:
        None. The variety, winery and location are printed.
    """
    lemma = text_normalization(text)  # same normalization as the corpus text
    query_vec = tfidf.transform([lemma]).toarray()
    # pairwise_distances returns cosine *distance*; 1 - distance is similarity.
    similarity = (1 - pairwise_distances(df_tfidf, query_vec, metric='cosine')).ravel()
    best_row = int(similarity.argmax())

    # All-zero similarity means no known term was found in the query.
    if similarity[best_row] <= 0:
        print("No matching wine found for that description.")
        return

    # Positional lookup: does not depend on wt_df keeping a default index.
    print("Variety: ", wt_df['variety'].iloc[best_row])
    print("Winery: ", wt_df['winery'].iloc[best_row])
    print("Location: ", wt_df['Location'].iloc[best_row])

In [19]:
def run():
    """Interactive loop: prompt for a description and print the best match.

    Keeps prompting until the user enters ``q``. Each other input is passed to
    ``description``, which prints its own result; its return value is not
    printed, so no stray ``None`` line appears. The loop iterates instead of
    calling ``run()`` recursively, so a long session cannot hit the recursion
    limit.

    Returns:
        bool: always ``False``, returned once the user quits.
    """
    while True:
        text = str(input("Enter the 'q' key to exit description: "))
        if text == 'q':
            print("Bye!")
            return False
        description(text)

In [20]:
# Start the interactive lookup; it keeps prompting until 'q' is entered.
run()

Enter the 'q' key to exit description: ripe aroma fig blackberry cassis soften
Variety:  Tinta de Toro
Winery:  Bodega Carmen Rodríguez
Location:  Northern Toro Spain Spain
None
Enter the 'q' key to exit description: q
Bye!


False