## Importing necessary libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 
from nltk.stem import WordNetLemmatizer 

In [None]:
# Load the wine reviews dataset.
wine_df = pd.read_csv('../input/wine-reviews/winemag-data-130k-v2.csv')
# Drop the extra column with non-important information. `drop` returns a new
# DataFrame instead of modifying the original, so the result has to be assigned
# back; otherwise the column is never actually removed.
wine_df = wine_df.drop(columns=['Unnamed: 0'])

## Top 5 countries that produce the largest number of wines

In [None]:
# Count plot of the five countries that appear most often in the dataset.
plt.figure(figsize=(8,8))
# Pass the column by keyword: in current seaborn the first positional parameter
# is `data` and `x`/`y` are keyword-only, so a positional Series is not read as x.
sns.countplot(data=wine_df, x='country',
              order=wine_df['country'].value_counts().iloc[:5].index)
plt.xticks(rotation=90)
plt.xlabel('Country')
plt.ylabel('Number of wines produced')
plt.title("Top 5 countries with largest number of wines produced")
plt.show()


## The U.S.A. produces more wines than all of the next top 4 producers

## Let us find out which countries make the best wines, based on their points

In [None]:
# Average the points of the wines of each country and rank the countries by it.
def points(countries=None):
    """Return the mean ``points`` of the wines of each requested country.

    Args:
        countries: Iterable of country labels to average. Defaults to the
            distinct values of ``wine_df.country``.

    Returns:
        A list of mean scores, one per entry of ``countries`` and in the same
        order. A label with no matching rows (for example NaN) yields NaN.
    """
    if countries is None:
        countries = set(wine_df.country)
    # One grouped pass over the frame instead of one boolean-mask scan per country.
    country_means = wine_df.groupby('country')['points'].mean()
    # reindex aligns the means to the requested labels and fills gaps with NaN.
    return country_means.reindex(list(countries)).tolist()


# Build the label list once so labels and scores are guaranteed to line up.
countries = list(set(wine_df.country))
score = points(countries)
avg_country_score = pd.DataFrame({'country': countries, 'points': score})
# Rows whose country label or mean score is missing are discarded.
avg_country_score.dropna(inplace=True)
avg_country_score = avg_country_score.sort_values('points', ascending=False)
avg_country_score.head()

In [None]:
# Bar chart of the five countries with the highest average points
# (avg_country_score is already sorted in descending order of points).
plt.figure(figsize=(8,8))
sns.barplot(data=avg_country_score.head(),x='country',y='points')
plt.xticks(rotation=90)
plt.xlabel("Country")
plt.ylabel("Average of points for all the wines")
# Title corrected: the chart ranks wines by taste score ("tasting"), not "testing".
plt.title("Top 5 countries with best tasting wines")
plt.show()

## On average, the best tasting wines can be found in England, followed by India and Austria

## Preprocessing

The process of converting data to something a computer can understand is referred to as pre-processing. One of the major forms of pre-processing is to filter out useless data. In natural language processing, useless words (data) are referred to as stop words.
### Stop-words
Stop Words: A stop word is a commonly used word (such as “the”, “a”, “an”, “in”) that a search engine has been programmed to ignore, both when indexing entries for searching and when retrieving them as the result of a search query.

We would not want these words to take up space in our database or to take up valuable processing time. We can remove them easily by storing a list of the words that we consider to be stop words. NLTK (Natural Language Toolkit) in Python has lists of stop words stored for 16 different languages.

### Tokenizing 
Tokenization is the process by which a large quantity of text is divided into smaller parts called tokens.
 These tokens are very useful for finding patterns, and tokenization is considered a base step for stemming and lemmatization.
 
### Lemmatize
Lemmatization is the process of grouping together the different inflected forms of a word so they can be analysed as a single item. Lemmatization is similar to stemming but it brings context to the words. So it links words with similar meaning to one word.

In [None]:
# Replace missing descriptions with an empty string so that every entry of the
# column is a str.
wine_df['description'] = wine_df['description'].fillna(value='')

In [None]:
# Tokenize a text column and remove English stop words from it.
def rem_stopwords_tokenize(data,name):
    """Replace ``data[name]`` with lists of tokens that are not stop words.

    Args:
        data: DataFrame holding the text column; it is modified in place.
        name: Label of the column whose entries are the raw text strings.

    Returns:
        None. The column ``name`` of ``data`` is overwritten with one token
        list per row.
    """
    # Build the stop-word set once instead of once per row.
    stop_words = set(stopwords.words('english'))

    def getting(sen):
        """Tokenize ``sen`` and keep only the tokens that are not stop words."""
        # The NLTK English stop-word list is lower-case, so compare the
        # lower-cased token; otherwise "The", "This", ... are never removed.
        return [w for w in word_tokenize(sen) if w.lower() not in stop_words]

    data[name] = [getting(text) for text in data[name].values]


rem_stopwords_tokenize(wine_df,'description')

In [None]:
# Lemmatize every token of a tokenized column.
lemmatizer = WordNetLemmatizer()


def lemmatize_all(data, name):
    """Overwrite ``data[name]`` with the lemmatized version of its token lists.

    Each entry of the column is iterated as a sequence of tokens. Every token
    is lemmatized twice: first as an adjective (``pos='a'``) and then again
    with the lemmatizer's default part of speech.

    Args:
        data: DataFrame holding the tokenized column; it is modified in place.
        name: Label of the column to lemmatize.
    """
    def _lemma(token):
        # Adjective pass first, then the default pass on its result.
        as_adjective = lemmatizer.lemmatize(token, pos='a')
        return lemmatizer.lemmatize(as_adjective)

    data[name] = [[_lemma(token) for token in tokens] for tokens in data[name]]


lemmatize_all(wine_df, 'description')

## After all the pre-processing (removing stop words, tokenizing, lemmatizing) our data looks something like this

In [None]:
# Preview five randomly chosen rows of the processed frame.
wine_df.sample(n=5)

In [None]:
# Persist the transformed dataframe to the notebook output.
# NOTE(review): the row index is written as well (pandas default), so reading
# this file back yields an extra unnamed first column - confirm that is wanted.
wine_df.to_csv(path_or_buf="preprocessed_wine_df.csv")

## Recommending the top 5 wines based on the user's current preferences

In [None]:
# function for matching input from user to the best title from the wine_df
from fuzzywuzzy import process
def get_matching_name(input_user, df):
    """Return the title in ``df`` that best fuzzy-matches ``input_user``.

    Args:
        input_user: Free-text wine name typed by the user.
        df: DataFrame with a ``title`` column of candidate names.

    Returns:
        The candidate title with the highest fuzzy-match score.

    Raises:
        ValueError: If ``df`` contains no title to match against.
    """
    # Missing and repeated titles add nothing to the search; unique() keeps the
    # first-occurrence order of the remaining candidates.
    str_options = df.title.dropna().unique().tolist()
    if not str_options:
        # extractOne would return None here and the indexing below would fail
        # with an unhelpful TypeError.
        raise ValueError("No wine titles available to match against")
    highest = process.extractOne(input_user, str_options)
    return highest[0]

In [None]:
def set_rec_finder_by_popularity(na,df, number = 5):
    """Print and return the wines most similar to the user's choice.

    The user's text is first resolved to a title of ``df`` with
    ``get_matching_name``. Every row is then scored by the number of distinct
    description tokens it shares with the first row carrying that title. The
    ``number`` best-scoring rows are kept and finally ordered by their points,
    highest first.

    NOTE(review): the chosen wine shares all of its tokens with itself, so it
    will normally appear among its own recommendations - confirm this is wanted.

    Args:
        na: Free-text wine name typed by the user.
        df: DataFrame with ``title``, ``description`` (token sequences) and
            ``points`` columns.
        number: How many recommendations to produce.

    Returns:
        The recommended titles as a list, in the order they are printed.
    """
    na = get_matching_name(na, df)
    print(f"User choice matches {na} from the database \n")

    # Tokens of the first row that carries the matched title; built once
    # instead of once per compared row.
    chosen_tokens = set(df.loc[df['title'] == na, 'description'].values[0])

    # (shared-token count, title) for every row of the frame.
    overlaps = [
        (len(chosen_tokens.intersection(tokens)), title)
        for tokens, title in zip(df['description'].values, df['title'].values)
    ]
    # Highest overlap first; ties are broken by title in descending order.
    most_similar = sorted(overlaps, reverse=True)[:number]

    # Re-rank the shortlist by the points of the first row with each title.
    by_points = sorted(
        (
            (df.loc[df['title'] == title, 'points'].values[0], title)
            for _, title in most_similar
        ),
        reverse=True,
    )
    recommended = [title for _, title in by_points]

    print(f"Recommended top {number} wines with similar tastes are:- ")
    for title in recommended:
        print(title)
    return recommended

In [None]:
# Example query: five recommendations for the title closest to the typed name.
x = set_rec_finder_by_popularity("Vintner's Reserve Wild", wine_df, number=5)