#  Helper functions for Wine tasting Notebook

#### author: A. Tomberg
#### date: 04/12/2019

In [None]:
import string
import collections

import nltk
from nltk import word_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from nltk import FreqDist

from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import Normalizer
from sklearn.pipeline import make_pipeline
from sklearn.cluster import KMeans, MiniBatchKMeans
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import sklearn_recommender as skr
from sklearn import metrics
from pprint import pprint

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator


from joblib import dump, load

In [None]:
def plot_frequency_of_occurence(df, col, threshold = 0.01):
    """Draw a horizontal bar plot of the relative frequency of values in a column.

    Values whose relative frequency is not above `threshold` are summed into
    a single extra bar labelled 'other'.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing the column to summarise.
    col : str
        Name of the column; it is also concatenated into the plot title.
    threshold : float, optional
        Minimum relative frequency (exclusive) a value needs in order to get
        its own bar. Defaults to 0.01, i.e. 1%.

    Returns
    -------
    None
        The plot is drawn on a new matplotlib figure.
    """
    prob = df[col].value_counts(normalize=True)

    # Fold the long tail of rare values into one 'other' entry.
    mask = prob > threshold
    tail_prob = prob.loc[~mask].sum()
    prob = prob.loc[mask]
    prob['other'] = tail_prob

    # Name the column explicitly instead of renaming it from `col`: the name
    # of the Series returned by value_counts differs between pandas versions
    # (the column name before 2.0, 'proportion' afterwards), so a rename keyed
    # on `col` can silently do nothing and leave no 'frequency' column.
    prob = prob.to_frame(name="frequency")

    plt.figure(figsize=(10, 10))
    plt.title('Showing % of occurence in '+col)
    sns.barplot(y=prob.index, x='frequency', data = prob, palette="Reds_d")
    return

In [None]:
def parse_decription(descr_text):
    """Turn a free-text description into a list of stemmed keywords.

    The text is tokenized with NLTK, lower-cased and stripped of punctuation;
    tokens that are not purely alphabetic or that are stop words are dropped,
    and the remaining tokens are reduced to their Porter stems.

    Parameters
    ----------
    descr_text : str
        The description to parse.

    Returns
    -------
    list of str
        Stemmed tokens, in their original order.
    """
    raw_tokens = word_tokenize(descr_text)

    # Lower-case each token and delete every punctuation character from it.
    punctuation_remover = str.maketrans('', '', string.punctuation)
    cleaned = [token.lower().translate(punctuation_remover) for token in raw_tokens]

    # English stop words plus a few terms excluded explicitly here.
    ignored = set(stopwords.words('english'))
    ignored.update(["drink", "now", "wine", "flavor", "flavors"])

    stemmer = PorterStemmer()
    return [
        stemmer.stem(token)
        for token in cleaned
        if token.isalpha() and token not in ignored
    ]

In [None]:
def plot_freq_words(df, col, how_many = 10, title = None, size = (10,10)):
    """Plot and return the most frequent words found in a column of word lists.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding the data.
    col : str
        Column whose entries are each an iterable of words.
    how_many : int, optional
        Number of top words to keep (default 10).
    title : str, optional
        Title passed to the plot.
    size : tuple, optional
        Figure size passed to matplotlib (default (10, 10)).

    Returns
    -------
    pandas.DataFrame
        Columns 'word' and 'freq', one row per retained word.
    """
    # Flatten the per-row word lists into a single stream of words.
    every_word = [word for row_words in df[col] for word in row_words]
    top_counts = FreqDist(every_word).most_common(how_many)
    most_frequent_words = pd.DataFrame(top_counts, columns=['word', 'freq'])

    sns.set(style="whitegrid")
    plt.figure(figsize=size)
    plt.title(title)
    sns.barplot(x="freq", y="word", data=most_frequent_words, palette="Blues_d")

    return most_frequent_words

In [None]:
def rem_words(querywords):
    """Return the items of `querywords` as a list, leaving out every 'wine' entry."""
    unwanted_words = ('wine',)
    kept = []
    for word in querywords:
        if word in unwanted_words:
            continue
        kept.append(word)
    return kept

#wine_data['parsed_descr']  = wine_data['parsed_descr'].apply(rem_words)

In [None]:
def reduce_dimentionality(X, n_components = 3):
    """Project `X` onto `n_components` dimensions with a TruncatedSVD + Normalizer pipeline.

    Prints a progress message and the summed explained-variance ratio of the
    SVD step (as a truncated integer percentage).

    Parameters
    ----------
    X : array-like or sparse matrix
        Feature matrix accepted by TruncatedSVD (e.g. TF-IDF output —
        presumably; confirm against the caller).
    n_components : int, optional
        Number of output dimensions (default 3).

    Returns
    -------
    The transformed, re-normalized matrix produced by the pipeline.
    """
    print("Performing dimensionality reduction using LSA")

    # The SVD output is not normalized, so a Normalizer step is chained after
    # it to re-normalize the reduced samples.
    svd = TruncatedSVD(n_components)
    lsa = make_pipeline(svd, Normalizer(copy=False))
    reduced = lsa.fit_transform(X)

    variance_percent = int(svd.explained_variance_ratio_.sum() * 100)
    report = "Explained variance of the SVD step: {}%"
    print(report.format(variance_percent))

    return reduced

In [None]:
def create_clusters_from_dataframe(df, number_of_clusters = 1):
    """Split a clustered DataFrame into one DataFrame per cluster label.

    For every label ``0 .. number_of_clusters - 1`` the rows whose 'cluster'
    column equals that label are copied into their own frame. Each frame is
    re-indexed from 0, and the row labels the rows had in `df` are kept in a
    leading 'original_index' column. Columns whose name starts with 'Unnamed'
    are dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'cluster' column.
    number_of_clusters : int, optional
        How many consecutive labels, starting at 0, to extract (default 1).

    Returns
    -------
    list of pandas.DataFrame
        One frame per label, in label order. A label with no rows yields an
        empty frame.
    """
    list_of_clusters = []

    for label in range(number_of_clusters):
        members = df.loc[df['cluster'] == label]

        # Capture the row labels before re-indexing. Relying on reset_index()
        # creating a column literally called "index" breaks when the index is
        # named, and renames the wrong column when `df` already has an "index"
        # column.
        original_labels = members.index

        # str() keeps the filter working for non-string column labels.
        kept_columns = [
            column for column in members.columns
            if not str(column).startswith('Unnamed')
        ]
        members = members[kept_columns].reset_index(drop=True)
        members.insert(0, 'original_index', original_labels)

        list_of_clusters.append(members)

    print('I created '+ str(len(list_of_clusters)) + ' clusters.')
    return list_of_clusters

In [None]:
def draw_word_cloud(text):
    """Render and show a word cloud of `text` (at most 50 words, white background).

    Parameters
    ----------
    text : str
        Text handed to WordCloud.generate.

    Returns
    -------
    None
    """
    cloud_builder = WordCloud(max_words=50, background_color="white")
    cloud_image = cloud_builder.generate(text)

    # Show the rendered cloud without axis decorations.
    plt.imshow(cloud_image, interpolation='bilinear')
    plt.axis("off")
    plt.show()

    return

In [None]:
def get_index_from_title(search_title,df,clusters_list):
    """Find a wine by (part of) its title and report where it lives.

    Parameters
    ----------
    search_title : str
        Text to look for inside the 'title' column. It is matched literally,
        not as a regular expression.
    df : pandas.DataFrame
        Table with 'title' and 'cluster' columns.
    clusters_list : list of pandas.DataFrame
        Per-cluster frames, indexable by cluster label, each with an
        'original_index' column holding row labels of `df`.

    Returns
    -------
    tuple
        ``(idx_in_df, which_cluster, idx)``: the row label of the first
        matching title in `df`, its cluster label, and the row label of that
        wine inside its cluster frame. ``(-1, -1, -1)`` is returned, after
        printing a message, when no title matches.

    Raises
    ------
    KeyError
        If `df` has no 'title' or 'cluster' column.
    IndexError
        If the matching wine is absent from its cluster frame.
    """
    # Literal matching: wine titles such as "... (Douro)" contain regex
    # metacharacters, so a regex search would not find a title by its own
    # text. na=False makes rows with a missing title count as "no match"
    # instead of producing an unusable boolean mask.
    title_matches = df['title'].str.contains(search_title, regex=False, na=False)
    matching_labels = df.loc[title_matches].index

    # Explicit "not found" check instead of a bare except, so that unrelated
    # errors (e.g. a missing column) are no longer reported as "not found".
    if len(matching_labels) == 0:
        print('Cannot find this name in wine titles.')
        return (-1, -1, -1)

    idx_in_df = matching_labels[0]
    which_cluster = df.loc[idx_in_df, 'cluster']
    c = clusters_list[which_cluster]

    # Position of the wine inside its own cluster frame.
    idx = c[c.original_index == idx_in_df].index.values[0]

    return (idx_in_df, which_cluster, idx)

In [None]:
def plot_variety_in_cluster(cluster, n = 20, title = None):
    """Draw a horizontal bar plot of the most common varieties in a cluster.

    Parameters
    ----------
    cluster : pandas.DataFrame
        Frame with a 'variety' column.
    n : int, optional
        Number of most frequent varieties to show (default 20).
    title : str, optional
        Title passed to the plot.

    Returns
    -------
    None
        The plot is drawn on a new matplotlib figure.
    """
    # value_counts sorts by descending count, so the first n rows are the top n.
    variety_counts = cluster['variety'].value_counts().iloc[:n]

    # Name the column explicitly instead of renaming it from "variety": the
    # name of the Series returned by value_counts differs between pandas
    # versions (the column name before 2.0, 'count' afterwards), so a rename
    # keyed on "variety" can silently leave no 'frequency' column.
    variety_counts = variety_counts.to_frame(name="frequency")

    plt.figure(figsize=(10, 10))
    plt.title(title)
    sns.barplot(y=variety_counts.index, x='frequency', data = variety_counts, palette="Greens_d")

    return