# NMF Topic Modeling With Dot Product Similarity Reccomender 


In [1]:
import pandas as pd 
import streamlit as st 
import altair as alt
from PIL import Image 

import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, ENGLISH_STOP_WORDS
from sklearn.metrics.pairwise import cosine_similarity 
from sklearn.decomposition import NMF, TruncatedSVD, LatentDirichletAllocation
import warnings
warnings.filterwarnings("ignore")

from corextopic import corextopic as ct
from corextopic import vis_topic as vt

In [3]:
# read in wine data
# drop duplicates and fill na with zeros 
WINE = pd.read_csv('WINENLP.csv')
WINE.drop_duplicates(subset=['description'],inplace=True)
WINE.fillna(0,inplace=True)
shp = WINE.shape
shp[0]
WINES = WINE[['description','country','province','variety','price','title']]
WINES.reset_index(inplace=True,drop=True)



# Augmented Corpus
The augmented corpus adds an additional sentence to each document in the data set

It adds country or origin, province, and variety(or wine type)

In [4]:
def augmented_corpus(WINE,shp):
    TEXT = [[WINE['country'][i] for i in range(shp)],[WINE['province'][i] for i in range(shp)],
            [WINE['variety'][i] for i in range(shp)]]
   
    # concatenate features into sentance to add to descriptions
    TO_ADD = ["This Wine is from " + str(TEXT[0][j]) + " in " + str(TEXT[1][j]) + 
          " and is a " + str(TEXT[2][j]) for j in range(shp)]
   

    DESCRIPTIONS_PLUS = [WINE['description'][i] + " " + TO_ADD[i] for i in range(shp)]
   
    return DESCRIPTIONS_PLUS

DP = augmented_corpus(WINES,shp[0])
WINES["PlusDescriptions"] = DP


In [5]:
# define corpus'
CORPUS1 = WINES['description']
CORPUS2 = WINES['PlusDescriptions']


In [6]:
# augment english stop words list 
STOP = stopwords.words('english')
STOP += ['notes','wine','drink','region','nan','nose',
        'like','made','shows','parts','style','followed','amounts','mouth','flavors']


In [7]:
# instantiate Tfid vectorizer
TF_IDF = TfidfVectorizer(stop_words=STOP,token_pattern = r'(?u)\b[A-Za-z]+\b')

# fit transform returns Sparse Matrix 
Description_Matrix = TF_IDF.fit_transform(CORPUS1)

# words used
words = TF_IDF.get_feature_names()

Description_Matrix


<119955x29722 sparse matrix of type '<class 'numpy.float64'>'
	with 2693261 stored elements in Compressed Sparse Row format>

# Fitting Model
Fit NMF model on 30 components 

In [8]:
def fit(df):
    num_components = int(input("How many topics: "))
    # instantiating Non Negative Matrix Factorization Model
    nmf = NMF(n_components=num_components,random_state=84)
    # fitting model 
    nmf.fit(df)
    return nmf , num_components
nmf = fit(Description_Matrix)

How many topics: 30


In [9]:
Topic_Matrix = nmf[0].transform(Description_Matrix)
Topic_Matrix_df = pd.DataFrame(Topic_Matrix).add_prefix('topic_')
Topics = pd.concat([Topic_Matrix_df, WINES], axis=1)
Topics.dropna(inplace=True)
Topics.reset_index(inplace=True,drop=True)
Topic_Matrix_df

Unnamed: 0,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,topic_9,...,topic_20,topic_21,topic_22,topic_23,topic_24,topic_25,topic_26,topic_27,topic_28,topic_29
0,0.000000,0.007588,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.016464,...,0.017341,0.000000,0.000000,0.000000,0.030234,0.000741,0.000000,0.000000,0.036795,0.000000
1,0.000000,0.005292,0.016141,0.000000,0.025425,0.000000,0.006370,0.016050,0.000000,0.000000,...,0.000000,0.000687,0.000000,0.000461,0.000000,0.000000,0.023647,0.000000,0.000000,0.026883
2,0.000541,0.021572,0.000000,0.000651,0.000404,0.000000,0.000000,0.000000,0.000000,0.000148,...,0.012777,0.000000,0.000444,0.000000,0.000000,0.000000,0.000000,0.000000,0.000902,0.000000
3,0.000000,0.000000,0.000826,0.010464,0.000000,0.000000,0.013849,0.000000,0.000000,0.000482,...,0.000000,0.000000,0.000000,0.000000,0.003944,0.000000,0.001424,0.000000,0.004497,0.000000
4,0.000788,0.000000,0.000369,0.000638,0.000000,0.000000,0.018648,0.002860,0.000000,0.001143,...,0.000000,0.000000,0.001496,0.000000,0.000000,0.010676,0.000000,0.003485,0.000000,0.000015
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
119950,0.000393,0.007485,0.000000,0.003465,0.000760,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.001220,0.022693,0.000000,0.000000,0.000491,0.001548,0.000000,0.000000,0.001215
119951,0.002779,0.000000,0.000000,0.000000,0.000289,0.000146,0.001316,0.015552,0.000000,0.011508,...,0.000289,0.000012,0.000000,0.020292,0.001577,0.000000,0.000000,0.000344,0.000000,0.000000
119952,0.000000,0.010008,0.001342,0.000000,0.000000,0.000000,0.000000,0.033352,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000733,0.024423,0.016572,0.000000,0.000000,0.028868
119953,0.006801,0.024488,0.006456,0.000000,0.000000,0.000000,0.000000,0.051643,0.000000,0.000000,...,0.027876,0.000000,0.000000,0.000000,0.000000,0.031944,0.000000,0.000000,0.000000,0.000000


# Building out Cosine Similarity Matrix

Only using 20,000 wines so the cosine array is easier to handle 

In [10]:

from sklearn.metrics.pairwise import cosine_similarity
Topic_Matrix_Partial = Topic_Matrix[:20000]
#Topic_Matrix_Partial

In [15]:
# Passing topic matrix through the built in cosine similarity function
# can also do in numpy 
Cosine_Matrix = cosine_similarity(Topic_Matrix_Partial)
#import pickle
#with open('cosine_sim_array.pkl', 'wb') as f:
    #pickle.dump(Cosine_Matrix, f)
    

Cosine_Similarity_Df = pd.DataFrame(Cosine_Matrix)
Cosine_Similarity_Df
#Cosine_Similarity_Df.to_csv('SimMatrix.csv')

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,19990,19991,19992,19993,19994,19995,19996,19997,19998,19999
0,1.000000,0.012821,0.231142,0.117084,0.023589,0.119506,0.222300,0.079632,0.076757,0.204590,...,0.029184,0.008740,0.002555,0.148130,0.071443,0.000596,0.040428,0.033426,0.119826,0.036930
1,0.012821,1.000000,0.063273,0.050087,0.139145,0.082101,0.291190,0.183069,0.232164,0.149727,...,0.077203,0.370867,0.148988,0.382695,0.586198,0.130601,0.094801,0.023999,0.154815,0.088765
2,0.231142,0.063273,1.000000,0.605241,0.002240,0.125904,0.197629,0.128048,0.067171,0.375154,...,0.017108,0.000294,0.075446,0.006616,0.212321,0.079605,0.103514,0.000567,0.412903,0.037793
3,0.117084,0.050087,0.605241,1.000000,0.258950,0.184851,0.107825,0.006480,0.113272,0.003966,...,0.017467,0.028178,0.135369,0.056300,0.015403,0.065649,0.033456,0.061347,0.492638,0.132619
4,0.023589,0.139145,0.002240,0.258950,1.000000,0.576276,0.195270,0.303283,0.311376,0.215175,...,0.295388,0.208111,0.120724,0.367171,0.101900,0.159641,0.041350,0.237027,0.323039,0.393935
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19995,0.000596,0.130601,0.079605,0.065649,0.159641,0.223029,0.226968,0.428822,0.029926,0.292409,...,0.269728,0.130770,0.391036,0.422273,0.126591,1.000000,0.713786,0.586980,0.035778,0.066119
19996,0.040428,0.094801,0.103514,0.033456,0.041350,0.112644,0.061612,0.341466,0.036033,0.234681,...,0.084802,0.264647,0.448985,0.344776,0.082781,0.713786,1.000000,0.589890,0.064877,0.237533
19997,0.033426,0.023999,0.000567,0.061347,0.237027,0.393753,0.274180,0.290316,0.022542,0.194819,...,0.272016,0.149123,0.479514,0.302177,0.014750,0.586980,0.589890,1.000000,0.001643,0.636501
19998,0.119826,0.154815,0.412903,0.492638,0.323039,0.088577,0.051887,0.600945,0.451017,0.441815,...,0.503325,0.155346,0.043755,0.390158,0.132642,0.035778,0.064877,0.001643,1.000000,0.084025


## Similarity Scores
Using one wine as base comparison for similarity 

In [22]:
# defining function for reccomendation system 
def Similarity_Score(num_components,df1,comparison,column_name):
    Topic_Scores = df1.iloc[:,0:num_components]
    WineA = np.array(Topic_Scores.iloc[comparison])
    Similar = [[np.dot(WineA,np.array(Topic_Scores.iloc[i])),df1['variety'][i],df1['price'][i], 
                df1[column_name][i],df1['title'][i]] for i in range(119955)]
    #np.dot(WineA,np.array(Topic_Scores.iloc[3432]))
    max(Similar)
    TOP = sorted(Similar, reverse=True)
    return TOP,Similar

In [23]:
# chose wine of comparison 
base_line = int(input("Chose Wine #: "))

# Similarity List 
TOP = Similarity_Score(5,Topics,base_line,'description')

# list of similar wines in certain price range 
Scores_Details = pd.DataFrame(TOP[0])

# rename columns
Scores_Details.rename(columns={0: 'SimilarityScore', 1: 'Variety',2:'Price',3:'Description',4:'WineName'}, inplace=True)



Chose Wine #: 367


In [24]:
def Most_Similar(Topics,Score_Details,base,column_name):
    WN = []
    print(Topics[column_name][base])
    print("Price: ",Topics['price'][base])
    print("Wine Name: ",Topics['title'][base])
    print("Variety: ",Topics['variety'][base])
    print("")
    print("Most Similar")
    print("")
    for i in range(3):
        print(Score_Details['Description'][i])
        print("Price: ", Score_Details['Price'][i])
        print("Wine Name: ",Score_Details['WineName'][i])
        print("Variety: ",Score_Details['Variety'][i])
        print("")
        WN.append(Score_Details['WineName'][i])
    return WN

WN = Most_Similar(Topics,Scores_Details,base_line,'description')


Aromas of mint, berry syrup and dense, lemony oak lay a good foundation for the heavy black fruit that comes on the palate atop a plush carpet of toasty, fresh-grained wood. The finish is round, tasty and mouthfilling, while overall the wine seems to dance due to popping acidity. This is Carmenère with a vital streak.
Price:  10.0
Wine Name:  De Martino 1999 Reserva de la Familia Carmenère (Maipo Valley)
Variety:  Carmenère

Most Similar

With its wood aging and rich, concentrated black cherry fruits, this is an impressive wine. Aged for 18 months in wood, it is a special selection that brings out rich fruits, a ripe structure and some aging potential. Drink from 2016.
Price:  35.0
Wine Name:  Domaine de Leyre-Loup 2011 Réserve Louis Leyre-Loup  (Fleurie)
Variety:  Gamay

This is a rich wine, full of rounded black fruits and soft tannins. It is a young style of wine with no wood aging, just the ripe fruits. Drink this wine now.
Price:  10.0
Wine Name:  Casa Santa Vitória 2014 Versátil 

In [13]:
# join string on dash for website url formatting
def add_dash(WN):
    dashed=[]
    for i in range(len(WN)):
        sp = WN[i].split()
        st = '-'.join(sp).lower()
        dashed.append(st)
    return dashed

# remove any accents
import unicodedata
def remove_accents(st_list):
    asci = []
    for i in range(len(st_list)):
        nfkd_form = unicodedata.normalize('NFKD', st_list[i])
        only_ascii = nfkd_form.encode('ASCII', 'ignore')
        asci.append(only_ascii)
    return asci 

# change from byte to str and remove parenthesis 
def change_type_replace(byte_list):
    clean_strings = []
    for i in range(len(byte_list)):
        clean_string = str(only_ascii_list[i]).replace("b'",'').replace("'",'').replace("(",'').replace(")",'')
        clean_strings.append(clean_string)
    return clean_strings


In [14]:
from bs4 import BeautifulSoup
import requests
import time, os
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import pandas as pd

In [15]:
# scrape alcohol percentage from winemag website 
def get_alch_percentage(clean_strings):
    alch_percentage = []
    for i in range(len(clean_strings)):
        chromedriver = "/Applications/chromedriver" 
        os.environ["webdriver.chrome.driver"] = chromedriver

        url = 'https://www.winemag.com/buying-guide/{}/'.format(clean_strings[i])

        response = requests.get(url)
        page = response.text
        soup = BeautifulSoup(page, "lxml")
        driver = webdriver.Chrome(chromedriver)
        driver.get(url)
        driver.page_source[:]
        time.sleep(3)
        
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        soup.find_all()
        alch = soup.find_all(class_ = 'info small-9 columns')
        if alch != []:
            # the issue is right here(below). it goes to the wrong page because the wine doesnt exist 
            # so the code i wrote to extract it is useless and it raises an index error 
            percentage = alch[0].text.replace("\n",'').replace("%",'')
            alch_percentage.append(percentage)
        else:
            alch_percentage.append('0')
        print(i)
        print(alch_percentage)
        driver.close()
    return alch_percentage


In [16]:
#calling functions for scraping
dashes = add_dash(WN)
only_ascii_list = remove_accents(dashes)
clean_strings = change_type_replace(only_ascii_list)
#Alcohol_Percentages = get_alch_percentage(clean_strings)

In [17]:
# cleaning alcohol percentage values 
for i in range(len(Alcohol_Percentages)):
    if Alcohol_Percentages[i] == 'N/A':
        Alcohol_Percentages[i] = '0'
    Alcohol_Percentages[i] = float(Alcohol_Percentages[i])

count = 0
total = 0
for i in range(len(Alcohol_Percentages)):
    if Alcohol_Percentages[i] != 0:
        total = total + Alcohol_Percentages[i]
        count += 1
Mean_AVB = total/count
    


for i in range(len(Alcohol_Percentages)):
    if Alcohol_Percentages[i] == 0:
        Alcohol_Percentages[i] = mean_alch
ABV = pd.DataFrame(Alcohol_Percentages,columns = {"ABV"})


## Finding Most Similar Wines Based on Price Constrains

In [19]:
def apply_price_mask(df,lower,upper):
    mask = (df['Price'] >= float(lower)) & (df['Price'] < float(upper))
    return df[mask]
P_min = input("Enter min Price: ")
P_max = input("Enter max Price: ")
Price = apply_price_mask(Scores_Details,P_min,P_max)

Enter min Price: 20
Enter max Price: 40


In [20]:
# Sorting by Price Range 
Price_Sorted = Price.sort_values(by='SimilarityScore', ascending=False)
Price_Sorted.reset_index(inplace=True,drop=True)

# Similar Wines Given Price Range 

In [21]:
def Most_Similar_Given_Price(Price_Sorted,Topics,base):
    
    Titles = []
    
    print(Topics['description'][base])
    print("Price: ",Topics['price'][base])
    print("Wine Name: ",Topics['title'][base])
    print("Variety: ",Topics['variety'][base])
    print("")
    print("Top 5 Most Similar")
    for i in range(5):
        print("")
        print(Price_Sorted['Description'][i])
        print("Price: ",Price_Sorted['Price'][i])
        print("Name: ",Price_Sorted['WineName'][i])
        print("Variety: ",Price_Sorted['Variety'][i])
        print("")
        Titles.append(Price_Sorted['WineName'][i])
    
    return Titles

Titles = Most_Similar_Given_Price(Price_Sorted,Topics,base_line)

This wine contains some material over 100 years old, but shows no signs of fragility. Instead, it's concentrated through age and should hold in the bottle indefinitely. It's dark coffee-brown in color, with delectable aromas of rancio, dried fig, molasses and black tea, yet despite enormous concentration avoids excessive weight. And it's amazingly complex and fresh on the nearly endless finish.
Price:  350.0
Wine Name:  Chambers Rosewood Vineyards NV Rare Muscat (Rutherglen)
Variety:  Muscat

Top 5 Most Similar

Sugary and simple, with sweet cola, licorice, blackberry and cherry jam flavors. The tannins are very finely structured, but the wine is just too sweet.
Price:  24.0
Name:  South Coast 2005 Rolling Hills Estate Vineyard Syrah (Temecula)
Variety:  Syrah


A simple wine, it tastes soft and slightly sweet, with candied raspberry flavors.
Price:  26.0
Name:  Dancing Creek 2009 Pinot Noir (Santa Cruz Mountains)
Variety:  Pinot Noir


Shows crisp acidity and sweet, jammy orange, pine

In [27]:
JT = Topics.iloc[0:221,0:5]
Regression = pd.concat([JT,ABV],axis=1)

import seaborn as sns
sns.histplot(ABVS)
sns.lineplot(data=ABV.iloc[0:12]

# Topic Analysis

In [21]:

# chosing topic to sort by 
topic = input("What Topic: ")

# defining word matrix 
word_matrix_df = pd.DataFrame(nmf[0].components_, columns=words).T.add_prefix('topic_')

# sorting by topic from Topics Matrix and from word matrix
#Topic_Supplimented = T
Topic_Supplimented = Topics.sort_values(by = topic,ascending=False).reset_index()
WORDS = word_matrix_df.sort_values(by = topic,ascending=False)

What Topic: topic_2


In [24]:
Topic_Supplimented['description'][5]
Topic_Supplimented['variety'][5]
WORDS.head(2)

Unnamed: 0,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,topic_9,...,topic_20,topic_21,topic_22,topic_23,topic_24,topic_25,topic_26,topic_27,topic_28,topic_29
fruits,0.0,0.09493,3.1528,0.0,0.508438,0.0,0.107988,0.0,0.0,0.0,...,0.0,0.015406,0.0,0.105936,0.0,0.0,0.0,0.0,0.122973,0.127374
wood,0.0,0.0,2.635541,0.0,0.0,0.0,0.0,0.0,0.0,0.109217,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
