# NMF Topic Modeling With Dot Product Similarity Reccomender 


In [2]:
import pandas as pd 
import streamlit as st 
import altair as alt
from PIL import Image 

import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, ENGLISH_STOP_WORDS
from sklearn.metrics.pairwise import cosine_similarity 
from sklearn.decomposition import NMF, TruncatedSVD, LatentDirichletAllocation
import warnings
warnings.filterwarnings("ignore")

from corextopic import corextopic as ct
from corextopic import vis_topic as vt

In [3]:
# read in wine data
# drop duplicates and fill na with zeros 
WINE = pd.read_csv('WINENLP.csv')
WINE.drop_duplicates(subset=['description'],inplace=True)
WINE.fillna(0,inplace=True)
shp = WINE.shape
shp[0]
WINES = WINE[['description','country','province','variety','price','title']]
WINES.reset_index(inplace=True,drop=True)



In [4]:
def augmented_corpus(WINE,shp):
    TEXT = [[WINE['country'][i] for i in range(shp)],[WINE['province'][i] for i in range(shp)],
            [WINE['variety'][i] for i in range(shp)]]
   
    # concatenate features into sentance to add to descriptions
    TO_ADD = ["This Wine is from " + str(TEXT[0][j]) + " in " + str(TEXT[1][j]) + 
          " and is a " + str(TEXT[2][j]) for j in range(shp)]
   

    DESCRIPTIONS_PLUS = [WINE['description'][i] + " " + TO_ADD[i] for i in range(shp)]
   
    return DESCRIPTIONS_PLUS

DP = augmented_corpus(WINES,shp[0])
WINES["PlusDescriptions"] = DP


In [5]:
# define corpus'
CORPUS1 = WINES['description']
CORPUS2 = WINES['PlusDescriptions']


In [6]:
# augment english stop words list 
STOP = stopwords.words('english')
STOP += ['notes','wine','drink','region','nan','nose',
        'like','made','shows','parts','style','followed','amounts','mouth','flavors']


In [7]:
# instantiate Tfid vectorizer
TF_IDF = TfidfVectorizer(stop_words=STOP,token_pattern = r'(?u)\b[A-Za-z]+\b')

# fit transform returns Sparse Matrix 
Description_Matrix = TF_IDF.fit_transform(CORPUS1)

# words used
words = TF_IDF.get_feature_names()

Description_Matrix


<119955x29722 sparse matrix of type '<class 'numpy.float64'>'
	with 2693261 stored elements in Compressed Sparse Row format>

In [8]:
def fit(df):
    num_components = int(input("How many topics: "))
    # instantiating Non Negative Matrix Factorization Model
    nmf = NMF(n_components=num_components,random_state=84)
    # fitting model 
    nmf.fit(df)
    return nmf , num_components
nmf = fit(Description_Matrix)

How many topics: 30


In [10]:
Topic_Matrix = nmf[0].transform(Description_Matrix)
Topic_Matrix_df = pd.DataFrame(Topic_Matrix).add_prefix('topic_')
Topics = pd.concat([Topic_Matrix_df, WINES], axis=1)
Topics.dropna(inplace=True)
Topics.reset_index(inplace=True,drop=True)



In [11]:
from sklearn.metrics.pairwise import cosine_similarity
Topic_Matrix_Partial = Topic_Matrix[:20000]

In [57]:
Cosine_Matrix = cosine_similarity(Topic_Matrix_Partial)
import pickle
with open('cosine_sim_array.pkl', 'wb') as f:
    pickle.dump(Cosine_Matrix, f)

In [29]:
map_wines = {WINES['title'][i]:i for i in range(20000)}

In [30]:
selection = map_wines['Nicosia 2013 Vulkà Bianco  (Etna)']

In [40]:
Cosine_List = list(Cosine_Matrix[selection])

Cosine_Pair = [[Cosine_List[i],i,WINES['price'][i]] for i in range(20000)]
Cosine_Pair_Sorted = sorted(Cosine_Pair,reverse=True)

In [45]:
cs = pd.DataFrame(Cosine_Pair_Sorted, columns = {'Similarity','Index','Price'})

In [47]:
new_cs = cs.rename(columns={'Index':'Similarity','Price':'Index','Similarity':'Price'})

In [51]:
price_mask = (new_cs['Price'] <=40) & (new_cs['Price'] >=20)

In [54]:
n_cs new_cs[price_mask]['Index']

1        1526
7        6871
14       4112
16       1391
17       7074
         ... 
19989    1021
19992     771
19994     642
19997     473
19999      71
Name: Index, Length: 7723, dtype: int64

In [22]:
map_wines = {WINES['title'][i]:i for i in range(20000)}

In [None]:
MAKE_REC = [[COS_MATRIX[map_wines[wine_selection]][i],i] for i in range(20000)]

In [38]:
COS_MATRIX = pd.read_pickle("COSMAT.pkl")
COS_MATRIX[0]

0        1.000000
1        0.009529
2        0.237180
3        0.084969
4        0.026763
           ...   
19995    0.001972
19996    0.018191
19997    0.000000
19998    0.155912
19999    0.030917
Name: 0, Length: 20000, dtype: float64

In [39]:
list(COS_MATRIX[0])

[0.9999999999999999,
 0.009529328194539182,
 0.2371798457861164,
 0.08496898974754748,
 0.026762668845769815,
 0.13594299845055013,
 0.19420370743716173,
 0.11419026493695683,
 0.08130371903666288,
 0.22402237609612125,
 0.007935864917736228,
 0.4946578471631417,
 0.06659385595911109,
 0.1338878822732761,
 0.2743461858339128,
 0.2710966655386662,
 0.10402513529460028,
 0.02047678932289176,
 0.12146267939254216,
 0.10480025520203533,
 0.08169821158442249,
 0.09651178772671525,
 0.600345928060175,
 0.27339220582095897,
 0.1626795213537437,
 0.09197321428167762,
 0.5292499279118122,
 0.31599601186857873,
 0.31636259361492275,
 0.373813675689054,
 0.12351669857300616,
 0.3270675584749007,
 0.2730372088655977,
 0.016404532877215904,
 0.5412251782657785,
 0.18703116950720208,
 0.2782000215866476,
 0.30467101407868724,
 0.14590691452505872,
 0.21583447639750467,
 0.5690287894141933,
 0.15698715335256733,
 0.12810215832291508,
 0.7102452574119155,
 0.22729375054772216,
 0.017488612011274617,
 

## Similarity Scores
Using one wine as base comparison for similarity 

In [10]:
# defining function for reccomendation system 
def Similarity_Score(num_components,df1,comparison,column_name):
    Topic_Scores = df1.iloc[:,0:num_components]
    WineA = np.array(Topic_Scores.iloc[comparison])
    Similar = [[np.dot(WineA,np.array(Topic_Scores.iloc[i])),df1['variety'][i],df1['price'][i], 
                df1[column_name][i],df1['title'][i]] for i in range(119955)]
    #np.dot(WineA,np.array(Topic_Scores.iloc[3432]))
    max(Similar)
    TOP = sorted(Similar, reverse=True)
    return TOP,Similar

In [11]:
# chose wine of comparison 
base_line = int(input("Chose Wine #: "))

# Similarity List 
TOP = Similarity_Score(5,Topics,base_line,'description')

# list of similar wines in certain price range 
Scores_Details = pd.DataFrame(TOP[0])

# rename columns
Scores_Details.rename(columns={0: 'SimilarityScore', 1: 'Variety',2:'Price',3:'Description',4:'WineName'}, inplace=True)



Chose Wine #: 4734


In [30]:
def Most_Similar(Topics,Score_Details,base,column_name):
    WN = []
    print(Topics[column_name][base])
    print("Price: ",Topics['price'][base])
    print("Wine Name: ",Topics['title'][base])
    print("Variety: ",Topics['variety'][base])
    print("")
    print("Most Similar")
    print("")
    for i in range(3):
        print(Score_Details['Description'][i])
        print("Price: ", Score_Details['Price'][i])
        print("Wine Name: ",Score_Details['WineName'][i])
        print("Variety: ",Score_Details['Variety'][i])
        print("")
        WN.append(Score_Details['WineName'][i])
    return WN

WN = Most_Similar(Topics,Scores_Details,base_line,'description')


Bright, forward fruit and spice characterize this very rich, complex wine. The aromas veer toward banana and ripe peach and pear, with oriental spices, vanilla, and toast. In the mouth, the flavors are fruity, with a leesy streak. High acidity makes it a little sharp, while the texture is creamy. All in all, it's very balanced, except for that sharpness, which a year or two of aging should mellow.
Price:  40.0
Wine Name:  Stag's Leap Wine Cellars 1998 Beckstoffer Ranch Chardonnay (Napa Valley)
Variety:  Chardonnay

Most Similar

With its wood aging and rich, concentrated black cherry fruits, this is an impressive wine. Aged for 18 months in wood, it is a special selection that brings out rich fruits, a ripe structure and some aging potential. Drink from 2016.
Price:  35.0
Wine Name:  Domaine de Leyre-Loup 2011 Réserve Louis Leyre-Loup  (Fleurie)
Variety:  Gamay

This is a rich wine, full of rounded black fruits and soft tannins. It is a young style of wine with no wood aging, just the 

In [13]:
# join string on dash for website url formatting
def add_dash(WN):
    dashed=[]
    for i in range(len(WN)):
        sp = WN[i].split()
        st = '-'.join(sp).lower()
        dashed.append(st)
    return dashed

# remove any accents
import unicodedata
def remove_accents(st_list):
    asci = []
    for i in range(len(st_list)):
        nfkd_form = unicodedata.normalize('NFKD', st_list[i])
        only_ascii = nfkd_form.encode('ASCII', 'ignore')
        asci.append(only_ascii)
    return asci 

# change from byte to str and remove parenthesis 
def change_type_replace(byte_list):
    clean_strings = []
    for i in range(len(byte_list)):
        clean_string = str(only_ascii_list[i]).replace("b'",'').replace("'",'').replace("(",'').replace(")",'')
        clean_strings.append(clean_string)
    return clean_strings


In [14]:
from bs4 import BeautifulSoup
import requests
import time, os
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import pandas as pd

In [15]:
# scrape alcohol percentage from winemag website 
def get_alch_percentage(clean_strings):
    alch_percentage = []
    for i in range(len(clean_strings)):
        chromedriver = "/Applications/chromedriver" 
        os.environ["webdriver.chrome.driver"] = chromedriver

        url = 'https://www.winemag.com/buying-guide/{}/'.format(clean_strings[i])

        response = requests.get(url)
        page = response.text
        soup = BeautifulSoup(page, "lxml")
        driver = webdriver.Chrome(chromedriver)
        driver.get(url)
        driver.page_source[:]
        time.sleep(3)
        
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        soup.find_all()
        alch = soup.find_all(class_ = 'info small-9 columns')
        if alch != []:
            # the issue is right here(below). it goes to the wrong page because the wine doesnt exist 
            # so the code i wrote to extract it is useless and it raises an index error 
            percentage = alch[0].text.replace("\n",'').replace("%",'')
            alch_percentage.append(percentage)
        else:
            alch_percentage.append('0')
        print(i)
        print(alch_percentage)
        driver.close()
    return alch_percentage


In [16]:
#calling functions for scraping
dashes = add_dash(WN)
only_ascii_list = remove_accents(dashes)
clean_strings = change_type_replace(only_ascii_list)
#Alcohol_Percentages = get_alch_percentage(clean_strings)

In [17]:
# cleaning alcohol percentage values 
for i in range(len(Alcohol_Percentages)):
    if Alcohol_Percentages[i] == 'N/A':
        Alcohol_Percentages[i] = '0'
    Alcohol_Percentages[i] = float(Alcohol_Percentages[i])

count = 0
total = 0
for i in range(len(Alcohol_Percentages)):
    if Alcohol_Percentages[i] != 0:
        total = total + Alcohol_Percentages[i]
        count += 1
Mean_AVB = total/count
    


for i in range(len(Alcohol_Percentages)):
    if Alcohol_Percentages[i] == 0:
        Alcohol_Percentages[i] = mean_alch
ABV = pd.DataFrame(Alcohol_Percentages,columns = {"ABV"})


## Finding Most Similar Wines Based on Price Constrains

In [18]:
def apply_price_mask(df,lower,upper):
    mask = (df['Price'] >= float(lower)) & (df['Price'] < float(upper))
    return df[mask]
P_min = input("Enter min Price: ")
P_max = input("Enter max Price: ")
Price = apply_price_mask(Scores_Details,P_min,P_max)

Enter min Price: 20
Enter max Price: 30


In [19]:
# Sorting by Price Range 
Price_Sorted = Price.sort_values(by='SimilarityScore', ascending=False)
Price_Sorted.reset_index(inplace=True,drop=True)

# Similar Wines Given Price Range 

In [20]:
def Most_Similar_Given_Price(Price_Sorted,Topics,base):
    
    Titles = []
    
    print(Topics['description'][base])
    print("Price: ",Topics['price'][base])
    print("Wine Name: ",Topics['title'][base])
    print("Variety: ",Topics['variety'][base])
    print("")
    print("Top 5 Most Similar")
    for i in range(30):
        print("")
        print(Price_Sorted['Description'][i])
        print("Price: ",Price_Sorted['Price'][i])
        print("Name: ",Price_Sorted['WineName'][i])
        print("Variety: ",Price_Sorted['Variety'][i])
        print("")
        Titles.append(Price_Sorted['WineName'][i])
    
    return Titles

Titles = Most_Similar_Given_Price(Price_Sorted,Topics,base_line)

Bright, forward fruit and spice characterize this very rich, complex wine. The aromas veer toward banana and ripe peach and pear, with oriental spices, vanilla, and toast. In the mouth, the flavors are fruity, with a leesy streak. High acidity makes it a little sharp, while the texture is creamy. All in all, it's very balanced, except for that sharpness, which a year or two of aging should mellow.
Price:  40.0
Wine Name:  Stag's Leap Wine Cellars 1998 Beckstoffer Ranch Chardonnay (Napa Valley)
Variety:  Chardonnay

Top 5 Most Similar

An attractive pale pink wine that has bright red currant fruits, crisp acidity and a lively mousse. It is so fresh, fruity and deliciously crisp and refreshing. The aftertaste is light, bright and full of acidity.
Price:  21.0
Name:  Sauvion NV Brut Rosé Sparkling (Crémant de Loire)
Variety:  Sparkling Blend


The wine is ripe with juicy black-currant fruits and balanced acidity. It has some spice from the wood aging, just enough to smooth out the bright 

In [27]:
JT = Topics.iloc[0:221,0:5]
Regression = pd.concat([JT,ABV],axis=1)

import seaborn as sns
sns.histplot(ABVS)
sns.lineplot(data=ABV.iloc[0:12]

# Topic Analysis

In [21]:

# chosing topic to sort by 
topic = input("What Topic: ")

# defining word matrix 
word_matrix_df = pd.DataFrame(nmf[0].components_, columns=words).T.add_prefix('topic_')

# sorting by topic from Topics Matrix and from word matrix
#Topic_Supplimented = T
Topic_Supplimented = Topics.sort_values(by = topic,ascending=False).reset_index()
WORDS = word_matrix_df.sort_values(by = topic,ascending=False)

What Topic: topic_2


In [24]:
Topic_Supplimented['description'][5]
Topic_Supplimented['variety'][5]
WORDS.head(2)

Unnamed: 0,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,topic_9,...,topic_20,topic_21,topic_22,topic_23,topic_24,topic_25,topic_26,topic_27,topic_28,topic_29
fruits,0.0,0.09493,3.1528,0.0,0.508438,0.0,0.107988,0.0,0.0,0.0,...,0.0,0.015406,0.0,0.105936,0.0,0.0,0.0,0.0,0.122973,0.127374
wood,0.0,0.0,2.635541,0.0,0.0,0.0,0.0,0.0,0.0,0.109217,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0











# Regression analysis ABV% predicting 

In [28]:
Regression.head(3)

In [169]:
# defining features and targets 
# features are topics from 5 componenet topic model 
# target is price of wine 
X = Regression.iloc[:,:30]
y = Regression.iloc[:,30:31]


from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression #LinearRegressionCV, Ridge
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score




X, X_test, y, y_test = train_test_split(X, y, test_size=.25, random_state=99)
X_train, X_val, y_train, y_val = train_test_split(X,y,test_size =.25, random_state =3)


LR = LinearRegression()
LR.fit(X_train, y_train)
y_val_pred = LR.predict(X_val)
val_rmse = mean_squared_error(y_val, y_val_pred, squared=False)
print("R^2 Val: ", r2_score(y_val,y_val_pred))
print("RSME Val: ",val_rmse)


R^2 Val:  0.3076361953566439
RSME Val:  0.6554502889166332


In [171]:
print("Coeifficients: ",LR.coef_)

Coeifficients:  [[ 15.66430375 -17.90274316   9.17487174   6.2265334   -6.55184509]]
