In [1]:
import pandas as pd

In [2]:
# Load the raw CSV into a DataFrame; the following cell keeps only the
# 'description' text column, which is the corpus used for topic modelling.
npr = pd.read_csv('winemag-data-130k-v2.csv')

In [3]:
# Keep every row but only the free-text 'description' column; the list selector
# keeps the result a one-column DataFrame rather than a Series.
npr = npr.loc[:, ['description']]

In [4]:
npr

Unnamed: 0,description
0,"Aromas include tropical fruit, broom, brimston..."
1,"This is ripe and fruity, a wine that is smooth..."
2,"Tart and snappy, the flavors of lime flesh and..."
3,"Pineapple rind, lemon pith and orange blossom ..."
4,"Much like the regular bottling from 2012, this..."
...,...
129966,Notes of honeysuckle and cantaloupe sweeten th...
129967,Citation is given as much as a decade of bottl...
129968,Well-drained gravel soil gives this wine its c...
129969,"A dry style of Pinot Gris, this is crisp with ..."


In [8]:
# Show the full text of the review at index label 0 (single label-based lookup).
npr.loc[0, 'description']

"Aromas include tropical fruit, broom, brimstone and dried herb. The palate isn't overly expressive, offering unripened apple, citrus and dried sage alongside brisk acidity."

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [6]:
# Vocabulary pruning settings for the TF-IDF vectorizer:
# - max_df=0.95: ignore terms that appear in more than 95% of the documents
#   (too common to help distinguish topics). A float in [0.0, 1.0] is read as a
#   proportion of documents; an integer is read as an absolute document count.
# - min_df=2: ignore terms that appear in fewer than 2 documents
#   (too rare to characterise a topic). The same float/int rule applies.
# - stop_words='english': also drop scikit-learn's built-in English stop words.
tfidf = TfidfVectorizer(max_df=0.95, min_df=2, stop_words='english')

In [7]:
# Learn the vocabulary from the descriptions and encode them in one pass.
# The result is a sparse matrix with one row per document and one column per
# vocabulary term.
doc_term_matrix = tfidf.fit_transform(npr['description'])

In [8]:
doc_term_matrix

<129971x20340 sparse matrix of type '<class 'numpy.float64'>'
	with 3049599 stored elements in Compressed Sparse Row format>

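* The matrix shape 129971x20340 is (documents x vocabulary terms): one row per description and one column per word kept in the TF-IDF vocabulary.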

In [9]:
from sklearn.decomposition import NMF

In [11]:
# Import the estimator class under an alias so this cell can be re-run safely.
# Writing `NMF = NMF(...)` rebinds the class name to the model instance, so a
# second execution of the cell fails with "TypeError: 'NMF' object is not callable".
# The model is still bound to the name `NMF` because later cells refer to it
# by that name.
from sklearn.decomposition import NMF as NMFEstimator

# Factorise the corpus into 5 topics; random_state fixes the seed so the
# topics are reproducible between runs.
NMF = NMFEstimator(n_components=5, random_state=42)

In [12]:
# Fit the topic model on the TF-IDF document-term matrix; the learned
# per-topic term weights are read from `components_` in a later cell.
NMF.fit(doc_term_matrix)

In [None]:
# Grab the vocabulary of words

In [13]:
len(tfidf.get_feature_names_out())

20340

In [14]:
type(tfidf.get_feature_names_out())

numpy.ndarray

In [15]:
tfidf.get_feature_names_out()[15000]

'ribs'

In [16]:
import random

# Fetch the vocabulary once and size the draw from it instead of hard-coding 20340.
feature_names = tfidf.get_feature_names_out()

# random.randint(a, b) includes b, so randint(0, 20340) could return 20340 and
# index one past the end of the vocabulary (IndexError). randrange(n) draws
# from 0..n-1, which is exactly the set of valid positions.
random_word_id = random.randrange(len(feature_names))

feature_names[random_word_id]

'interact'

In [26]:
# Look the vocabulary up once: get_feature_names_out() rebuilds the vocabulary
# array on every call, and it was previously invoked for every printed word.
feature_names = tfidf.get_feature_names_out()

# Each row of components_ holds one topic's weight for every vocabulary term.
for topic_id, topic in enumerate(NMF.components_):
    print(f"The top 15 words for TOP #{topic_id}")
    # argsort is ascending, so the last 15 positions are the highest-weighted
    # terms, listed from weakest to strongest. A distinct name (word_id) avoids
    # shadowing the topic counter inside the comprehension.
    print([feature_names[word_id] for word_id in topic.argsort()[-15:]])
    print("\n")

The top 15 words for TOP #0
['merlot', 'notes', 'cherry', 'sauvignon', 'chocolate', 'aromas', 'sweet', 'plum', 'blend', 'blackberry', 'fruit', 'cabernet', 'oak', 'finish', 'flavors']


The top 15 words for TOP #1
['years', 'dense', 'black', 'age', 'firm', 'structure', 'aging', 'fruit', 'drink', 'wood', 'tannins', 'fruits', 'rich', 'ripe', 'wine']


The top 15 words for TOP #2
['fruit', 'acidity', 'fresh', 'flavors', 'aromas', 'lime', 'green', 'finish', 'pear', 'palate', 'lemon', 'peach', 'citrus', 'white', 'apple']


The top 15 words for TOP #3
['clove', 'opens', 'licorice', 'dried', 'spice', 'red', 'berry', 'pepper', 'offers', 'alongside', 'aromas', 'tannins', 'palate', 'cherry', 'black']


The top 15 words for TOP #4
['fruits', 'aftertaste', 'texture', 'bright', 'attractive', 'fresh', 'ready', 'soft', 'red', 'drink', 'light', 'crisp', 'acidity', 'wine', 'fruity']




In [18]:
# Project every document onto the fitted topics: row i holds document i's
# weight for each of the 5 components.
topic_results = NMF.transform(doc_term_matrix)

In [19]:
topic_results[0].round(2)

array([0.  , 0.  , 0.03, 0.02, 0.  ])

In [20]:
topic_results[0].argmax()

2

In [22]:
topic_results.argmax(axis=1)

array([2, 1, 2, ..., 1, 1, 1])

In [23]:
# Label each review with its dominant topic: the column index of the largest
# weight in that review's row of topic_results.
npr['topic'] = topic_results.argmax(axis=1)

In [24]:
npr

Unnamed: 0,description,topic
0,"Aromas include tropical fruit, broom, brimston...",2
1,"This is ripe and fruity, a wine that is smooth...",1
2,"Tart and snappy, the flavors of lime flesh and...",2
3,"Pineapple rind, lemon pith and orange blossom ...",2
4,"Much like the regular bottling from 2012, this...",0
...,...,...
129966,Notes of honeysuckle and cantaloupe sweeten th...,2
129967,Citation is given as much as a decade of bottl...,0
129968,Well-drained gravel soil gives this wine its c...,1
129969,"A dry style of Pinot Gris, this is crisp with ...",1


In [25]:
# Count how many reviews were assigned to each dominant topic.
npr['topic'].value_counts()

topic
2    33910
0    33460
1    25449
3    21873
4    15279
Name: count, dtype: int64