In [2]:
import pandas as pd

In [3]:
npr = pd.read_csv('winemag-data-130k-v2.csv')

In [5]:
npr = npr[['description']]

In [6]:
npr

Unnamed: 0,description
0,"Aromas include tropical fruit, broom, brimston..."
1,"This is ripe and fruity, a wine that is smooth..."
2,"Tart and snappy, the flavors of lime flesh and..."
3,"Pineapple rind, lemon pith and orange blossom ..."
4,"Much like the regular bottling from 2012, this..."
...,...
129966,Notes of honeysuckle and cantaloupe sweeten th...
129967,Citation is given as much as a decade of bottl...
129968,Well-drained gravel soil gives this wine its c...
129969,"A dry style of Pinot Gris, this is crisp with ..."


In [8]:
npr['description'][0]

"Aromas include tropical fruit, broom, brimstone and dried herb. The palate isn't overly expressive, offering unripened apple, citrus and dried sage alongside brisk acidity."

In [9]:
from sklearn.feature_extraction.text import CountVectorizer

In [11]:
cv = CountVectorizer(max_df=0.9, min_df=2, stop_words='english')
# maxdf what it does is when you're building out the vocabulary we're going to ignore certain terms that have really
# high document frequency so this essentially gets rid of terms that are really common across a lot of the
# documents so you can pass in a number between 0 and 1 here so such as
# 90 or 0.9
# min_df : the % of words that occur minimum number of times

In [12]:
doc_term_matrix = cv.fit_transform(npr['description'])

In [13]:
doc_term_matrix

<129971x20340 sparse matrix of type '<class 'numpy.int64'>'
	with 3049599 stored elements in Compressed Sparse Row format>

* 129971x20340 is documents by words

In [14]:
from sklearn.decomposition import LatentDirichletAllocation

In [15]:
LDA = LatentDirichletAllocation(n_components=10, random_state=42)

In [16]:
LDA.fit(doc_term_matrix)

In [None]:
# Grab the vocabulary of words

In [18]:
len(cv.get_feature_names_out())

20340

In [19]:
type(cv.get_feature_names_out())

numpy.ndarray

In [20]:
cv.get_feature_names_out()[15000]

'ribs'

In [23]:
import random

random_word_id = random.randint(0, 20340 )

cv.get_feature_names_out()[random_word_id]

'candia'

In [None]:
# Grab the topics

In [24]:
len(LDA.components_)

10

In [25]:
type(LDA.components_)

numpy.ndarray

In [26]:
LDA.components_.shape

(10, 20340)

In [27]:
LDA.components_

array([[ 2.40777685,  0.1       ,  0.10000777, ...,  0.10000267,
         0.10000169,  0.1       ],
       [ 0.10001058,  0.10001714,  1.099979  , ...,  0.10000893,
         0.11764699,  0.1       ],
       [15.49750635,  3.6254816 ,  0.10000797, ...,  0.10000031,
         0.10000722,  0.1       ],
       ...,
       [ 3.55680152,  0.1000114 ,  0.10000104, ...,  0.10000524,
         0.10000094,  0.1       ],
       [ 1.44351386,  0.10000358,  0.10000224, ...,  0.10000118,
         0.10010298,  0.1       ],
       [ 0.10000685,  1.17207269,  0.1       , ...,  0.1000359 ,
         6.73014708,  0.1002451 ]])

In [28]:
single_topic = LDA.components_[0]

In [29]:
single_topic.argsort()

array([17344, 18414, 15995, ...,  4751,   511, 20023])

In [30]:
# ARGSORT returns INDEX POSITIONS SORTED FROM LEAST -> GREATEST
# TOP 10  VALUES (10 GREATEST VALUES)
# LAST 10 VALUES OF ARGSORT
single_topic.argsort()[-10:]

array([16626, 14564,  7179, 10354,  7536,  7613,  5811,  4751,   511,
       20023])

In [31]:
top_ten_words = single_topic.argsort()[-10:]

In [32]:
for index in top_ten_words:
    print(cv.get_feature_names_out()[index])

soft
red
flavors
light
fresh
fruity
drink
crisp
acidity
wine


In [None]:
# Grab the highest probability of words per topic

In [33]:
for index, topic in enumerate(LDA.components_):
    print(f"The top 15 words for TOP #{index}")
    print([cv.get_feature_names_out()[index] for index in topic.argsort()[-15:]])
    print("\n")
    print("\n")

The top 15 words for TOP #0
['fruit', 'ready', 'ripe', 'bright', 'texture', 'soft', 'red', 'flavors', 'light', 'fresh', 'fruity', 'drink', 'crisp', 'acidity', 'wine']




The top 15 words for TOP #1
['franc', 'cherry', 'wine', 'spice', 'chocolate', 'blackberry', 'aromas', 'tannins', 'dark', 'fruit', 'merlot', 'sauvignon', 'black', 'blend', 'cabernet']




The top 15 words for TOP #2
['good', 'notes', 'oak', 'bit', 'wine', 'green', 'feels', 'plum', 'herbal', 'berry', 'palate', 'fruit', 'aromas', 'finish', 'flavors']




The top 15 words for TOP #3
['soft', 'ripe', 'black', 'pinot', 'oak', 'drink', 'blackberry', 'sweet', 'rich', 'cherries', 'tannins', 'dry', 'cherry', 'wine', 'flavors']




The top 15 words for TOP #4
['shows', 'aromas', 'soft', 'dark', 'spice', 'oak', 'flavors', 'bodied', 'red', 'finish', 'vineyard', 'black', 'cherry', 'fruit', 'wine']




The top 15 words for TOP #5
['years', 'rich', 'french', 'vintage', 'drink', 'pinot', 'toast', 'acidity', 'vanilla', 'chardonnay', 'n

In [34]:
topic_results = LDA.transform(doc_term_matrix)

In [36]:
topic_results[0].round(2)

array([0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.96, 0.  ])

In [37]:
topic_results[0].argmax()

8

In [38]:
npr['topic'] = topic_results.argmax(axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  npr['topic'] = topic_results.argmax(axis=1)


In [39]:
npr

Unnamed: 0,description,topic
0,"Aromas include tropical fruit, broom, brimston...",8
1,"This is ripe and fruity, a wine that is smooth...",7
2,"Tart and snappy, the flavors of lime flesh and...",8
3,"Pineapple rind, lemon pith and orange blossom ...",9
4,"Much like the regular bottling from 2012, this...",3
...,...,...
129966,Notes of honeysuckle and cantaloupe sweeten th...,9
129967,Citation is given as much as a decade of bottl...,5
129968,Well-drained gravel soil gives this wine its c...,7
129969,"A dry style of Pinot Gris, this is crisp with ...",7


In [40]:
npr.topic.value_counts()

topic
2    15818
4    15506
7    15444
8    14958
6    14879
3    13714
9    13678
0    10876
5     8636
1     6462
Name: count, dtype: int64