# CorEx Topic Model on Wine Descriptions

In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, ENGLISH_STOP_WORDS
from sklearn.decomposition import NMF, TruncatedSVD, LatentDirichletAllocation
import warnings
warnings.filterwarnings("ignore")

from corextopic import corextopic as ct
from corextopic import vis_topic as vt

In [2]:
# Read in the wine data, drop rows whose description is duplicated, and fill
# missing values with zeros.
# The steps are chained without `inplace=True`, so re-running this cell always
# rebuilds WINE from the CSV instead of mutating an existing frame.
WINE = (
    pd.read_csv('winedesc.csv')
    .drop_duplicates(subset=['description'])
    .fillna(0)
)
shp = WINE.shape  # (rows, columns) after de-duplication

# Keep only the columns of interest. `reset_index(drop=True)` returns a new
# frame with a fresh 0..n-1 index, so WINES is an independent object rather
# than a slice of WINE that is modified in place, and no temporary 'index'
# column has to be deleted afterwards.
WINES = WINE[['description','country','province','variety','price','title']].reset_index(drop=True)

In [11]:
def augmented_corpus(WINE,shp):
    """Append a sentence naming each wine's origin and variety to its description.

    For each of the first `shp` rows the result is
    ``description + " This Wine is from <country> in <province> and is a <variety>"``.

    Rows are read by position, so the frame's index does not have to be the
    default 0..n-1 range (it may, for example, have gaps left behind by
    ``drop_duplicates``).

    Parameters
    ----------
    WINE : pandas.DataFrame
        Must contain 'description', 'country', 'province' and 'variety'
        columns. 'description' values are joined with ``+`` and therefore
        have to be strings; the other three are passed through ``str()``.
    shp : int
        Number of leading rows to process.

    Returns
    -------
    list of str
        One augmented description per processed row, in row order.

    Raises
    ------
    IndexError
        If `shp` is larger than the number of rows in `WINE`.
    """
    # Pull every column out once as a plain Python list: this gives positional
    # access regardless of the index labels and avoids a scalar Series lookup
    # per row and per column.
    descriptions = WINE['description'].tolist()
    countries = WINE['country'].tolist()
    provinces = WINE['province'].tolist()
    varieties = WINE['variety'].tolist()

    DESCRIPTIONS_PLUS = []
    for i in range(shp):
        # concatenate features into a sentence to add to the description
        to_add = ("This Wine is from " + str(countries[i]) + " in " + str(provinces[i]) +
                  " and is a " + str(varieties[i]))
        DESCRIPTIONS_PLUS.append(descriptions[i] + " " + to_add)

    return DESCRIPTIONS_PLUS

# Build the augmented description for every row of WINES and attach the result
# as a new column. The row count is taken from WINES itself so it cannot drift
# from the frame being processed, and `assign` returns a new frame instead of
# writing a column into a frame that was derived by slicing WINE.
DP = augmented_corpus(WINES, len(WINES))
WINES = WINES.assign(PlusDescriptions=DP)

In [12]:
# Two candidate corpora: the raw descriptions, and the descriptions with the
# origin/variety sentence appended.
CORPUS1, CORPUS2 = WINES['description'], WINES['PlusDescriptions']

In [13]:
# NLTK's English stop words, extended with extra terms to exclude from the
# vocabulary.
STOP = stopwords.words('english') + [
    'notes', 'wine', 'drink', 'region', 'nan', 'nose',
    'like', 'made', 'shows', 'parts', 'style', 'followed', 'amounts', 'mouth',
]

In [14]:
# Instantiate the TF-IDF vectorizer. The token pattern only accepts runs of
# ASCII letters delimited by word boundaries, so tokens containing digits or
# underscores never become features.
TF_IDF = TfidfVectorizer(stop_words=STOP,token_pattern = r'(?u)\b[A-Za-z]+\b')

# fit_transform returns a sparse document-term matrix
Description_Matrix = TF_IDF.fit_transform(CORPUS2)

# Words used, in column order. `get_feature_names` was deprecated in
# scikit-learn 1.0 and removed in 1.2; `get_feature_names_out` replaces it and
# returns an ndarray, which is converted so `words` remains a list of str.
if hasattr(TF_IDF, 'get_feature_names_out'):
    words = TF_IDF.get_feature_names_out().tolist()
else:
    words = TF_IDF.get_feature_names()

Description_Matrix

<119955x29948 sparse matrix of type '<class 'numpy.float64'>'
	with 3129297 stored elements in Compressed Sparse Row format>

In [20]:
# Build a 10-topic CorEx model with a fixed seed and fit it on the
# document-term matrix, passing the vocabulary and the source documents along.
# NOTE(review): the matrix holds TF-IDF weights rather than binary/count
# values -- confirm this is the input format CorEx expects.
topic_model = ct.Corex(n_hidden=10, words=words, seed=89)
topic_model.fit(Description_Matrix, words=words, docs=CORPUS2)

# Print one line per topic: "<topic number>: <comma-separated words>"
topics = topic_model.get_topics()
for n, topic in enumerate(topics):
    # every topic entry unpacks into three fields; only the first is printed
    topic_words, _, _ = zip(*topic)
    print('{}: {}'.format(n, ','.join(topic_words)))

0: france,bordeaux,portugal,portuguese,fruits,burgundy,us,ready,wood,douro
1: italy,tuscany,alongside,piedmont,nebbiolo,sangiovese,veneto,sicily,sardinia,opens
2: spain,northern,argentina,mendoza,province,tempranillo,chile,feels,rubbery,herbal
3: romania,plavac,mali,croatia,dealu,mare,viile,timisului,romanian,feteasca
4: moselle,luxembourgeoise,luxembourg,brandi,giguiere,jl,gadd,coster,erdener,ehlen
5: hank,beckmeyer,kaw,insania,tate,musar,geoff,suma,jeune,gps
6: chumash,tribe,tara,gomez,descendent,sevtap,worker,quincy,quimera,quartz
7: brussel,sprouts,ab,pleasureable,pleasurably,plasters,planning,planing,pizzerias,pixy
8: abandoned,posted,posssibly,portet,portends,porer,popularized,politically,polarity,poivre
9: abbott,potention,portugieser,portland,population,populated,populate,popularizing,pound,pony


In [19]:
# Inspect the 10 top documents for topic 0 (in the topic listing printed above,
# the topic whose words begin with france, bordeaux, portugal). The displayed
# result is a list of (document text, score) tuples.
topic_model.get_top_docs(topic=0, n_docs=10)

[('The second wine of fourth-growth Beychevelle is named after a 17th century owner, French Admiral de la Valette, who insisted passing ships in the Gironde estuary salute his riverfront property. The wine is a proper second wine, firm while also showing the way it will develop relatively quickly. Juicy black currants and generous tannins are balanced with acidity and a dry core. This fine wine will be ready to drink from 2021. This Wine is from France in Bordeaux and is a Bordeaux-style Red Blend',
  -3.89910326248355e-13),
 ("Taking its name from the Chevaliers d'Arce who guarded this land in the 12th and 13th centuries, the small property is now under the watchful eye of Sylviane Garcin-Cathiard. This wine is still young and just balancing out. Firm tannins are shot through with ripe berry fruits and with crisp acidity. It is a wine that is will mature well, so drink from 2019 and consider this property a rising star. This Wine is from France in Bordeaux and is a Bordeaux-style Red 