# Topic Modeling of company Descriptions

In [1]:
import string
import nltk
import pickle
nltk.download('stopwords')
nltk.download('wordnet')  
nltk.download('omw-1.4')  
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\marya\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\marya\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\marya\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


### Tutorial

In [3]:
# Creating example documents
doc_1 = "A whopping 96.5 percent of water on Earth is in our oceans, covering 71 percent of the surface of our planet. And at any given time, about 0.001 percent is floating above us in the atmosphere. If all of that water fell as rain at once, the whole planet would get about 1 inch of rain."

doc_2 = "One-third of your life is spent sleeping. Sleeping 7-9 hours each night should help your body heal itself, activate the immune system, and give your heart a break. Beyond that--sleep experts are still trying to learn more about what happens once we fall asleep."

doc_3 = "A newborn baby is 78 percent water. Adults are 55-60 percent water. Water is involved in just about everything our body does."

doc_4 = "While still in high school, a student went 264.4 hours without sleep, for which he won first place in the 10th Annual Great San Diego Science Fair in 1964."

doc_5 = "We experience water in all three states: solid ice, liquid water, and gas water vapor."

# Create corpus
corpus = [doc_1, doc_2, doc_3, doc_4, doc_5]

In [4]:
# remove stopwords, punctuation, and normalize the corpus
stop = set(stopwords.words('english'))
exclude = set(string.punctuation)
lemma = WordNetLemmatizer()

def clean(doc):
    stop_free = " ".join([i for i in doc.lower().split() if i not in stop])
    punc_free = "".join(ch for ch in stop_free if ch not in exclude)
    normalized = " ".join(lemma.lemmatize(word) for word in punc_free.split())
    return normalized

clean_corpus = [clean(doc).split() for doc in corpus]

In [5]:
len(clean_corpus)

5

In [6]:
clean_corpus

[['whopping',
  '965',
  'percent',
  'water',
  'earth',
  'ocean',
  'covering',
  '71',
  'percent',
  'surface',
  'planet',
  'given',
  'time',
  '0001',
  'percent',
  'floating',
  'u',
  'atmosphere',
  'water',
  'fell',
  'rain',
  'once',
  'whole',
  'planet',
  'would',
  'get',
  '1',
  'inch',
  'rain'],
 ['onethird',
  'life',
  'spent',
  'sleeping',
  'sleeping',
  '79',
  'hour',
  'night',
  'help',
  'body',
  'heal',
  'itself',
  'activate',
  'immune',
  'system',
  'give',
  'heart',
  'break',
  'beyond',
  'thatsleep',
  'expert',
  'still',
  'trying',
  'learn',
  'happens',
  'fall',
  'asleep'],
 ['newborn',
  'baby',
  '78',
  'percent',
  'water',
  'adult',
  '5560',
  'percent',
  'water',
  'water',
  'involved',
  'everything',
  'body',
  'doe'],
 ['still',
  'high',
  'school',
  'student',
  'went',
  '2644',
  'hour',
  'without',
  'sleep',
  'first',
  'place',
  '10th',
  'annual',
  'great',
  'san',
  'diego',
  'science',
  'fair',
  '196

In [7]:
from gensim import corpora

In [8]:
# Creating document-term matrix 
dictionary = corpora.Dictionary(clean_corpus)
doc_term_matrix = [dictionary.doc2bow(doc) for doc in clean_corpus]

In [9]:
from gensim.models import LsiModel

In [10]:
# LSA model
lsa = LsiModel(doc_term_matrix, num_topics=3, id2word = dictionary)

# LSA model
print(lsa.print_topics(num_topics=3, num_words=3))

[(0, '0.555*"water" + 0.489*"percent" + 0.239*"rain"'), (1, '0.361*"sleeping" + 0.215*"hour" + 0.215*"still"'), (2, '-0.562*"water" + 0.231*"planet" + 0.231*"rain"')]


In [11]:
from gensim.models import LdaModel

In [12]:
# LDA model
lda = LdaModel(doc_term_matrix, num_topics=3, id2word = dictionary)

# Results
print(lda.print_topics(num_topics=3, num_words=3))

[(0, '0.103*"water" + 0.047*"percent" + 0.024*"three"'), (1, '0.029*"still" + 0.028*"hour" + 0.026*"sleeping"'), (2, '0.040*"percent" + 0.033*"planet" + 0.031*"rain"')]


#### Topic Modeling

In [13]:
from gensim import corpora

In [14]:
with open('pipes/topic_modeling_pipe', 'rb') as fp:
    pipe7 = pickle.load(fp)

In [21]:
pipe7

[['fourgon',
  'delivers',
  'store',
  'drink',
  'home',
  'order',
  'place',
  'lefourgon.com',
  'beer',
  'juice',
  'soda',
  'water',
  'milk',
  'wine',
  'soup',
  'spirit',
  'deliver',
  'home',
  'free',
  'charge',
  'chosen',
  'niche',
  'next',
  'visit',
  'collect',
  'empty',
  'bottle',
  'return',
  'wash',
  'producer',
  'reuse',
  'zerodechet'],
 ['comptoir',
  'vignes',
  'brand',
  'cellar',
  'specialize',
  'wine',
  'champagne',
  'spirit',
  'specialty',
  'beer',
  'coffee',
  'delicatessen',
  'cellar',
  'differentiate',
  'original',
  'modern',
  'presentation',
  'product',
  'also',
  'basis',
  'advice',
  'adapt',
  'trend',
  'consumption',
  'habit',
  'customer',
  'comptoir',
  'vignes',
  'cellar',
  'offer',
  'clear',
  'warm',
  'allows',
  'discover',
  'wine',
  'simplicity',
  'indulgence',
  'highlight',
  'pairing',
  'provision',
  'recipe',
  'card',
  'regular',
  'event',
  'store',
  'organization',
  'taste',
  'evening',
  'ce

Save json for Streamlit

In [26]:
import json

In [29]:
with open("data/pipe7.json", "w") as outfile:
    json.dump(pipe7, outfile)

In [15]:
dictionary = corpora.Dictionary(pipe7)
doc_term_matrix = [dictionary.doc2bow(doc) for doc in pipe7]

In [16]:
doc_term_matrix

[[(0, 1),
  (1, 1),
  (2, 1),
  (3, 1),
  (4, 1),
  (5, 1),
  (6, 1),
  (7, 1),
  (8, 1),
  (9, 1),
  (10, 1),
  (11, 2),
  (12, 1),
  (13, 1),
  (14, 1),
  (15, 1),
  (16, 1),
  (17, 1),
  (18, 1),
  (19, 1),
  (20, 1),
  (21, 1),
  (22, 1),
  (23, 1),
  (24, 1),
  (25, 1),
  (26, 1),
  (27, 1),
  (28, 1),
  (29, 1),
  (30, 1)],
 [(0, 1),
  (24, 1),
  (25, 1),
  (29, 3),
  (31, 1),
  (32, 1),
  (33, 1),
  (34, 2),
  (35, 1),
  (36, 1),
  (37, 1),
  (38, 4),
  (39, 1),
  (40, 1),
  (41, 1),
  (42, 2),
  (43, 1),
  (44, 1),
  (45, 1),
  (46, 1),
  (47, 1),
  (48, 1),
  (49, 1),
  (50, 1),
  (51, 1),
  (52, 1),
  (53, 1),
  (54, 1),
  (55, 1),
  (56, 1),
  (57, 1),
  (58, 1),
  (59, 1),
  (60, 1),
  (61, 1),
  (62, 1),
  (63, 1),
  (64, 1),
  (65, 1),
  (66, 1),
  (67, 1),
  (68, 2),
  (69, 1),
  (70, 1),
  (71, 1),
  (72, 1),
  (73, 1),
  (74, 1),
  (75, 1),
  (76, 1),
  (77, 1),
  (78, 1),
  (79, 1),
  (80, 1),
  (81, 1),
  (82, 2),
  (83, 1),
  (84, 1),
  (85, 1)],
 [(17, 1),
  (25, 2

#### LSA

In [17]:
from gensim.models import LsiModel

In [18]:
# LSA model
lsa = LsiModel(doc_term_matrix, num_topics=10, id2word = dictionary)

# LSA model
lsa.print_topics(num_topics=10, num_words=5)

[(0,
  '0.346*"offer" + 0.294*"product" + 0.229*"service" + 0.194*"customer" + 0.189*"make"'),
 (1,
  '-0.594*"product" + 0.234*"service" + 0.204*"real" + 0.185*"training" + 0.184*"support"'),
 (2,
  '-0.489*"service" + -0.358*"customer" + 0.292*"make" + 0.281*"training" + -0.127*"offer"'),
 (3,
  '0.377*"offer" + -0.351*"product" + 0.253*"online" + -0.249*"customer" + 0.222*"price"'),
 (4,
  '-0.480*"product" + -0.397*"real" + -0.372*"estate" + 0.262*"make" + 0.156*"jewelry"'),
 (5,
  '-0.703*"offer" + 0.233*"online" + 0.208*"france" + 0.195*"sale" + 0.192*"store"'),
 (6,
  '0.502*"training" + 0.341*"online" + -0.285*"real" + -0.281*"estate" + 0.240*"professional"'),
 (7,
  '-0.348*"france" + 0.305*"customer" + -0.296*"yacht" + -0.233*"french" + -0.184*"cofrance"'),
 (8,
  '0.359*"service" + -0.310*"customer" + 0.263*"renault" + 0.246*"quality" + 0.196*"vehicle"'),
 (9,
  '-0.475*"wine" + -0.292*"best" + 0.203*"brand" + 0.175*"jewelry" + 0.171*"design"')]

#### LDA 

In [19]:
from gensim.models import LdaModel

In [30]:
# LDA model
lda = LdaModel(doc_term_matrix, num_topics=10, id2word = dictionary)

# Results
lda.print_topics(num_topics=10, num_words=5)

[(0,
  '0.015*"product" + 0.014*"make" + 0.013*"baby" + 0.012*"natural" + 0.011*"bodywork"'),
 (1,
  '0.015*"offer" + 0.010*"product" + 0.009*"renault" + 0.009*"make" + 0.008*"customer"'),
 (2,
  '0.018*"service" + 0.013*"travel" + 0.012*"vehicle" + 0.012*"offer" + 0.011*"trip"'),
 (3,
  '0.013*"make" + 0.007*"company" + 0.007*"station" + 0.007*"personalize" + 0.006*"free"'),
 (4,
  '0.018*"service" + 0.015*"offer" + 0.010*"agency" + 0.009*"company" + 0.008*"business"'),
 (5,
  '0.037*"bike" + 0.028*"electric" + 0.027*"rental" + 0.016*"vehicle" + 0.015*"france"'),
 (6,
  '0.014*"print" + 0.011*"host" + 0.006*"create" + 0.006*"paper" + 0.006*"online"'),
 (7,
  '0.020*"equipment" + 0.011*"part" + 0.009*"offer" + 0.009*"hike" + 0.007*"online"'),
 (8,
  '0.014*"quality" + 0.013*"product" + 0.012*"offer" + 0.012*"make" + 0.010*"design"'),
 (9,
  '0.025*"brand" + 0.022*"online" + 0.021*"store" + 0.020*"jewelry" + 0.018*"accessory"')]

#### Visualize Topics with LDAvis

In [31]:
import pyLDAvis
import pyLDAvis.gensim
pyLDAvis.enable_notebook()

In [33]:
vis = pyLDAvis.gensim.prepare(lda, doc_term_matrix, dictionary)
vis

In [None]:
pyLDAvis.save_html(prepared_model_data, 'lda.html')