# Topic Modeling of company Descriptions

In [14]:
import string
import nltk
import pickle
nltk.download('stopwords')
nltk.download('wordnet')  
nltk.download('omw-1.4')  
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\marya\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\marya\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\marya\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


### Tutorial

In [1]:
# Creating example documents
doc_1 = "A whopping 96.5 percent of water on Earth is in our oceans, covering 71 percent of the surface of our planet. And at any given time, about 0.001 percent is floating above us in the atmosphere. If all of that water fell as rain at once, the whole planet would get about 1 inch of rain."

doc_2 = "One-third of your life is spent sleeping. Sleeping 7-9 hours each night should help your body heal itself, activate the immune system, and give your heart a break. Beyond that--sleep experts are still trying to learn more about what happens once we fall asleep."

doc_3 = "A newborn baby is 78 percent water. Adults are 55-60 percent water. Water is involved in just about everything our body does."

doc_4 = "While still in high school, a student went 264.4 hours without sleep, for which he won first place in the 10th Annual Great San Diego Science Fair in 1964."

doc_5 = "We experience water in all three states: solid ice, liquid water, and gas water vapor."

# Create corpus
corpus = [doc_1, doc_2, doc_3, doc_4, doc_5]

In [4]:
# remove stopwords, punctuation, and normalize the corpus
stop = set(stopwords.words('english'))
exclude = set(string.punctuation)
lemma = WordNetLemmatizer()

def clean(doc):
    stop_free = " ".join([i for i in doc.lower().split() if i not in stop])
    punc_free = "".join(ch for ch in stop_free if ch not in exclude)
    normalized = " ".join(lemma.lemmatize(word) for word in punc_free.split())
    return normalized

clean_corpus = [clean(doc).split() for doc in corpus]

In [6]:
len(clean_corpus)

5

In [7]:
clean_corpus

[['whopping',
  '965',
  'percent',
  'water',
  'earth',
  'ocean',
  'covering',
  '71',
  'percent',
  'surface',
  'planet',
  'given',
  'time',
  '0001',
  'percent',
  'floating',
  'u',
  'atmosphere',
  'water',
  'fell',
  'rain',
  'once',
  'whole',
  'planet',
  'would',
  'get',
  '1',
  'inch',
  'rain'],
 ['onethird',
  'life',
  'spent',
  'sleeping',
  'sleeping',
  '79',
  'hour',
  'night',
  'help',
  'body',
  'heal',
  'itself',
  'activate',
  'immune',
  'system',
  'give',
  'heart',
  'break',
  'beyond',
  'thatsleep',
  'expert',
  'still',
  'trying',
  'learn',
  'happens',
  'fall',
  'asleep'],
 ['newborn',
  'baby',
  '78',
  'percent',
  'water',
  'adult',
  '5560',
  'percent',
  'water',
  'water',
  'involved',
  'everything',
  'body',
  'doe'],
 ['still',
  'high',
  'school',
  'student',
  'went',
  '2644',
  'hour',
  'without',
  'sleep',
  'first',
  'place',
  '10th',
  'annual',
  'great',
  'san',
  'diego',
  'science',
  'fair',
  '196

In [8]:
from gensim import corpora

In [9]:
# Creating document-term matrix 
dictionary = corpora.Dictionary(clean_corpus)
doc_term_matrix = [dictionary.doc2bow(doc) for doc in clean_corpus]

In [10]:
from gensim.models import LsiModel

In [11]:
# LSA model
lsa = LsiModel(doc_term_matrix, num_topics=3, id2word = dictionary)

# LSA model
print(lsa.print_topics(num_topics=3, num_words=3))

[(0, '0.555*"water" + 0.489*"percent" + 0.239*"planet"'), (1, '-0.361*"sleeping" + -0.215*"hour" + -0.215*"still"'), (2, '-0.562*"water" + 0.231*"planet" + 0.231*"rain"')]


In [12]:
from gensim.models import LdaModel

In [13]:
# LDA model
lda = LdaModel(doc_term_matrix, num_topics=3, id2word = dictionary)

# Results
print(lda.print_topics(num_topics=3, num_words=3))

[(0, '0.054*"water" + 0.051*"percent" + 0.025*"body"'), (1, '0.074*"water" + 0.026*"percent" + 0.021*"three"'), (2, '0.028*"sleeping" + 0.023*"still" + 0.023*"hour"')]


### Test on our Trustpilot Data

In [15]:
with open("pipes/pipe4", "rb") as fp:   # Unpickling
    pipe4 = pickle.load(fp)

### Concat

In [55]:
pipe5 = []
for company in pipe4:
    pipe5.append(" ".join([x for lst in company for x in lst]))

In [56]:
pipe5[0]

'le fourgon delivers stored drinks home order placed lefourgon.com beers juices sodas water milk wines soups spirits co. deliver home free charge chosen niche next visit collect empty bottles return washed producer reuse zerodechet'

In [57]:
pipe5[1]

'comptoir des vignes brand cellars specializing wines champagnes spirits specialty beers teas coffees delicatessens cellars differentiated original modern presentation products also basis advice adapted new trends consumption habits customers comptoir des vignes cellar offers clear warm setting allows discover wines simplicity indulgence highlighting pairings provision recipe cards regular events store organization tasting evenings 50 cellars france mission satisfy consumers whatever needs desires wide range products services good value money also thanks passion wine merchants'

### Lemmatize words

In [58]:
# lemma = WordNetLemmatizer()
# pipe6 = []
# for company in pipe5[:3]:
#     print(company)
#     print([lemma.lemmatize(word, pos='a') for word in company])
#     print([lemma.lemmatize(word) for word in company])

In [59]:
from nltk.corpus import wordnet

def get_wordnet_pos(word):
    """Map POS tag to first character lemmatize() accepts"""
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}

    return tag_dict.get(tag, wordnet.NOUN)

In [60]:
lemma = WordNetLemmatizer()
pipe6 = []
for company in pipe5:
    # lemm = nltk.pos_tag(nltk.word_tokenize(company))
    # print(lemm)
    pipe6.append([lemma.lemmatize(w, get_wordnet_pos(w)) for w in nltk.word_tokenize(company)])


In [51]:
pipe6[0]

['le',
 'fourgon',
 'delivers',
 'store',
 'drink',
 'home',
 'order',
 'place',
 'lefourgon.com',
 'beer',
 'juice',
 'soda',
 'water',
 'milk',
 'wine',
 'soup',
 'spirit',
 'co.',
 'deliver',
 'home',
 'free',
 'charge',
 'chosen',
 'niche',
 'next',
 'visit',
 'collect',
 'empty',
 'bottle',
 'return',
 'wash',
 'producer',
 'reuse',
 'zerodechet']