In [1]:
from pymongo import MongoClient
from pprint import pprint
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation

In [36]:
import re
from nltk.stem.porter import PorterStemmer

<h3>MongoDB Connection</h3>

In [2]:
def getDBConnection(host="localhost:27017", db_name="articles"):
    """Return a handle to a MongoDB database.

    Parameters
    ----------
    host : str, optional
        Host string (``"host:port"``) or MongoDB URI handed to ``MongoClient``.
        Defaults to the local server this notebook was written against.
    db_name : str, optional
        Name of the database to select on the client. Defaults to
        ``"articles"``.

    Returns
    -------
    The database object obtained by subscripting the client with ``db_name``.
    """
    client = MongoClient(host)
    # Subscript access selects the same database as attribute access
    # (client.articles) but works for any database name.
    return client[db_name]

In [3]:
# Open the database, select its "Articles" collection and fetch a single
# document from it to inspect the schema.
db = getDBConnection()
articles = db["Articles"]
row = articles.find_one()

<h3>Print the first record</h3>

In [4]:
pprint(row)

{'_id': ObjectId('5c2a2e6b986d090428970616'),
 'author': 'newsfeedback@fool.com (Sean Williams)',
 'content': 'The marijuana industry has had an absolutely game-changing year '
            'in 2018. Canada became the first industrialized country in the '
            'world to green-light recreational marijuana, and a handful of '
            'U.S. states legalized cannabis in some capacity. In other words, '
            'the p… [+8176 chars]',
 'description': 'These pot stocks are likely to create all the buzz in the new '
                'year.',
 'publishedAt': datetime.datetime(2018, 12, 31, 13, 21),
 'source_id': 'the-motely-fool',
 'summarization': 'Although the legal cannabis market is still exceptionally '
                  'young and unproven, here - in no particular order - are 12 '
                  'pot-growing stocks you should be watching in 2019. The '
                  'company currently has 4.3 million square feet of licensed '
                  'production capacity, an

In [5]:
# Cursor.count() is deprecated (and removed in PyMongo 4); ask the collection
# directly for the number of documents matching an empty filter instead.
articles.count_documents({})

  """Entry point for launching an IPython kernel.


106904

<h3>Extract the summarization field from the Articles collection</h3>

In [69]:
# Only the "summarization" field is used below, so project it out instead of
# transferring every full article document from the server.
myresult = articles.find({}, projection={"summarization": 1}).limit(10000)
dataset=[]
for x in myresult:
    try:
        dataset.append(x["summarization"])
    except KeyError as e:
        # A document without a summary is reported and skipped; any other
        # error is unexpected here and is allowed to surface.
        print(e)

<h3>Dataset Ready</h3>

In [70]:
# NOTE(review): no_features is not referenced by any cell visible in this
# notebook — confirm whether it was meant to cap the vectorizer vocabulary.
no_features=5000
# Number of topics the LDA model is fitted with (passed as n_components).
no_topics = 3
# Number of top-weighted words display_topics prints for each topic.
no_top_words = 50

In [71]:
dataset[0]

"Although the legal cannabis market is still exceptionally young and unproven, here - in no particular order - are 12 pot-growing stocks you should be watching in 2019. The company currently has 4.3 million square feet of licensed production capacity, and anticipates having all 5.6 million square feet licensed by the end of 2019. Expect 2019 to be the year that Aurora finds a beverage, tobacco, or pharmaceutical partner, and don't be surprised if the company's acquisition binge continues. What we do know is that if management meets its production guidance, Aphria will slot in as the third-largest grower by annual yield at 255,000 kilograms. Tilray has close to 3 million square feet in growing capacity that it could develop, which makes it a wild card in terms of peak production. The company anticipates completing the final of three stages of its phase 4 expansion at the Moncton, New Brunswick, facility by October 2019. Despite only 342,000 square feet of growing space, Supreme Cannabis

<h1>Create the Document-Word matrix</h1>

In [100]:
# Build the TF-IDF document-term matrix from the summaries.
# Tokens are runs of three or more ASCII letters/digits; English stop words
# are removed, and the max_df / min_df thresholds prune terms that are too
# common or too rare across documents.
tfidf_vectorizer = TfidfVectorizer(
    analyzer='word',
    token_pattern='[a-zA-Z0-9]{3,}',
    lowercase=True,
    strip_accents='ascii',
    stop_words='english',
    max_df=0.95,
    min_df=2,
)
tfidf = tfidf_vectorizer.fit_transform(dataset)

In [101]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer its replacement when available and fall back on older releases.
# The result is kept as a plain list either way.
if hasattr(tfidf_vectorizer, "get_feature_names_out"):
    feature_names = list(tfidf_vectorizer.get_feature_names_out())
else:
    feature_names = tfidf_vectorizer.get_feature_names()

In [102]:
feature_names

['000',
 '000kg',
 '001',
 '012',
 '018',
 '025',
 '030',
 '035',
 '036',
 '038',
 '040',
 '042',
 '044',
 '045',
 '048',
 '050',
 '055',
 '059',
 '060',
 '070',
 '072',
 '079',
 '083',
 '095',
 '100',
 '1000',
 '10000',
 '100kg',
 '100ml',
 '101',
 '1011',
 '1015',
 '102',
 '103',
 '104',
 '105',
 '1057',
 '106',
 '107',
 '108',
 '109',
 '1092',
 '10am',
 '10mg',
 '10ml',
 '10pm',
 '10th',
 '110',
 '1100',
 '111',
 '112',
 '1127',
 '113',
 '1130',
 '11357',
 '11361',
 '1137',
 '114',
 '115',
 '115th',
 '116',
 '117',
 '118',
 '119',
 '11am',
 '11pm',
 '11th',
 '120',
 '1200',
 '121',
 '1214',
 '122',
 '1227',
 '123',
 '124',
 '1244',
 '125',
 '1251',
 '1258',
 '126',
 '1263',
 '127',
 '128',
 '1285',
 '1286',
 '129',
 '1294',
 '12pm',
 '12th',
 '130',
 '1302',
 '131',
 '1313',
 '132',
 '133',
 '1331',
 '135',
 '1351',
 '136',
 '137',
 '138',
 '139',
 '1392',
 '13th',
 '140',
 '1400',
 '1409',
 '141',
 '142',
 '1420',
 '14272',
 '143',
 '144',
 '145',
 '1459',
 '146',
 '1460',
 '147',


In [103]:
def display_topics(model, feature_names, no_top_words):
    """Print the highest-weighted terms of every topic in ``model``.

    Parameters
    ----------
    model
        Object exposing ``components_``, an iterable of per-topic weight
        arrays (each must support ``argsort``).
    feature_names
        Sequence mapping a column index to its term.
    no_top_words : int
        How many terms to print per topic.

    For each topic a ``Topic <n>:`` header line is printed, followed by one
    line with the selected terms joined by single spaces, heaviest first.
    """
    for topic_number, weights in enumerate(model.components_):
        print("Topic %d:" % (topic_number))
        # argsort is ascending, so the reversed tail slice walks the
        # indices from the largest weight downwards.
        heaviest_first = weights.argsort()[:-no_top_words - 1:-1]
        top_terms = [feature_names[position] for position in heaviest_first]
        print(" ".join(top_terms))

<h1>Build LDA model with sklearn</h1>

In [104]:
# Fit LDA on the document-term matrix using the online learning method and
# a fixed random_state.
lda = LatentDirichletAllocation(
    n_components=no_topics,
    learning_method='online',
    learning_offset=50.,
    max_iter=5,
    random_state=0,
).fit(tfidf)

In [106]:
display_topics(lda, feature_names, no_top_words)

Topic 0:
cannabis marijuana medical said state hemp city use new patients states people federal law business legal recreational industry drug california health dispensaries products government cbd legalization canada company year businesses market like research police public dispensary laws county time sales license tax program says department percent pot oregon oil medicinal
Topic 1:
les mydx que hausman quatre vins yazbeck bedard analyzer pour avec oprah est ete enrico qui aerodx spannabis heins handheld une loi vthc jaremowich leur sin este sur embargo fin sensors cynthea flavours surete toujours aerien controle canadienne sujet maintenant creme addison par reneged futura middlebury bouchard hay psoe sparkling
Topic 2:
420 news intel industry marijuana outlet technological impact developments cover world date information stories advances cannabis pertinent constantly rallies evolving reliable abreast inbox signing ensuring delivered globe kept legalization changing directly day dail

In [107]:
import pandas as pd
import numpy as np

<h1>Show Dominant topic of each document</h1>

In [108]:
best_lda_model=lda
data_vectorized=tfidf
# Create Document — Topic Matrix (one row per document, one column per topic)
lda_output = best_lda_model.transform(data_vectorized)
# column names
topicnames = ["Topic" + str(i) for i in range(best_lda_model.n_components)]
# index names
docnames = ["Doc" + str(i) for i in range(len(dataset))]
# Make the pandas dataframe; weights are rounded to 2 decimals for display only
df_document_topic = pd.DataFrame(np.round(lda_output, 2), columns=topicnames, index=docnames)
# Get dominant topic for each document from the UNROUNDED weights: rounding
# first can turn near-ties into exact ties, which argmax then resolves in
# favour of the lowest topic index.
dominant_topic = np.argmax(lda_output, axis=1)
df_document_topic['dominant_topic'] = dominant_topic

<h3>List the first 10 documents whose dominant topic is 2</h3>

In [111]:
df_document_topic[df_document_topic["dominant_topic"]==2].head(10)

Unnamed: 0,Topic0,Topic1,Topic2,dominant_topic
Doc106,0.07,0.05,0.89,2
Doc134,0.06,0.05,0.89,2
Doc191,0.06,0.05,0.89,2
Doc203,0.07,0.05,0.88,2
Doc214,0.06,0.05,0.89,2
Doc366,0.07,0.05,0.88,2
Doc378,0.07,0.05,0.88,2
Doc598,0.07,0.05,0.89,2
Doc608,0.07,0.05,0.89,2
Doc622,0.06,0.05,0.89,2


<h3>List the first 10 documents whose dominant topic is 1</h3>

In [114]:
df_document_topic[df_document_topic["dominant_topic"]==1].head(10)

Unnamed: 0,Topic0,Topic1,Topic2,dominant_topic
Doc7619,0.44,0.49,0.06,1
Doc7671,0.18,0.76,0.06,1
Doc7681,0.21,0.73,0.06,1
Doc9931,0.26,0.66,0.08,1


<h3>List the first 10 documents whose dominant topic is 0</h3>

In [115]:
df_document_topic[df_document_topic["dominant_topic"]==0].head(10)

Unnamed: 0,Topic0,Topic1,Topic2,dominant_topic
Doc0,0.92,0.04,0.04,0
Doc1,0.94,0.03,0.03,0
Doc2,0.92,0.04,0.04,0
Doc3,0.33,0.33,0.33,0
Doc4,0.93,0.03,0.03,0
Doc5,0.86,0.07,0.07,0
Doc6,0.94,0.03,0.03,0
Doc7,0.93,0.03,0.03,0
Doc8,0.92,0.04,0.04,0
Doc9,0.91,0.05,0.05,0


In [116]:
# Show top n keywords for each topic
def show_topics(vectorizer, lda_model, n_words=10):
    """Return the ``n_words`` highest-weighted terms of every topic.

    Parameters
    ----------
    vectorizer
        Fitted vectorizer exposing ``get_feature_names_out()`` (current
        scikit-learn) or ``get_feature_names()`` (older releases).
    lda_model
        Object exposing ``components_``, an iterable of per-topic weight
        arrays.
    n_words : int, optional
        Number of terms to keep per topic (default 10).

    Returns
    -------
    list of numpy.ndarray
        One array of terms per topic, ordered from highest to lowest weight.
    """
    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in
    # 1.2; prefer the replacement and fall back only when it is missing.
    if hasattr(vectorizer, "get_feature_names_out"):
        keywords = np.asarray(vectorizer.get_feature_names_out())
    else:
        keywords = np.array(vectorizer.get_feature_names())
    topic_keywords = []
    for topic_weights in lda_model.components_:
        # Negating the weights makes the ascending argsort descending.
        top_keyword_locs = (-topic_weights).argsort()[:n_words]
        topic_keywords.append(keywords.take(top_keyword_locs))
    return topic_keywords

<h3>Top 15 words of each topic</h3>

In [117]:
# Tabulate the 15 strongest words per topic: one row per topic, one column
# per rank position.
topic_keywords = show_topics(tfidf_vectorizer, lda, 15)
df_topic_keywords = pd.DataFrame(topic_keywords)
df_topic_keywords.columns = [
    'Word ' + str(rank) for rank in range(len(df_topic_keywords.columns))
]
df_topic_keywords.index = [
    'Topic ' + str(topic) for topic in range(len(df_topic_keywords.index))
]
df_topic_keywords

Unnamed: 0,Word 0,Word 1,Word 2,Word 3,Word 4,Word 5,Word 6,Word 7,Word 8,Word 9,Word 10,Word 11,Word 12,Word 13,Word 14
Topic 0,cannabis,marijuana,medical,said,state,hemp,city,use,new,patients,states,people,federal,law,business
Topic 1,les,mydx,que,hausman,quatre,vins,yazbeck,bedard,analyzer,pour,avec,oprah,est,ete,enrico
Topic 2,420,news,intel,industry,marijuana,outlet,technological,impact,developments,cover,world,date,information,stories,advances


<h3>Assign a label to each topic based on its top words</h3>

In [118]:
# Hand-picked label for each topic, listed in row order of df_topic_keywords
# (Topic 0, Topic 1, Topic 2).
Topics = ["Drug Use","Research","Technological Developments"]
# The list is assigned positionally, so it must contain exactly one label per
# row of df_topic_keywords.
df_topic_keywords["Category"]=Topics
df_topic_keywords

Unnamed: 0,Word 0,Word 1,Word 2,Word 3,Word 4,Word 5,Word 6,Word 7,Word 8,Word 9,Word 10,Word 11,Word 12,Word 13,Word 14,Category
Topic 0,cannabis,marijuana,medical,said,state,hemp,city,use,new,patients,states,people,federal,law,business,Drug Use
Topic 1,les,mydx,que,hausman,quatre,vins,yazbeck,bedard,analyzer,pour,avec,oprah,est,ete,enrico,Research
Topic 2,420,news,intel,industry,marijuana,outlet,technological,impact,developments,cover,world,date,information,stories,advances,Technological Developments
