In [1]:
import numpy as np
import pandas as pd
import re, nltk, spacy, gensim

# Sklearn
from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from pprint import pprint

# Plotting tools
import pyLDAvis
import pyLDAvis.sklearn
import matplotlib.pyplot as plt
#https://www.machinelearningplus.com/nlp/topic-modeling-python-sklearn-examples/#9buildldamodelwithsklearn

In [2]:
df = pd.read_csv('glencorePR.csv')

In [3]:
glencore = df.drop(['_source.title', '_score', '_source.doctype', '_type', '_index', '_source.url', '_source.feedurl'], axis = 1)
glencore = glencore.rename(index=str, columns={"_source.title_rss": "title", "_source.teaser_rss": "content"})

In [23]:
data = glencore.content.values.tolist()

In [24]:
data = [re.sub('www.*?', '', str(i)) for i in data]

In [25]:
data = [re.sub('<.*?>', '', str(i)) for i in data]

In [26]:
data = [re.sub('\t.*?', '', str(i)) for i in data] 

In [35]:
len(data)

136

In [27]:
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords 
from nltk.stem.wordnet import WordNetLemmatizer
import string

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/xijiahu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/xijiahu/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [28]:
stop = set(stopwords.words('english'))
exclude = set(string.punctuation) 
lemma = WordNetLemmatizer()
def clean(data):
    stop_free = " ".join([str(i) for i in data.lower().split() if i not in stop])
    punc_free = ''.join(ch for ch in stop_free if ch not in exclude)
    normalized = " ".join(lemma.lemmatize(word) for word in punc_free.split())
    return normalized

doc_clean = [clean(doc).split() for doc in data]

In [29]:
doc_clean[0]

['glencore',
 'plc',
 'today',
 'announced',
 'board',
 'recommending',
 'aggregate',
 'distribution',
 'us020',
 'per',
 'share',
 'respect',
 '2017',
 'financial',
 'year',
 'ended',
 '31',
 'december',
 'distribution',
 'declared',
 'paid',
 'u',
 'dollar',
 'although',
 'shareholder',
 'jersey',
 'register',
 'able',
 'elect',
 'receive',
 'distribution',
 'payment',
 'pound',
 'sterling',
 'euro',
 'swiss',
 'franc',
 'shareholder',
 'hold',
 'share',
 'jersey',
 'register',
 'computershare',
 'hk',
 'nominee',
 'receive',
 'distribution',
 'payment',
 'hong',
 'kong',
 'dollar',
 'converted',
 'jersey',
 'applicable',
 'exchange',
 'rate',
 'reference',
 'date',
 'shareholder',
 'johannesburg',
 'register',
 'receive',
 'distribution',
 'south',
 'african',
 'rand',
 'subject',
 'shareholder',
 'approval',
 'distribution',
 'made',
 'capital',
 'contribution',
 'reserve',
 'company',
 'two',
 'equal',
 'tranche',
 '010',
 'payable',
 'first',
 'six',
 'month',
 '2018',
 'financia

In [30]:
vectorizer = CountVectorizer(analyzer='word',       
                             min_df=10,                        # minimum reqd occurences of a word 
                             token_pattern='[a-zA-Z0-9]{3,}',  # num chars > 3
                             # max_features=50000,             # max number of uniq words
                            )

data_vectorized = vectorizer.fit_transform(data)

In [31]:
data_dense = data_vectorized.todense()

In [32]:
print("Sparsicity: ", ((data_dense > 0).sum()/data_dense.size)*100, "%")

Sparsicity:  21.345113387820884 %


In [50]:
lda_model = LatentDirichletAllocation(n_components=10,               # Number of topics
                                      max_iter=10,               # Max learning iterations
                                      learning_method='online',   
                                      random_state=100,          # Random state
                                      batch_size=136,            # n docs in each learning iter
                                      evaluate_every = -1,       # compute perplexity every n iters, default: Don't
                                      n_jobs = -1,               # Use all available CPUs
                                     )
lda_output = lda_model.fit_transform(data_vectorized)

print(lda_model)  # Model attributes

LatentDirichletAllocation(batch_size=136, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7,
             learning_method='online', learning_offset=10.0,
             max_doc_update_iter=100, max_iter=10, mean_change_tol=0.001,
             n_components=10, n_jobs=-1, n_topics=None, perp_tol=0.1,
             random_state=100, topic_word_prior=None,
             total_samples=1000000.0, verbose=0)


In [51]:
# Log Likelyhood: Higher the better
print("Log Likelihood: ", lda_model.score(data_vectorized))

# Perplexity: Lower the better. Perplexity = exp(-1. * log-likelihood per word)
print("Perplexity: ", lda_model.perplexity(data_vectorized))

# See model parameters
pprint(lda_model.get_params())

Log Likelihood:  -171497.8567193696
Perplexity:  238.60081237745078
{'batch_size': 136,
 'doc_topic_prior': None,
 'evaluate_every': -1,
 'learning_decay': 0.7,
 'learning_method': 'online',
 'learning_offset': 10.0,
 'max_doc_update_iter': 100,
 'max_iter': 10,
 'mean_change_tol': 0.001,
 'n_components': 10,
 'n_jobs': -1,
 'n_topics': None,
 'perp_tol': 0.1,
 'random_state': 100,
 'topic_word_prior': None,
 'total_samples': 1000000.0,
 'verbose': 0}


In [52]:
# Create Document - Topic Matrix
lda_output = lda_model.transform(data_vectorized)

In [53]:
# column names
topicnames = ["Topic" + str(i) for i in range(lda_model.n_components)]

# index names
docnames = ["Doc" + str(i) for i in range(len(data))]

# Make the pandas dataframe
df_document_topic = pd.DataFrame(np.round(lda_output, 2), columns=topicnames, index=docnames)

# Get dominant topic for each document
dominant_topic = np.argmax(df_document_topic.values, axis=1)
df_document_topic['dominant_topic'] = dominant_topic

# Styling
def color_green(val):
    color = 'green' if val > .1 else 'black'
    return 'color: {col}'.format(col=color)

def make_bold(val):
    weight = 700 if val > .1 else 400
    return 'font-weight: {weight}'.format(weight=weight)

# Apply Style
df_document_topics = df_document_topic.head(15).style.applymap(color_green).applymap(make_bold)
df_document_topics

Unnamed: 0,Topic0,Topic1,Topic2,Topic3,Topic4,Topic5,Topic6,Topic7,Topic8,Topic9,dominant_topic
Doc0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,5
Doc1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,8
Doc2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,8
Doc3,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,5
Doc4,0.0,0.0,0.0,0.0,0.98,0.0,0.0,0.0,0.0,0.0,4
Doc5,0.99,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
Doc6,0.0,0.0,0.0,0.0,0.0,0.72,0.0,0.0,0.27,0.0,5
Doc7,0.0,0.0,0.0,0.0,0.07,0.07,0.0,0.0,0.86,0.0,8
Doc8,0.1,0.0,0.0,0.0,0.0,0.69,0.0,0.0,0.21,0.0,5
Doc9,0.0,0.99,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1


In [54]:
df_topic_distribution = df_document_topic['dominant_topic'].value_counts().reset_index(name="Num Documents")
df_topic_distribution.columns = ['Topic Num', 'Num Documents']
df_topic_distribution

Unnamed: 0,Topic Num,Num Documents
0,8,57
1,3,30
2,5,22
3,4,10
4,6,8
5,0,6
6,1,3


In [55]:
# Topic-Keyword Matrix
df_topic_keywords = pd.DataFrame(lda_model.components_)

# Assign Column and Index
df_topic_keywords.columns = vectorizer.get_feature_names()
df_topic_keywords.index = topicnames

# View
df_topic_keywords.head()

Unnamed: 0,000,019,027,0800,100,129,150,155,1646,1800,...,with,within,words,work,world,would,year,years,york,zinc
Topic0,0.573087,0.375187,0.357957,0.412262,0.426769,0.384114,0.596202,0.356188,0.348909,0.328019,...,15.40729,3.239852,0.566287,0.567322,1.004487,0.520492,1.567167,4.699661,0.350408,5.439121
Topic1,0.319146,0.312643,0.324758,0.415829,0.330305,0.312526,0.36635,0.331677,0.278651,0.303348,...,0.501187,0.32077,0.319521,0.299578,0.328124,0.387267,0.321138,0.332523,0.359364,0.30153
Topic2,0.283145,0.245818,0.274829,0.298486,0.322455,0.323899,0.369707,0.273467,0.311402,0.288445,...,0.460847,0.294818,0.293471,0.336221,0.348316,0.308167,0.42809,0.273617,0.277887,0.272203
Topic3,0.402272,0.28101,0.325069,0.30203,0.329461,0.312233,0.276366,0.292196,0.293453,0.264945,...,0.481153,0.32878,0.358147,0.281277,0.311073,0.29785,0.266702,0.340679,0.373343,0.326129
Topic4,0.876144,0.406437,0.358703,0.509115,0.505885,1.941225,0.715788,0.602916,0.404121,0.414237,...,5.742331,0.434972,0.526939,0.484807,0.761122,2.157783,2.625998,1.997331,0.350673,2.1167


In [56]:
# Show top n keywords for each topic
def show_topics(vectorizer=vectorizer, lda_model=lda_model, n_words=20):
    keywords = np.array(vectorizer.get_feature_names())
    topic_keywords = []
    for topic_weights in lda_model.components_:
        top_keyword_locs = (-topic_weights).argsort()[:n_words]
        topic_keywords.append(keywords.take(top_keyword_locs))
    return topic_keywords

topic_keywords = show_topics(vectorizer=vectorizer, lda_model=lda_model, n_words=15)        

# Topic - Keywords Dataframe
df_topic_keywords = pd.DataFrame(topic_keywords)
df_topic_keywords.columns = ['Word '+str(i) for i in range(df_topic_keywords.shape[1])]
df_topic_keywords.index = ['Topic '+str(i) for i in range(df_topic_keywords.shape[0])]
df_topic_keywords

Unnamed: 0,Word 0,Word 1,Word 2,Word 3,Word 4,Word 5,Word 6,Word 7,Word 8,Word 9,Word 10,Word 11,Word 12,Word 13,Word 14
Topic 0,and,our,the,billion,ldquo,for,glencore,metals,with,adjusted,marketing,net,portfolio,rdquo,industrial
Topic 1,nbsp,glencore,709,the,com,kong,hong,charles,martin,company,watenphul,fewings,further,plc,shares
Topic 2,glencore,nbsp,and,the,for,are,rsquo,coal,com,regulatory,will,rdquo,than,that,ldquo
Topic 3,,and,the,glencore,are,for,will,nbsp,any,rsquo,companies,quot,used,ldquo,limited
Topic 4,the,nbsp,glencore,and,that,has,will,made,production,com,non,board,709,executive,further
Topic 5,the,and,glencore,are,nbsp,for,will,distribution,phone,free,local,date,kong,hong,companies
Topic 6,the,000,offer,purchase,ldquo,rdquo,for,amount,notes,december,2016,has,company,tender,and
Topic 7,quot,nbsp,the,and,glencore,are,will,hong,any,free,kong,com,rdquo,south,local
Topic 8,the,and,glencore,nbsp,for,are,rsquo,with,rdquo,ldquo,com,production,companies,will,quot
Topic 9,portfolio,have,279,likewise,logistics,right,footprint,subject,3320,collectively,well,547,129,six,regulatory


In [57]:
from sklearn.cluster import KMeans
clusters = KMeans(n_clusters=15, random_state=100).fit_predict(lda_output)

# Build the Singular Value Decomposition(SVD) model
svd_model = TruncatedSVD(n_components=2)  # 2 components
lda_output_svd = svd_model.fit_transform(lda_output)

# X and Y axes of the plot using SVD decomposition
x = lda_output_svd[:, 0]
y = lda_output_svd[:, 1]

# Weights for the 15 columns of lda_output, for each component
print("Component's weights: \n", np.round(svd_model.components_, 2))

# Percentage of total information in 'lda_output' explained by the two components
print("Perc of Variance Explained: \n", np.round(svd_model.explained_variance_ratio_, 2))

Component's weights: 
 [[ 0.03  0.    0.    0.02  0.01  0.05  0.02  0.    1.    0.  ]
 [ 0.04  0.01  0.01  0.1   0.03  0.99  0.02  0.01 -0.05  0.01]]
Perc of Variance Explained: 
 [0.36 0.2 ]
