# ABC News Dataset

In [1]:
import pandas as pd
from nltk.tokenize import RegexpTokenizer
from nltk.stem import SnowballStemmer

from nltk.corpus import stopwords

from time import time

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.datasets import fetch_20newsgroups

import scipy as sp
import numpy as np
import pickle
%matplotlib inline

In [2]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/benjamin/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [30]:
!head abcnews-date-text.csv

publish_date,headline_text
20030219,aba decides against community broadcasting licence
20030219,act fire witnesses must be aware of defamation
20030219,a g calls for infrastructure protection summit
20030219,air nz staff in aust strike for pay rise
20030219,air nz strike to affect australian travellers
20030219,ambitious olsson wins triple jump
20030219,antic delighted with record breaking barca
20030219,aussie qualifier stosur wastes four memphis match
20030219,aust addresses un security council over iraq


In [3]:
df = pd.read_csv('abcnews-date-text.csv')

In [4]:
n_samples = len(df)
n_samples

1103665

In [5]:
df.head()

Unnamed: 0,publish_date,headline_text
0,20030219,aba decides against community broadcasting lic...
1,20030219,act fire witnesses must be aware of defamation
2,20030219,a g calls for infrastructure protection summit
3,20030219,air nz staff in aust strike for pay rise
4,20030219,air nz strike to affect australian travellers


# Cleaning - Preprocessing 

In [6]:
def remove_stopwords(sentence, stopwords):
    return [x for x in sentence if x not in stopwords]

def stem_words(tokens, stemmer):
    return [stemmer.stem(x) for x in tokens]

In [7]:
# String to lower
df["USE_OF_PROCEEDS"] = df["headline_text"].str.lower()

In [8]:
# Tokenize - this adds the square brackets
tokenizer = RegexpTokenizer(r'\w+')
df["USE_OF_PROCEEDS"] = df["USE_OF_PROCEEDS"].apply(tokenizer.tokenize)


In [9]:
#Remove stopwords
df["USE_OF_PROCEEDS"] = df["USE_OF_PROCEEDS"].apply(remove_stopwords, args=[stopwords.words('english')])

In [10]:
# Stem words --- Identify roots to supress multiplicity -- e.g. inversion, invertir..
# common NLP technique to reduce topically similar words to their root.
stemmer = SnowballStemmer('english')
df["USE_OF_PROCEEDS"] = df["USE_OF_PROCEEDS"].apply(stem_words, args=[stemmer])

In [11]:
df.head()

Unnamed: 0,publish_date,headline_text,USE_OF_PROCEEDS
0,20030219,aba decides against community broadcasting lic...,"[aba, decid, communiti, broadcast, licenc]"
1,20030219,act fire witnesses must be aware of defamation,"[act, fire, wit, must, awar, defam]"
2,20030219,a g calls for infrastructure protection summit,"[g, call, infrastructur, protect, summit]"
3,20030219,air nz staff in aust strike for pay rise,"[air, nz, staff, aust, strike, pay, rise]"
4,20030219,air nz strike to affect australian travellers,"[air, nz, strike, affect, australian, travel]"


In [12]:
#We just need this colum
training_docset = df['USE_OF_PROCEEDS']

In [13]:
#Flatten in one single string all words i.e. training_docset panda series has to
#conform to the datastructure of e.g. the fetch_20newsgroups dataset in sklearn
data_samples = [item for sublist in training_docset.tolist() for item in sublist]
type(data_samples)
data_samples[0]

'aba'

# LDA/HMF Model and document-term matrix Construction (Sklearn only)

We are goinbg to use sklearn only, its lighter. (e.g. AWS lambda applications)

In [14]:
n_features = 1000 
n_samples 

1103665

Extracting tf features for LDA. Remove words ocurring in only one doc or in 95% of the total

In [15]:
#When building the vocabulary ignore terms that have a document frequency strictly 
#higher than the given threshold (corpus-specific stop words)

tf_vectorizer = CountVectorizer(max_features=n_features,max_df=0.95, min_df=2) 

Beware: the standard definition of the tf's and the sklearn implementation differe, see https://scikit-learn.org/stable/modules/feature_extraction.html

In [16]:
tf= tf_vectorizer.fit_transform(data_samples)
tf.shape

(5864176, 1000)

In [17]:
# Use tf-idf features for NMF.
tfidf_vectorizer = TfidfVectorizer(max_features=n_features,max_df=0.95, min_df=2)


In [18]:
tfidf= tfidf_vectorizer.fit_transform(data_samples)
tfidf.shape

(5864176, 1000)

In [19]:
def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        message = "Topic #%d: " % topic_idx
        message += " ".join([feature_names[i]
                             for i in topic.argsort()[:-n_top_words - 1:-1]])
        print(message)
    print()

### LDA: Attention: iterations or pases 

In [20]:
n_components = 5  # Number of Topics
n_top_words = 6
model=1

if model==1:
   print("Fitting LDA models with tf features, n_samples=%d and n_features=%d..."  % (n_samples, n_features))
   lda = LatentDirichletAllocation(n_components=n_components, max_iter=2,
                                learning_method='online',
                                learning_offset=50.,
                                random_state=0)
   t0 = time()
   lda.fit(tf)
   print("done in %0.3fs." % (time() - t0))
   print("\nTopics in LDA model:")
   tf_feature_names = tf_vectorizer.get_feature_names()
   print_top_words(lda, tf_feature_names, n_top_words)

elif model==2:

  # Fit the NMF model
  print("Fitting the NMF model (Frobenius norm) with tf-idf features, n_samples=%d and n_features=%d..."
      % (n_samples, n_features))
  t0 = time()
  nmf = NMF(n_components=n_components, random_state=1,
          alpha=.1, l1_ratio=.5).fit(tfidf)
  print("donee in %0.3fs." % (time() - t0))

  print("\nTopics in NMF model (Frobenius norm):")
  tfidf_feature_names = tfidf_vectorizer.get_feature_names()
  print_top_words(nmf, tfidf_feature_names, n_top_words)
    

Fitting LDA models with tf features, n_samples=1103665 and n_features=1000...
done in 793.959s.

Topics in LDA model:
Topic #0: australian trump melbourn world sa adelaid
Topic #1: us govern win day attack kill
Topic #2: australia new polic wa court call
Topic #3: say year nsw hous perth canberra
Topic #4: man sydney charg elect queensland death



Perplexity!

New South Wales? Sa, South Africa?

*Model performance diagnostics* using the  perplexity and the score (log-likelihood). 
A model with higher score (log-likelihood) and lower perplexity (exp(-1. * log-likelihood per word)) 
is considered to be good:

In [21]:
print("Metrics for the model: Perplexity=%f and score=%f..."
      % (lda.perplexity(tf), lda.score(tf)))

Metrics for the model: Perplexity=827.634822 and score=-24181108.178084...


### NMF

In [26]:
n_components = 5  # Number of Topics
n_top_words = 6
model=2

if model==1:
   print("Fitting LDA models with tf features, n_samples=%d and n_features=%d..."  % (n_samples, n_features))
   lda = LatentDirichletAllocation(n_components=n_components, max_iter=2,
                                learning_method='online',
                                learning_offset=50.,
                                random_state=0)
   t0 = time()
   lda.fit(tf)
   print("done in %0.3fs." % (time() - t0))
   print("\nTopics in LDA model:")
   tf_feature_names = tf_vectorizer.get_feature_names()
   print_top_words(lda, tf_feature_names, n_top_words)

elif model==2:

  # Fit the NMF model
  print("Fitting the NMF model (Frobenius norm) with tf-idf features, n_samples=%d and n_features=%d..."
      % (n_samples, n_features))
  t0 = time()
  nmf = NMF(n_components=n_components, random_state=1,
          alpha=.1, l1_ratio=.5).fit(tfidf)
  print("donee in %0.3fs." % (time() - t0))

  print("\nTopics in NMF model (Frobenius norm):")
  tfidf_feature_names = tfidf_vectorizer.get_feature_names()
  print_top_words(nmf, tfidf_feature_names, n_top_words)
    

Fitting the NMF model (Frobenius norm) with tf-idf features, n_samples=1103665 and n_features=1000...
donee in 113.926s.

Topics in NMF model (Frobenius norm):
Topic #0: polic zone festiv first firm firefight
Topic #1: man zone fee firm firefight fire
Topic #2: new zone festiv first firm firefight
Topic #3: plan zone first firm firefight fire
Topic #4: call festiv fish first firm firefight



# Export the model

In [27]:
filename1 = 'lda_news.sav'
pickle.dump(lda, open(filename1, 'wb'))


In [28]:
filename3 = 'tf_news.sav'
pickle.dump(tf, open(filename3, 'wb'))

In [29]:
filename4 = 'tfve_news.sav'
pickle.dump(tf_vectorizer, open(filename4, 'wb'))