## NLTK and RAKE Text Mining Tests from a Corpus of Born Digital Archival Content

A factory that runs some NLTK and RAKE auto-indexing and other (incomplete) text mining experiments, as a contrast to using spaCy.

In [None]:
import pandas as pd
import numpy as np
import scipy as sp

import warnings 
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB         # Naive Bayes
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from textblob import TextBlob, Word
from nltk.stem.snowball import SnowballStemmer

import re
import nltk
#nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import RegexpTokenizer
#nltk.download('wordnet') 
from nltk.stem.wordnet import WordNetLemmatizer

import RAKE
import operator
from rake_nltk import Rake


from os import path
from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline


## Set up workspace, generate the corpus, do some EDA

In [None]:
# Directory whose files are read into the corpus below (placeholder path; set before running).
corpus_root = '/path/to/utf-8/textfiles/'

In [None]:
from collections import defaultdict
from pathlib import Path

In [None]:
# Build the corpus: read every regular file directly under corpus_root and
# collect its name and text into a two-column dataframe (file_name, text).
corpus = defaultdict(list)
# iterdir() yields entries in arbitrary, OS-dependent order; sorting makes the
# row order (and therefore any positional lookup such as corpus[22]) repeatable.
for file in sorted(Path(corpus_root).iterdir()):
    # Sub-directories and other non-file entries cannot be opened as text.
    if not file.is_file():
        continue
    # The files are expected to be UTF-8 (per the corpus path), so decode them
    # explicitly rather than with the platform's default encoding.
    with open(file, "r", encoding="utf-8") as file_open:
        text = file_open.read()
    # Append only after a successful read so both columns stay the same length.
    corpus["file_name"].append(file.name)
    corpus["text"].append(text)
df = pd.DataFrame(corpus)


print(df)
#print(corpus['file_name'])

In [None]:
# Swap every newline in the text column for a single space
df['text'] = df['text'].replace(to_replace=r'\n', value=' ', regex=True)
print(df)

In [None]:
# Preview the first rows of the text column
df['text'].head()

In [None]:
# Fetch the word count for each text.
# split() with no argument splits on any run of whitespace, so consecutive
# spaces (e.g. left behind by the newline replacement) do not produce empty
# "words" that inflate the count.
df['word_count'] = df['text'].apply(lambda x: len(str(x).split()))
df[['text','word_count']].head()



In [None]:
## Descriptive statistics of the per-document word counts
df['word_count'].describe()


In [None]:
# Most common words: join all texts, split on whitespace, keep the 20 highest counts
freq = pd.Series(' '.join(df['text']).split()).value_counts().head(20)
freq

In [None]:
# Least common words: same tally as above, but keep the 20 lowest counts
freq1 = pd.Series(' '.join(df['text']).split()).value_counts().tail(20)
freq1

In [None]:
## Build the set of English stop words from NLTK.
## NOTE(review): no custom stop words are actually added here — extend this set if any are needed.
stop_words = set(stopwords.words("english"))

In [None]:
# Pre-process every document in df['text'] into a lower-cased, stop-word-free,
# lemmatised string and collect the results in the list `corpus`.
stop_words = set(stopwords.words("english"))

# Built once, not on every pass through the loop.
# The stemmer is instantiated but not applied below; only lemmatisation is used
# (PorterStemmer() is the alternative the notebook considered).
ps = SnowballStemmer('english')
lem = WordNetLemmatizer()

corpus = []
# Iterate over the documents themselves instead of a hard-coded range(0, 610):
# that raised as soon as the corpus held fewer than 610 files and silently
# ignored every document beyond the 610th.
for raw_text in df['text']:
    # Remove punctuation: everything that is not an ASCII letter becomes a space
    text = re.sub('[^a-zA-Z]', ' ', raw_text)

    # Convert to lowercase
    text = text.lower()

    # remove tags
    # NOTE(review): the substitution above has already replaced every
    # non-letter character, so this pattern can no longer match anything.
    # If tag stripping is wanted it has to run on the raw text — confirm intent.
    text = re.sub("&lt;/?.*?&gt;", " &lt;&gt; ", text)

    # remove special characters and digits (at this point: collapse runs of spaces)
    text = re.sub("(\\d|\\W)+", " ", text)

    # Convert to list from string
    text = text.split()

    # Lemmatise every word that is not a stop word
    text = [lem.lemmatize(word) for word in text if word not in stop_words]
    text = " ".join(text)
    corpus.append(text)

In [None]:
# View a single pre-processed corpus item (the document at position 22)
corpus[22]

In [None]:
# Word cloud of the whole pre-processed corpus.
# The documents are joined into one string: str(corpus) would render the
# Python list literal, feeding its brackets, commas and quote characters to
# the word-cloud tokenizer along with the text.
wordcloud = WordCloud(
    background_color='white',
    stopwords=stop_words,
    max_words=100,
    max_font_size=50,
    random_state=42,  # fixed seed so the layout is repeatable
    scale=3,
).generate(" ".join(corpus))
print(wordcloud)
fig = plt.figure(1, figsize=(12, 12))
plt.imshow(wordcloud)
plt.axis('off')
plt.show()
#fig.savefig("word1.png", dpi=900)

## Vectorize and build vocabulary

In [None]:
# Use CountVectorizer to tokenise the text and build a vocabulary of known words.
# We first create a variable "cv" of the CountVectorizer class,
# and then invoke fit_transform to learn the vocabulary and build the
# document-term count matrix X.
#
# stop_words is handed over as a list: scikit-learn documents this parameter
# as 'english', a list, or None, and recent releases reject a set outright.
# sorted() also keeps the argument order stable from run to run.
cv = CountVectorizer(max_df=0.8, stop_words=sorted(stop_words), max_features=10000, ngram_range=(1, 3))
X = cv.fit_transform(corpus)

In [None]:
# Show the first 22 vocabulary terms (in the dictionary's own order)
list(cv.vocabulary_)[:22]

## N-Gram generation

In [None]:
#We can use the CountVectoriser to visualise the top 20 unigrams, bi-grams and tri-grams.

#Function to generate the most frequently occurring words
def get_top_n_words(corpus, n=None):
    """Return the ``n`` highest-count terms in ``corpus`` as (term, count) pairs.

    A default CountVectorizer is fitted on ``corpus``, the per-term counts are
    summed over all documents, and the pairs are ordered by descending count.
    With ``n=None`` every term is returned.
    """
    vectorizer = CountVectorizer()
    vectorizer.fit(corpus)
    term_counts = vectorizer.transform(corpus)
    # Column sums: one total per vocabulary term, as a 1 x n_terms matrix
    totals = term_counts.sum(axis=0)
    ranked = [
        (term, totals[0, column])
        for term, column in vectorizer.vocabulary_.items()
    ]
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked[:n]

# Tabulate the 20 most frequent words so they can be drawn as a bar plot
top_words = get_top_n_words(corpus, n=20)
top_df = pd.DataFrame(top_words, columns=["Word", "Freq"])

# Bar plot of the most frequent words, with the x labels rotated
sns.set(rc={'figure.figsize': (13, 8)})
g = sns.barplot(data=top_df, x="Word", y="Freq")
g.set_xticklabels(g.get_xticklabels(), rotation=60)

In [None]:
#Function to generate the most frequently occurring bi-grams
def get_top_n2_words(corpus, n=None):
    """Return the ``n`` highest-count bi-grams in ``corpus`` as (bi-gram, count) pairs.

    Only two-word n-grams are counted and the vocabulary is capped at 2000
    features. Pairs are ordered by descending count; ``n=None`` returns all.
    """
    bigram_vectorizer = CountVectorizer(ngram_range=(2, 2), max_features=2000)
    bigram_vectorizer.fit(corpus)
    bigram_counts = bigram_vectorizer.transform(corpus)
    # Column sums: one total per bi-gram, as a 1 x n_features matrix
    totals = bigram_counts.sum(axis=0)
    ranked = [
        (bigram, totals[0, column])
        for bigram, column in bigram_vectorizer.vocabulary_.items()
    ]
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked[:n]

# Tabulate the 20 most frequent bi-grams for plotting
top2_words = get_top_n2_words(corpus, n=20)
top2_df = pd.DataFrame(top2_words, columns=["Bi-gram", "Freq"])
#print(top2_df)

# Bar plot of the most frequent bi-grams, with the x labels rotated
sns.set(rc={'figure.figsize': (13, 8)})
h = sns.barplot(data=top2_df, x="Bi-gram", y="Freq")
h.set_xticklabels(h.get_xticklabels(), rotation=60)

In [None]:
#Most frequently occurring tri-grams
def get_top_n3_words(corpus, n=None):
    """Return the ``n`` highest-count tri-grams in ``corpus`` as (tri-gram, count) pairs.

    Only three-word n-grams are counted and the vocabulary is capped at 2000
    features. Pairs are ordered by descending count; ``n=None`` returns all.
    """
    trigram_vectorizer = CountVectorizer(ngram_range=(3, 3), max_features=2000)
    trigram_vectorizer.fit(corpus)
    trigram_counts = trigram_vectorizer.transform(corpus)
    # Column sums: one total per tri-gram, as a 1 x n_features matrix
    totals = trigram_counts.sum(axis=0)
    ranked = [
        (trigram, totals[0, column])
        for trigram, column in trigram_vectorizer.vocabulary_.items()
    ]
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked[:n]
# Tabulate the 20 most frequent tri-grams for plotting
top3_words = get_top_n3_words(corpus, n=20)
top3_df = pd.DataFrame(top3_words, columns=["Tri-gram", "Freq"])
#print(top3_df)

# Bar plot of the most frequent tri-grams, with the x labels rotated
sns.set(rc={'figure.figsize': (13, 8)})
j = sns.barplot(data=top3_df, x="Tri-gram", y="Freq")
j.set_xticklabels(j.get_xticklabels(), rotation=75)

## Generate auto-abstract/keywords for a document using tf-idf scoring

In [None]:
# Learn the idf weights from the corpus-wide count matrix X.
tfidf_transformer = TfidfTransformer(smooth_idf=True, use_idf=True)
tfidf_transformer.fit(X)

# Get the feature names, indexed by column of the count matrix.
# get_feature_names() was removed from newer scikit-learn releases in favour
# of get_feature_names_out(); prefer the new API and fall back on old installs.
if hasattr(cv, "get_feature_names_out"):
    feature_names = cv.get_feature_names_out()
else:
    feature_names = cv.get_feature_names()

# Fetch the document for which keywords need to be extracted
doc = corpus[22]

# Generate the tf-idf vector for the given document
tf_idf_vector = tfidf_transformer.transform(cv.transform([doc]))

In [None]:
#Function for sorting tf_idf in descending order
from scipy.sparse import coo_matrix
def sort_coo(coo_matrix):
    """Return (column, value) pairs of ``coo_matrix`` ordered from highest value down.

    Pairs are built from the matrix's ``col`` and ``data`` attributes; ties on
    the value are broken by the higher column index first.
    """
    pairs = list(zip(coo_matrix.col, coo_matrix.data))
    pairs.sort(key=lambda pair: (pair[1], pair[0]), reverse=True)
    return pairs
 
def extract_topn_from_vector(feature_names, sorted_items, topn=10):
    """Map the first ``topn`` entries of ``sorted_items`` to {feature name: score}.

    ``sorted_items`` holds (feature index, score) pairs; each index is looked
    up in ``feature_names`` and each score is rounded to three decimals. The
    returned dict keeps the order of ``sorted_items``.
    """
    leading_items = sorted_items[:topn]
    return {
        feature_names[feature_idx]: round(score, 3)
        for feature_idx, score in leading_items
    }
# Rank the document's tf-idf entries from highest to lowest score
sorted_items = sort_coo(tf_idf_vector.tocoo())
# Keep only the top n entries; n here is 15
keywords = extract_topn_from_vector(feature_names, sorted_items, 15)

# Print the document, then each keyword with its score
print("\nAbstract:")
print(doc)
print("\nKeywords:")
for k, score in keywords.items():
    print(k, score)

## Generate keywords for the whole corpus using tf-idf scoring

In [None]:
# corpus code to come

## Generate keyphrases using RAKE

In [None]:
# Path to the stop-word list handed to RAKE (placeholder; set before running).
# NOTE(review): this placeholder sits inside the same directory as corpus_root, so the
# stop list itself would be read as a corpus document — confirm the real location.
stop_dir = '/path/to/utf-8/textfiles/SmartStoplist.txt'
# Build the RAKE object from that stop-word file path.
rake_object = RAKE.Rake(stop_dir)

In [None]:
def Sort_Tuple(tup):
    """Sort the list ``tup`` in place by each item's second element and return it.

    The order is ascending, and the very same list object that was passed in
    is handed back.
    """
    tup.sort(key=operator.itemgetter(1))
    return tup

In [None]:
# Run RAKE on a single document from the corpus
keywords = Sort_Tuple(rake_object.run(corpus[22]))

# Order by score, highest first; a higher score means a more important phrase
keywords.sort(key=operator.itemgetter(1), reverse=True)

# Print the results tab-delimited: keyword<TAB>rake score
for phrase, score in keywords:
    print(f"{phrase}\t{score}")

#print('keywords:', keywords)


In [None]:
# Order the (phrase, score) pairs by score, highest first, and show the raw list
keywords.sort(key=operator.itemgetter(1), reverse=True)
print(keywords)

In [None]:
# Same results, one tab-separated "phrase<TAB>score" line per keyword
for phrase, score in keywords:
    print(f"{phrase}\t{score}")