**import packages**

In [2]:
#import packages
import re
import numpy as np
import pandas as pd
from pprint import pprint
from glob import glob
import funcy as fp

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# spacy for lemmatization
import spacy

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline



# Enable logging for gensim - optional
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

In [3]:
# Prepare stopwords.

# Start from the Mallet stopword list because it contains more generic words.
# NOTE: hard-coded absolute path -- adjust it for the machine running the notebook.
with open('/home/santhilata/Dropbox/TopicModeling/malletStopwords.txt','r') as f:
    # split() with no argument splits on any run of whitespace, so the list
    # loads correctly whether the file is space- or newline-separated, and no
    # entry keeps a trailing '\n' or ends up as an empty string.
    stop_words = f.read().split()

# Extra words appended on top of the Mallet list (kept as a list so that
# later cells can still call stop_words.extend / append).
stop_words.extend(['honourable', 'minister', 'prime', 'cabinet', 'parliament','document','her','majesty','hon',
                   'secretary','apos','may',
                  'secretariat','government','cab','spreadsheet','ref','state', 'confidential', 'mpsecretary','report','make','would',
                  'apo', 'cab','spreadsheet','images','ref','therefore','could'])

In [13]:
# Folder whose *.txt files are loaded as documents.
# NOTE(review): hard-coded absolute path -- will need changing on another machine.
path_to_folder = '/media/santhilata/Santhi_backup/Projects/TopicModeling/Hansard/1960/apr'
# quick and dirty....
# E-mail-like strings. Only lowercase letters are listed because
# tokenize_line() lowercases the text before applying the patterns.
EMAIL_REGEX = re.compile(r"[a-z0-9\.\+_-]+@[a-z0-9\._-]+\.[a-z]*")
# Any character that is NOT a lowercase letter, a space, an apostrophe or '#'.
FILTER_REGEX = re.compile(r"[^a-z '#]")
# (pattern, replacement) pairs applied in this order: e-mails become the
# placeholder "#email" first; '#' is excluded from FILTER_REGEX, so the
# placeholder survives the second substitution.
TOKEN_MAPPINGS = [(EMAIL_REGEX, "#email"), (FILTER_REGEX, ' ')]

def tokenize_line(line):
    """Turn one line of text into a list of whitespace-separated tokens.

    The line is lowercased, every (pattern, replacement) pair in
    TOKEN_MAPPINGS is applied in order, and the result is split on
    whitespace.
    """
    text = line.lower()
    for pattern, substitute in TOKEN_MAPPINGS:
        text = pattern.sub(substitute, text)
    return text.split()
    
def tokenize(lines, token_size_filter=2):
    """Tokenize every line and keep tokens longer than ``token_size_filter``.

    Tokens come from tokenize_line() and are returned as one flat list, in
    line order then token order.
    """
    kept = []
    for line in lines:
        for token in tokenize_line(line):
            if len(token) > token_size_filter:
                kept.append(token)
    return kept
    

def load_doc(filename):
    """Read one text file and return its raw lines together with its tokens.

    Parameters
    ----------
    filename : str
        Path of the file to read. It is opened with ``errors='ignore'``, so
        bytes that cannot be decoded are skipped instead of raising.

    Returns
    -------
    dict
        ``{'doc': <list of lines from readlines()>, 'tokens': tokenize(doc)}``.
    """
    # The path is no longer unpacked into (group, doc_id): those values were
    # never used, and the unpacking raised ValueError for any path without '/'.
    with open(filename, errors='ignore') as f:
        doc = f.readlines()
    return {'doc': doc,
            'tokens': tokenize(doc),
            }


# Load every .txt file in the folder into one row of the frame.
# glob() returns paths in arbitrary order, so sort them: row indices (and
# everything derived from document order) are then the same on every run.
docs = pd.DataFrame([load_doc(path) for path in sorted(glob(path_to_folder+'/*.txt'))])
docs.head()



Unnamed: 0,doc,tokens
0,[],[]
1,"[\n, asked the Minister of Labour ...","[asked, the, minister, labour, when, ministry,..."
2,[],[]
3,[],[]
4,"[\n, asked the Minister of Labour ...","[asked, the, minister, labour, aware, that, th..."


In [14]:
# 'length' is the number of lines read from each file; rows whose file
# produced no lines at all are removed from `docs` in place.
docs['length'] = docs['doc'].map(len)
dindex = docs.index[docs['length'] == 0]
docs.drop(index=dindex, inplace=True)
#

In [16]:
# Preview the frame again now that it also carries the 'length' column.
docs.head()


Unnamed: 0,doc,tokens,length
1,"[\n, asked the Minister of Labour ...","[asked, the, minister, labour, when, ministry,...",12
4,"[\n, asked the Minister of Labour ...","[asked, the, minister, labour, aware, that, th...",18
5,[ asked the Minister of Defence what further c...,"[asked, the, minister, defence, what, further,...",2
6,"[\n, asked the Minister of Pension...","[asked, the, minister, pensions, and, national...",23
7,[ asked the Minister of Housing and Local Gove...,"[asked, the, minister, housing, and, local, go...",2


In [17]:
# One list of tokens per remaining document, as a plain Python list.
data = docs['tokens'].tolist()

In [18]:
#remove stopwords, lemmatize
# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    """Re-tokenize each document and drop every word found in ``stop_words``.

    Each element of ``texts`` is converted with ``str()`` and passed through
    gensim's ``simple_preprocess``; the surviving words are returned as one
    list per document, in the original order.
    """
    # Build the lookup set once per call: membership tests against the
    # stop_words list would rescan the whole list for every single word.
    stop_set = set(stop_words)
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_set] for doc in texts]

def make_bigrams(texts):
    """Return ``bigram_mod[doc]`` for every document in ``texts``.

    ``bigram_mod`` is read from module scope, so it must be defined before
    this function is called.
    """
    merged = []
    for tokens in texts:
        merged.append(bigram_mod[tokens])
    return merged

def make_trigrams(texts):
    """Return ``trigram_mod[bigram_mod[doc]]`` for every document in ``texts``.

    Both models are read from module scope, so they must be defined before
    this function is called.
    """
    merged = []
    for tokens in texts:
        with_bigrams = bigram_mod[tokens]
        merged.append(trigram_mod[with_bigrams])
    return merged

def lemmatization(texts, allowed_postags=('NOUN', 'ADJ', 'VERB', 'ADV')):
    """Lemmatize each document, keeping only tokens with an allowed POS tag.

    Tag names are described at https://spacy.io/api/annotation

    Parameters
    ----------
    texts : iterable of list of str
        One list of words per document; the words are joined with spaces
        and passed to the module-level ``nlp`` pipeline.
    allowed_postags : container of str
        ``token.pos_`` values to keep. The default is a tuple rather than a
        list so that the shared default can never be mutated by accident;
        any list, tuple or set is accepted.

    Returns
    -------
    list of list of str
        For each document, the ``lemma_`` of every kept token, in order.
    """
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent))
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

In [19]:
# Remove stop words: every token list in `data` is re-tokenized by
# remove_stopwords() and filtered against `stop_words`.
data_words_nostops = remove_stopwords(data)

In [20]:
# Initialize spacy 'en' model, keeping only tagger component (for efficiency)
# python3 -m spacy download en
# `nlp` is read as a global by lemmatization(), so it must exist before the
# call further down in this cell.
nlp = spacy.load('en', disable=['parser', 'ner'])


# Build the bigram and trigram models
# NOTE(review): both models are trained on `data` (stop words still present)
# but applied below to `data_words_nostops` -- confirm this is intended.
bigram = gensim.models.Phrases(data, min_count=5, threshold=100) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[data], threshold=100)  

# Faster way to get a sentence clubbed as a trigram/bigram
# These two names are the globals used by make_bigrams() / make_trigrams().
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

# Form Bigrams
# (only make_bigrams is applied here; trigram_mod is built but not used in this cell)
data_words_bigrams = make_bigrams(data_words_nostops)
# Do lemmatization keeping only noun, adj, vb, adv
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

# Sanity check: show the first lemmatized document and the size of the third.
# Indexing element 2 requires at least three documents.
print(data_lemmatized[:1])
print(len(data_lemmatized[2]))




[['ask', 'labour', 'ministry', 'labour', 'training_centre', 'establish', 'northern_region', 'friend', 'present', 'review', 'local', 'centre', 'friend', 'convey', 'friend', 'fact', 'north', 'await', 'outcome', 'review', 'bear_mind', 'opportunity', 'implement', 'statement', 'good', 'north_east', 'coast', 'friend', 'implement', 'pledge', 'feed', 'wait', 'convey', 'friend', 'friend', 'bear_mind', 'take', 'final', 'decision', 'matter', 'horn', 'friend', 'require', 'attitude', 'local', 'trade', 'association', 'consultation', 'object', 'mind', 'take', 'place']]
38
