In [None]:
import numpy as np
import pandas as pd
import spacy
import glob 
import pyLDAvis
import pyLDAvis.sklearn
from wordcloud import WordCloud, STOPWORDS
import warnings
warnings.simplefilter('ignore')
warnings.filterwarnings('ignore', category=DeprecationWarning)
from tqdm import tqdm
import matplotlib.pyplot as plt
from spacy import displacy
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation, TruncatedSVD
from spacy.lang.nl.stop_words import STOP_WORDS
from string import punctuation
from spacy.lang.nl import Dutch


In [None]:
# Load spaCy's medium-sized Dutch pipeline. Every later cell that parses text goes through `nlp`.
# NOTE(review): the 'nl_core_news_md' model package presumably has to be installed separately before this cell can run — confirm in the setup instructions.
nlp = spacy.load('nl_core_news_md')

In [None]:
#Ref for attempting summarization https://www.numpyninja.com/post/text-summarization-through-use-of-spacy-library

In [None]:
# Open the sample document. The encoding is given explicitly (UTF-8, as createDF uses for the
# other text files) instead of relying on the platform default, which differs between systems.
myfile = open("TaxRelatedFile.txt", encoding="utf-8")

In [None]:
# Read the whole document into memory, then release the file handle.
# Re-running this cell without re-opening the file now raises instead of silently yielding ''.
text = myfile.read()
myfile.close()

In [None]:
# Dump the raw document for a visual check (prints the full text, however long it is).
print(text)

In [None]:
# Size of the raw document in characters.
len(text)

In [None]:
# Run the Dutch pipeline over the raw text; `doc` is the parsed document used by the cells below.
doc=nlp(text)

In [None]:
# Print the document one detected sentence at a time.
for sent in doc.sents:
    print(sent)

In [None]:
#Number of tokens in the parsed document (tokens, not only words: punctuation marks are counted too)
len(doc)

In [None]:
#List the attributes and methods available on the parsed document (exploration aid only)
dir(doc)

In [None]:
# Show a single token: index 5 is the sixth token of the document.
print(doc[5])

Using spaCy's built-in visualizer to detect named entities in the document

In [None]:
# Highlight the named entities of the raw document inline in the notebook.
displacy.render(doc, style="ent", jupyter=True)

Look at the LAW label: spaCy assigned it to some of the tax-related tokens.

In [None]:
# Ask spaCy for the human-readable description of the LAW entity label.
spacy.explain("LAW")

### Lemmatization

In [None]:
# Rebuild the document as one string of lemmas separated by single spaces.
review = " ".join(token.lemma_ for token in doc)

In [None]:
# Re-parse the lemmatized text. NOTE: this rebinds `doc`, so every later cell works on the
# lemmatized document, not on the raw text parsed earlier.
doc = nlp(review)
# Entity highlighting again, now on the lemmatized text.
spacy.displacy.render(doc, style='ent',jupyter=True)

### Parts of Speech tagging

In [None]:
# Print every token with its coarse part-of-speech tag.
# `doc` already holds nlp(review) from the lemmatization step, so it is reused here
# instead of running the whole pipeline over the text a second time.
for token in doc:
    print(token, "=>", token.pos_)

In [None]:
#Extend the punctuation characters imported from `string` with the newline character.
# Guarded so that re-running the cell does not append another '\n' every time.
if not punctuation.endswith('\n'):
    punctuation = punctuation + '\n'

In [None]:
#Collect the text of every token of the (lemmatized) document and print the list:
tokens=[token.text for token in doc]
print(tokens)

In [None]:
#Count word frequencies in the text after removing stopwords and punctuation.
# Counts are keyed by the lowercased token text: the sentence-scoring step looks words up
# with `word.text.lower()`, so keying by the original casing would make every capitalized
# word invisible to the scoring.

stopwords = list(STOP_WORDS)

word_frequencies={}
for word in doc:
    key = word.text.lower()
    # `punctuation` is a string, so this is a substring test against the punctuation characters
    if key in stopwords or key in punctuation:
        continue
    word_frequencies[key] = word_frequencies.get(key, 0) + 1

In [None]:
#Print the raw counts to see which words dominate the text.
print(word_frequencies)

In [None]:
#Find the maximum count and divide every count by it, so the most frequent word scores 1.0.
# max() raises ValueError when word_frequencies is empty.
max_frequency=max(word_frequencies.values())
for word in word_frequencies.keys():
    word_frequencies[word]=word_frequencies[word]/max_frequency

In [None]:
#Print the normalized word frequencies, one word per line,
#in descending order of frequency.

#print(word_frequencies)
w_sorted_keys = sorted(word_frequencies, key=word_frequencies.get, reverse=True)
for w in w_sorted_keys:
    print(w, word_frequencies[w])

In [None]:
#Materialize the sentence spans of the document as a list
sentence_tokens = list(doc.sents)
print(sentence_tokens)

In [None]:
#Score every sentence by summing the frequencies of the words it contains.
# Words without an entry in word_frequencies contribute nothing; a sentence without any
# known word gets no entry in sentence_scores.
sentence_scores = {}
for sent in sentence_tokens:
    for word in sent:
        lowered = word.text.lower()
        if lowered in word_frequencies:
            sentence_scores[sent] = sentence_scores.get(sent, 0) + word_frequencies[lowered]

In [None]:
#Display the score computed for each sentence
sentence_scores

In [None]:
#Use heapq.nlargest to keep the highest-scoring 30% of the sentences.
# At least one sentence is requested: int(n * 0.3) is 0 for texts with fewer than four
# sentences, which would otherwise produce an empty summary.
from heapq import nlargest
select_length=max(1, int(len(sentence_tokens)*0.3))
# nlargest returns the sentences ordered by score (best first), not by position in the text
summary=nlargest(select_length, sentence_scores,key=sentence_scores.get)
summary

In [None]:
#Build the summary text from the selected sentences.
# The sentence texts carry no trailing whitespace, so they are joined with a space;
# joining with '' glued the last word of one sentence to the first word of the next.
# NOTE: `summary` is rebound from a list of sentences to a string here.
final_summary=[word.text for word in summary]
summary=' '.join(final_summary)
summary

In [None]:
# Length of the summary in characters (compare with len(text) of the original document).
len(summary)

In [None]:
# all the texts together
# import glob 

# path = 'text_nl/*.txt'

# all_texts=""

# for file in glob.glob(path):
#     with open(file, encoding='utf-8', errors='ignore') as file_in:
#         text = file_in.read()
#         all_texts+=text
#         lines = text.split('\n')
#         for line in lines:
#             line = nlp(line)
#             for token in line:
#                 print(token)

In [None]:
#len(all_texts)

In [None]:
#docs = list(nlp.pipe(all_texts), n_process=4)
# docs = nlp.pipe(all_texts, n_process=4)

In [None]:
# from collections import Counter

# all_laws = []

# for d in docs:
#     laws = [ent.text for ent in d.ents if ent.label_ == "LAW"]
#     all_laws.extend(laws)

# Counter(all_laws).most_common(2)
    

### Creating a dataframe from files

In [None]:
#all the texts together
def createDF(path):
    """
    Read every file matching a glob pattern into a one-column dataframe.

    Parameters
    ----------
    path : str
        Glob pattern selecting the text files, e.g. 'clean_text_nl/*.txt'.

    Returns
    -------
    pandas.DataFrame
        One row per matched file, in sorted path order, with the file's text in the
        "article_content" column. The frame is empty when nothing matches.
    """
    contents = []
    # glob returns matches in arbitrary order; sorting keeps the row positions stable between runs
    for file in sorted(glob.glob(path)):
        with open(file, encoding='utf-8', errors='ignore') as file_in:
            # Line breaks become spaces so that words on adjacent lines are not fused together
            contents.append(file_in.read().replace('\n', ' '))

    return pd.DataFrame({"article_content": contents}, columns=["article_content"])


In [None]:
# Load every .txt file of the clean_text_nl folder (path relative to the notebook) into `df`.
df = createDF('clean_text_nl/*.txt')

In [None]:
# Preview the first rows of the loaded articles.
df.head()

In [None]:
# Row at position 5 (the sixth row); the double brackets keep the result a one-row DataFrame.
df.iloc[[5]]

In [None]:
# Render the row at position 6 as plain text — presumably to read the article without the
# column truncation of the dataframe display.
df.iloc[[6]].to_string()

In [None]:
# # Parser for content
# parser = Dutch()
# punctuations = punctuation
# stopwords = list(STOP_WORDS)
# def spacy_tokenizer(sentence):
#     mytokens = parser(sentence)
#     mytokens = [word.lemma_.lower().strip() if word.lemma_ != "-PRON-" else word.lower_ for word in mytokens ]
#     print(mytokens)
#     mytokens = [word for word in mytokens if word not in stopwords and word not in punctuations ]
#     mytokens = " ".join([i for i in mytokens])
#     return mytokens

In [None]:
#ref: https://stackoverflow.com/questions/45605946/how-to-do-text-pre-processing-using-spacy
stopwords = list(STOP_WORDS)
punctuations = punctuation
def normalize(comment, lowercase, remove_stopwords):
    """
    Lemmatize a text with the spaCy pipeline and return the lemmas as one string.

    comment: the raw text to process.
    lowercase: when truthy, the text is lowercased before it is parsed.
    remove_stopwords: when truthy, lemmas found in `stopwords` or in `punctuations`
        are dropped; otherwise every non-empty lemma is kept.

    Returns the kept lemmas joined by single spaces.
    """
    parsed = nlp(comment.lower() if lowercase else comment)
    # Strip surrounding whitespace first; lemmas that end up empty are discarded below
    stripped_lemmas = (token.lemma_.strip() for token in parsed)
    kept = [
        lemma
        for lemma in stripped_lemmas
        if lemma
        and (not remove_stopwords or (lemma not in stopwords and lemma not in punctuations))
    ]
    return " ".join(kept)

In [None]:
# Register tqdm with pandas so that `progress_apply` (apply with a progress bar) is available.
tqdm.pandas()
# Normalize every article; the keyword arguments are forwarded to normalize().
df["processed_content"] = df["article_content"].progress_apply(normalize, lowercase=True, remove_stopwords=True)
#df["processed_content"] = df["article_content"].progress_apply(spacy_tokenizer)

In [None]:
# Display the frame, now holding both the raw and the processed text of each article.
df

In [None]:
# Inspect the processed text of the article with index label 6.
df["processed_content"][6]

### Topic-modeling

In [None]:
# Creating a vectorizer (bag-of-words counts over the processed articles)
# The token pattern is a raw string so that `\-` reaches the regex engine unchanged,
# without triggering Python's invalid-escape-sequence warning.
# NOTE(review): the pattern only admits ASCII letters and '-', so words containing accented
# letters are split at those characters — confirm this is intended for Dutch text.
vectorizer = CountVectorizer(min_df=5, max_df=0.9, stop_words=stopwords, lowercase=True, token_pattern=r'[a-zA-Z\-][a-zA-Z\-]{2,}')
data_vectorized = vectorizer.fit_transform(df["processed_content"])

In [None]:
#Using tfidf vectorizer
# Builds a second, TF-IDF weighted representation with its own vocabulary
# (different min_df/max_df and the default token pattern, so it does not match `vectorizer`'s).
# NOTE(review): none of the models fitted in this notebook use `tfidf`; only the feature names are read later.
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, stop_words=stopwords)
tfidf = tfidf_vectorizer.fit_transform(df["processed_content"])
tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()

In [None]:
# Number of topics/components shared by the LDA, NMF and LSI models below.
NUM_TOPICS = 10

In [None]:
# Latent Dirichlet Allocation Model
# random_state pins the stochastic fit so that the topics are the same on every run.
lda = LatentDirichletAllocation(n_components=NUM_TOPICS, max_iter=10, learning_method='online', verbose=True, random_state=42)
data_lda = lda.fit_transform(data_vectorized)

### Using LDA

In [None]:
# Functions for printing keywords for each topic
def selected_topics(model, vectorizer, top_n=10):
    """
    Print the top_n highest-weighted vocabulary terms of every topic.

    model: fitted estimator exposing `components_` (one row of term weights per topic).
    vectorizer: fitted vectorizer exposing `get_feature_names_out()`; its vocabulary must be
        the one the model was fitted on.
    top_n: number of (term, weight) pairs printed per topic.
    """
    # The vocabulary is identical for every topic: fetch it once instead of once per printed word
    feature_names = vectorizer.get_feature_names_out()
    for idx, topic in enumerate(model.components_):
        print("Topic %d:" % (idx))
        # argsort is ascending; the reversed slice walks the top_n largest weights, biggest first
        print([(feature_names[i], topic[i])
               for i in topic.argsort()[:-top_n - 1:-1]])

In [None]:
# Keywords for topics clustered by Latent Dirichlet Allocation
# (`vectorizer` is the CountVectorizer the LDA input matrix was built with)
print("LDA Model:")
selected_topics(lda, vectorizer)

### Using NMF

In [None]:
# Non-Negative Matrix Factorization Model
# random_state fixes the randomized parts of the initialisation/solver so the run is repeatable.
nmf = NMF(n_components=NUM_TOPICS, random_state=42)
data_nmf = nmf.fit_transform(data_vectorized)

In [None]:
# Keywords for topics clustered by Non-Negative Matrix Factorization
# (same count matrix and vocabulary as the LDA model)

print("NMF Model:")
selected_topics(nmf, vectorizer)

### Using a Latent Semantic Indexing model (Truncated SVD)

In [None]:
# Latent Semantic Indexing via truncated SVD of the count matrix.
# random_state fixes the randomized solver so the components are repeatable between runs.
lsi = TruncatedSVD(n_components=NUM_TOPICS, random_state=42)
data_lsi = lsi.fit_transform(data_vectorized)

In [None]:
# Keywords for topics clustered by Latent Semantic Indexing
# NOTE(review): selected_topics ranks by the largest weights only; SVD components can presumably
# also carry strongly negative weights, which this listing would not show — confirm if relevant.
print("LSI Model:")
selected_topics(lsi, vectorizer)

### Visualizing LDA results with pyLDAvis

In [None]:
# Switch pyLDAvis to notebook rendering and build the interactive view of the fitted LDA model
# from the model, its count matrix and the matching vectorizer.
# NOTE(review): `pyLDAvis.sklearn` appears to have been renamed to `pyLDAvis.lda_model` in newer
# pyLDAvis releases — confirm against the installed version.
pyLDAvis.enable_notebook()
dash = pyLDAvis.sklearn.prepare(lda, data_vectorized, vectorizer, mds='tsne')
dash

In [None]:
#ref https://ratulesrar3.github.io/sotu-approval-analysis/
def show_wordcloud(data, title = None):
    """
    Draw a word cloud of `str(data)` with matplotlib.

    data: the text to visualise; it is converted with str() before the cloud is generated.
    title: optional figure title, drawn only when truthy.
    """
    cloud_options = dict(
        background_color='white',
        stopwords=stopwords,
        max_words=200,
        max_font_size=40,
        scale=3,
        random_state=1,  # fixed value, so the word layout does not change between runs
    )
    cloud = WordCloud(**cloud_options).generate(str(data))

    # Figure number 1 is requested on every call
    canvas = plt.figure(1, figsize=(12, 12))
    plt.axis('off')
    if title:
        canvas.suptitle(title, fontsize=20)
        canvas.subplots_adjust(top=2.3)

    plt.imshow(cloud)
    plt.show()

In [None]:
def display_topics(model, feature_names, no_top_words):
    """
    Show one word cloud per topic of a fitted topic model.

    model: fitted estimator exposing `components_` (one row of term weights per topic).
    feature_names: vocabulary terms, indexed like the columns of `components_`.
    no_top_words: how many of the highest-weighted terms go into each cloud.
    """
    for position, weights in enumerate(model.components_, start=1):
        # Indices of the no_top_words largest weights, biggest first
        top_indices = weights.argsort()[:-no_top_words - 1:-1]
        words = [feature_names[idx] for idx in top_indices]
        show_wordcloud(' '.join(words), 'Topic {}'.format(position))

In [None]:
no_top_words = 13
# `lda` was fitted on the CountVectorizer matrix (`data_vectorized`), so the columns of its
# components follow `vectorizer`'s vocabulary. Labelling them with the TF-IDF vocabulary,
# which is built with different settings, shows unrelated words in the clouds.
display_topics(lda, vectorizer.get_feature_names_out(), no_top_words)