## 1-Text Extraction

In [None]:
import pandas as pd
import os
from pdfminer.high_level import extract_text

### Select the metadata rows whose jc_number is 119 and attach the extracted
### text of the matching PDF to each row.
metadata = '../wetransfer_b_2022-12-09_1647/B/CLA_meta_from_2018.csv'
df_metadata = pd.read_csv(metadata)
df_jc_number = df_metadata.loc[df_metadata['jc_number'] == 119.0]

### Index every file found under the PDF folder by its bare filename.
### NOTE: two files with the same name in different sub-folders overwrite each
### other here; the last one visited by os.walk wins.
file_dict = {}

for subdir, dirs, files in os.walk('../wetransfer_a_2022-12-09_1528/A/119'):
    for file in files:
        file_dict[file] = os.path.join(subdir, file)

### .copy() gives an independent frame, so the column assignments below do not
### write into a slice of df_jc_number (pandas SettingWithCopy).
files = df_jc_number[['jc_number', 'filename']].copy()
files['path'] = files['filename'].map(file_dict)   ### map path to PDF files to metadata via filename
### regex=False: the backslash is a literal character, not a regex pattern.
files['path'] = files['path'].str.replace('\\', '/', regex=False)

### Fail early, and by name, when metadata references a PDF that is not on disk;
### otherwise a NaN path would reach pdfminer and fail with an opaque error.
missing_files = files.loc[files['path'].isna(), 'filename'].tolist()
if missing_files:
    raise FileNotFoundError(f'No PDF found on disk for metadata filename(s): {missing_files}')

### With path as key, store the pdf text (pdfminer.six) in a dictionary;
### each distinct path is extracted only once.
files_dict = {p: extract_text(p) for p in files['path'].unique()}
files['pdf_text'] = files['path'].map(files_dict)   ### map pdf text to dataframe

files.drop(['path'], axis=1, inplace=True)

## 2-Language Detection

In [None]:
import stanza
from stanza.pipeline.core import Pipeline
from stanza.models.common.doc import Document
from stanza.pipeline.multilingual import MultilingualPipeline

stanza.download(lang="multilingual")
stanza.download(lang="nl")
stanza.download(lang="fr")

text_dict = {}
nl_dict = {}
fr_dict = {}

### Build the Stanza language detector once: its configuration does not depend
### on the document being processed, so constructing it per document only
### repeats the model loading.
stanza_nlp = Pipeline(lang="multilingual", processors="langid", langid_lang_subset=["nl","fr"])

### Split every PDF text into lines and sort the lines into a Dutch bucket and
### a non-Dutch bucket, keyed by filename.
for filename, text in zip(files['filename'], files['pdf_text']):
    lines_text_clean = [line for line in text.splitlines() if len(line) > 0]   ### filter out empty lines
    docs = [Document([], text=line) for line in lines_text_clean]
    if docs:   ### nothing to detect for a document without any non-empty line
        stanza_nlp(docs)   ### annotates each Document in place with .lang
    nl_list = [doc.text for doc in docs if doc.lang == 'nl']   ### dutch lines
    fr_list = [doc.text for doc in docs if doc.lang != 'nl']   ### everything that is not dutch
    nl_dict[filename] = '\n'.join(nl_list)   ### make dictionary of dutch lines
    fr_dict[filename] = '\n'.join(fr_list)   ### make dictionary of non-dutch lines

files['nl_text'] = files['filename'].map(nl_dict)   ### create new column nl_text
files['fr_text'] = files['filename'].map(fr_dict)   ### create new column fr_text

## 3-Clustering

In [None]:
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from string import punctuation
from collections import defaultdict
from heapq import nlargest
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import numpy as np

### Cluster the Dutch texts with TF-IDF + KMeans and derive, for every cluster,
### the frequent words that do not appear among the top words of any other cluster.
N_CLUSTERS = 6   ### single source of truth for the number of clusters
TOP_KEYWORDS = 100   ### most frequent words kept per cluster
UNIQUE_KEYWORDS = 15   ### cluster-specific words reported per cluster

_stopwords = list(stopwords.words('dutch') + list(punctuation) + ["les","'s","''","``","du","la","par","et","à", "aux","«","le", "des"])
_stopword_set = set(_stopwords)   ### constant-time membership tests when filtering tokens
vectorizer = TfidfVectorizer(max_df=0.5, min_df=2, stop_words=_stopwords)

posts = files['nl_text'].to_list()
X = vectorizer.fit_transform(posts)

km = KMeans(n_clusters=N_CLUSTERS, init='k-means++', max_iter=100, n_init=1, verbose=True, random_state=42)
km.fit(X)

### Cluster sizes; the result is only displayed when this is the last
### expression of a notebook cell.
np.unique(km.labels_, return_counts=True)
files['class'] = km.labels_.tolist()

### Gather the posts of each cluster and join them with a newline, so the last
### word of one post is not fused with the first word of the next one.
cluster_posts = defaultdict(list)
for cluster, post in zip(km.labels_.tolist(), posts):
    cluster_posts[cluster].append(post)
text = {cluster: '\n'.join(members) for cluster, members in cluster_posts.items()}
file_ind = {}   ### not used in this section; kept so the name stays defined
ind = []   ### not used in this section; kept so the name stays defined

keywords = {}
counts = {}
for cluster in range(N_CLUSTERS):
    ### .get(): a cluster that received no document simply has no keywords
    word_sent = word_tokenize(text.get(cluster, '').lower(), language="dutch")
    word_sent = [word for word in word_sent if word not in _stopword_set]
    freq = FreqDist(word_sent)
    keywords[cluster] = nlargest(TOP_KEYWORDS, freq, key=freq.get)
    counts[cluster] = freq

unique_keys = {}
for cluster in range(N_CLUSTERS):
    ### Union over ALL other clusters, not just the first two of them.
    keys_other_clusters = set().union(
        *(keywords[other] for other in range(N_CLUSTERS) if other != cluster)
    )
    unique = set(keywords[cluster]) - keys_other_clusters
    ### sorted() makes the order of equally frequent words reproducible
    unique_keys[cluster] = nlargest(UNIQUE_KEYWORDS, sorted(unique), key=counts[cluster].get)

files['key_words'] = files['class'].map(unique_keys)

## 4-Summarization

In [None]:
def get_context(text, freq_words, n_sentences=6, language='dutch'):
    """Return the sentences of *text* that score highest on *freq_words*.

    A sentence's score is the sum, over its lower-cased tokens, of how often
    each token occurs in *freq_words* (so a word listed twice counts double).
    Sentences without any matching token are never selected.

    Args:
        text: The document to summarise.
        freq_words: Iterable of (lower-case) search words.
        n_sentences: Maximum number of sentences to return.
        language: Punkt model used for sentence and word tokenisation. It
            defaults to 'dutch', matching the tokenisation used when the
            cluster keywords are computed; pass 'english' for NLTK's default.

    Returns:
        Up to *n_sentences* sentences in their original document order, with
        embedded newlines replaced by spaces. Empty list when *text* is empty
        or not a string (e.g. NaN from a missing value).
    """
    if not isinstance(text, str) or not text.strip():
        return []

    sents = sent_tokenize(text, language=language)
    freq = FreqDist(freq_words)
    ranking = defaultdict(int)

    for i, sent in enumerate(sents):
        for w in word_tokenize(sent.lower(), language=language):
            if w in freq:
                ranking[i] += freq[w]

    ### pick the best-scoring sentence indices, then restore document order
    sents_idx = nlargest(n_sentences, ranking, key=ranking.get)
    return [sents[j].replace('\n', ' ') for j in sorted(sents_idx)]

### Summarise every document of cluster 0 with the sentences that best match
### the cluster's search words, print the summaries and export them to Excel.
dic_words = {
            'cluster_0' : ['premie', 'jaarlijkse', 'december', 'betaald', 'betaalbaar', 'collectieve', 'arbeidsovereenkomst', 'betreffende', 'commission', 'paritair', 'bedienden']
            }
search_words = dic_words['cluster_0']

files['row_number'] = np.arange(len(files))

## Get text in a column "summary_nl"
### .copy(): class_0 gets a new column, so it must not be a slice of files.
class_0 = files.loc[files['class'] == 0].copy()
### Summarise only the rows of class 0 instead of every row of files.
class_0['summary_nl'] = class_0['nl_text'].apply(lambda nl_text: get_context(nl_text, search_words))

## Get text as summary example
### Reuse the summaries computed above rather than recomputing them per row.
for context in class_0['summary_nl']:
    print('\n'.join(context))
    print('\n*********************************************\n')

if class_0.empty:
    ### values[0] below would raise IndexError on an empty frame
    print('No documents in class 0; nothing to export.')
else:
    number = class_0['jc_number'].values[0]
    string_number = str(number)
    ### Split on the decimal point instead of fixed character offsets, e.g.
    ### 119.0 -> jc_119_0, without assuming a three-digit integer part.
    whole_part, _, decimal_part = string_number.partition('.')
    class_0.to_excel(f'../demo/jc_{whole_part}_{decimal_part}_class_0.xlsx', engine='xlsxwriter')