## Import Libraries

In [1]:
import sqlite3
import os
import pandas as pd
from langdetect import detect
import spacy
from spacy import displacy
from spacy_langdetect import LanguageDetector
import stanza
from stanza.models.common.doc import Document
from stanza.pipeline.core import Pipeline
from stanza.pipeline.multilingual import MultilingualPipeline
from textblob import TextBlob

stanza.download(lang="multilingual")
stanza.download(lang="nl")
stanza.download(lang="fr")

  from .autonotebook import tqdm as notebook_tqdm
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.4.1.json: 193kB [00:00, 2.68MB/s]                    
2022-12-20 08:53:55 INFO: Downloading default packages for language: multilingual (multilingual) ...
2022-12-20 08:53:55 INFO: File exists: C:\Users\ARCH GLOBAL\stanza_resources\multilingual\default.zip
2022-12-20 08:53:55 INFO: Finished downloading models and saved to C:\Users\ARCH GLOBAL\stanza_resources.
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.4.1.json: 193kB [00:00, ?B/s]                        
2022-12-20 08:53:55 INFO: Downloading default packages for language: nl (Dutch) ...
2022-12-20 08:54:00 INFO: File exists: C:\Users\ARCH GLOBAL\stanza_resources\nl\default.zip
2022-12-20 08:54:07 INFO: Finished downloading models and saved to C:\Users\ARCH GLOBAL\stanza_resources.
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-re

## Open and Connect to Database

In [2]:
database = '../../database'
connect = sqlite3.connect(database)

## Build List of File Paths

In [3]:
path_list = []

for subdir, dir, files in os.walk('../../data'):
    for file in files:
        path = subdir + '/' + file
        path_list.append(path)

## Detect Language of PDF Text Column and Assign to Separate Columns

In [100]:
for path in path_list:   ### loop through each CSV file
    working_data = pd.read_csv(path)
    
    text_dict = {}
    nl_dict = {}
    fr_dict = {}
    idx = 0
    number = working_data.at[0, 'jc_number']

    for text in working_data['pdf_text']:   ### parse text in pdf text column
        lines_text_clean = []
        nl_list = []
        fr_list = []
        lines_text = text.splitlines()   ### split by line
        for line in lines_text:
            if len(line) > 0:   ### filter out empty lines
                lines_text_clean.append(line)
        """Use Stanza Language Detector to detect language by line"""
        stanza_nlp = Pipeline(lang="multilingual", processors="langid", langid_lang_subset=["nl","fr"])
        docs = [Document([], text=text) for text in lines_text_clean]
        stanza_nlp(docs)
        for doc in docs:
            if doc.lang == 'nl':   ### assign dutch lines to list
                nl_list.append(doc.text)
            else:   ### assign non-dutch to another list
                fr_list.append(doc.text)
        nl_dict[working_data.iloc[idx]['filename']] = '\n'.join(nl_list)   ### make dictionary of dutch lines
        fr_dict[working_data.iloc[idx]['filename']] = '\n'.join(fr_list)   ### make dictionary of non-dutch lines
        idx = idx + 1
    
    working_data['nl_text'] = working_data['filename'].map(nl_dict)   ### create new column nl_text
    working_data['fr_text'] = working_data['filename'].map(fr_dict)   ### create new column fr_text
    working_data.to_sql(name='files_parsed', con=connect, if_exists='append')   ### store data in database
    string_number = str(number)
    working_data.to_csv(f'../../data_parsed/jc_{string_number[0:3]}_{string_number[4:]}.csv')   ### save data as csv file

2022-12-20 15:16:22 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.4.1.json: 193kB [00:00, 38.5MB/s]                    
2022-12-20 15:16:22 INFO: Loading these models for language: multilingual ():
| Processor | Package |
-----------------------
| langid    | ud      |

2022-12-20 15:16:22 INFO: Use device: cpu
2022-12-20 15:16:22 INFO: Loading: langid
2022-12-20 15:16:22 INFO: Done loading processors!
2022-12-20 15:16:24 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.4.1.json: 193kB [00:00, 24.1MB/s]               