In [2]:
import re
import numpy as np

from pprint import pprint
from nltk.tokenize import sent_tokenize, word_tokenize
import os
# The os module provides a portable way of using operating-system-dependent functionality.
# Its functions let you interface with the underlying operating system
# that Python is running on, be that Windows, macOS or Linux.

from os import listdir
from os.path import isfile, join


In [3]:
# Download the NLTK 'stopwords' corpus that the French stop-word list is read from.
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/christanasescu/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [4]:
# French stop words from NLTK, extended with the typographic apostrophe and
# the guillemets so that tokenize() filters those out as well.
stopwords = nltk.corpus.stopwords.words('french')
stopwords.extend(['’', '«', '»'])

In [5]:
def tokenize(text):
    """Split ``text`` into cleaned word tokens.

    The text is tokenized with NLTK's ``word_tokenize``, normalised by
    ``_pre_clean``, and then reduced to the tokens that are longer than two
    characters and do not appear in the module-level ``stopwords`` list.

    Returns the surviving tokens as a list, in their original order.
    """
    cleaned = _pre_clean(word_tokenize(text))
    # NOTE: no lemmatisation is applied to the tokens.
    return [
        word
        for word in cleaned
        if len(word) > 2 and word not in stopwords
    ]

In [6]:
def _pre_clean(list_of_text):
        '''
        preliminary cleaning of the text
        - remove new line character i.e. \n or \r
        - remove tabs i.e. \t
        - remove extra spaces
        '''
        cleaned_list = []
        for text in list_of_text:
            # print("original:", text)
            text = text.replace('\\n', ' ')
            text = text.replace('\\r', ' ')
            text = text.replace('\\t', ' ')
            pattern = re.compile(r'\s+')
            text = re.sub(pattern, ' ', text)
            text = text.strip()
            text = text.lower()
            # check for empty strings
            if text != '' and text is not None:
                cleaned_list.append(text)

        return cleaned_list

In [7]:
# Show the current working directory. An explicit call is used instead of the
# bare `pwd` automagic, which only works while automagic is enabled and no
# variable named `pwd` exists.
os.getcwd()

'/Users/christanasescu'

In [8]:

# Base directory for the corpus path: the kernel's working directory at the
# moment this cell runs.
# NOTE: a later cell calls os.chdir(TEXTS_DIR), so re-running this cell after
# that one captures the corpus folder instead of the original directory.
HOME = os.getcwd()

In [9]:
import string
from string import punctuation

# Folder holding the corpus files.
TEXTS_DIR = HOME + "/Chaire_Altissia_0/"

# Translation table that deletes every ASCII punctuation character.
# NOTE(review): stripping the apostrophe glues elisions together (the counts
# contain 'jai', 'cest', 'quil') - confirm this is intended.
remove_punct_map = dict.fromkeys(map(ord, string.punctuation))

# os.listdir() returns names in arbitrary order; sorting keeps the
# index -> file mapping in `filelabels` (and the positions in `texts_data`)
# stable between runs and machines, which matters because later cells
# address texts by a fixed index.
files = sorted(
    f for f in os.listdir(TEXTS_DIR)
    if os.path.isfile(os.path.join(TEXTS_DIR, f))
)

filelabels = {}     # index -> file name
texts_data = []     # one token list per file, in the order of `files`
tokens_total = []   # every token of the corpus, in reading order

# Kept from the original cell: cells outside this one may rely on the working
# directory being the corpus folder. The loop below no longer depends on it.
os.chdir(TEXTS_DIR)

count = -1  # remains -1 when the folder contains no files

for count, f in enumerate(files):
    filelabels[count] = os.path.basename(f)
    tokens = []
    # Open through the full path so reading works whatever the current
    # working directory is. errors='ignore' silently drops bytes that are
    # not valid UTF-8.
    with open(os.path.join(TEXTS_DIR, f), "r", encoding='utf-8', errors='ignore') as openf:
        for line in openf:
            for sentence in nltk.sent_tokenize(line):
                tokens1 = [item.translate(remove_punct_map)
                           for item in tokenize(sentence)]
                # Tokens made only of punctuation are empty after stripping.
                tokens.extend(x for x in tokens1 if x != "")
    texts_data.append(tokens)
    tokens_total.extend(tokens)

print(filelabels)

{0: 'bar_max_elskamp.txt', 1: 'bergen_lumieres_xiv.txt', 2: 'de_rijcke_ma_defectuosite.txt', 3: 'mer_charles_ducal_trans_pierre_geron_et_al.txt', 4: 'vielle_bruxelles.txt', 5: 'wouters_memorial.txt'}


In [10]:

# Total number of tokens collected from all files.
len(tokens_total)

1950

In [11]:
# Second stop-word pass over the corpus-wide token list.
tokens_total = [tok for tok in tokens_total if not (tok in stopwords)]

In [12]:

# Token count after the second stop-word pass; it matches the earlier count
# in the recorded run because tokenize() already drops stop words.
len(tokens_total)

1950

In [13]:
from collections import Counter

# Frequency of every token across the whole corpus.
Count_total = Counter()
Count_total.update(tokens_total)

In [14]:

# Print every distinct token with its corpus-wide frequency
# (the Counter repr lists the most frequent tokens first).
print(Count_total)

Counter({'bruxelles': 33, 'plus': 16, 'sans': 14, 'comme': 11, 'entre': 10, 'ville': 9, 'yeux': 7, 'vie': 7, 'mer': 7, 'heures': 7, 'parle': 7, 'fait': 7, 'temps': 7, 'encore': 7, 'jour': 6, 'ciel': 6, 'langue': 6, 'leurs': 6, 'chaque': 6, 'toujours': 6, 'brussel': 6, 'monsieur': 5, 'sous': 5, 'amante': 5, 'tous': 5, 'charnière': 5, 'corps': 5, 'jours': 5, 'passe': 5, 'faut': 5, 'non': 5, 'faim': 5, 'dit': 5, 'ying': 4, 'thé': 4, 'doigts': 4, 'feu': 4, 'ceux': 4, 'place': 4, 'mère': 4, 'terre': 4, 'ferme': 4, 'nul': 4, 'chair': 4, 'main': 4, 'présent': 4, 'monde': 4, 'celui': 4, 'toute': 4, 'tête': 4, 'vers': 4, 'jai': 4, 'dont': 4, 'tout': 4, 'rien': 4, 'vend': 3, 'bout': 3, 'bois': 3, 'lune': 3, 'front': 3, 'tandis': 3, 'bien': 3, 'autour': 3, 'amour': 3, 'dieu': 3, 'pierre': 3, 'bouche': 3, 'lumière': 3, 'bras': 3, 'ventre': 3, 'dépose': 3, 'hors': 3, 'jamais': 3, 'contre': 3, 'mains': 3, 'larmes': 3, 'voyelles': 3, 'estce': 3, 'lorsque': 3, 'jardins': 3, 'juste': 3, 'écrit': 3, 'nu

In [15]:

import pyperclip as clip


In [16]:
# Put the text form of the corpus-wide counter on the system clipboard.
clip.copy(str(Count_total))

In [None]:
# Paste with Command+V into a Pages/Word/txt file (or call clip.paste() to print it here, although the list is too long to print comfortably).

In [17]:

# Number of tokens in each text, in `texts_data` order. Iterating over the
# list itself avoids hard-coding how many texts the corpus contains.
for doc_tokens in texts_data:
    print(len(doc_tokens))

105
482
198
89
364
712


In [19]:

# Second stop-word pass, applied to every text. Entries are replaced in place
# and the loop follows the actual number of texts instead of a fixed 6.
for i, doc_tokens in enumerate(texts_data):
    texts_data[i] = [x for x in doc_tokens if x not in stopwords]

In [20]:
# Number of tokens in each text after the second stop-word pass. Iterating
# over the list itself avoids hard-coding how many texts the corpus contains.
for doc_tokens in texts_data:
    print(len(doc_tokens))

105
482
198
89
364
712


In [22]:

# Token frequencies for the text at index 0 only
# ('bar_max_elskamp.txt' according to filelabels in the recorded run).
Count_total_0 = Counter(texts_data[0])

In [23]:
# Print the token frequencies of the text at index 0.
print(Count_total_0)

Counter({'monsieur': 4, 'ying': 4, 'thé': 4, 'vend': 3, 'saké': 2, 'yeux': 2, 'bien': 2, 'madame': 2, 'yiang': 2, 'boutique': 1, 'bout': 1, 'quai': 1, 'assis': 1, 'robe': 1, 'couleur': 1, 'prune': 1, 'comptoir': 1, 'bois': 1, 'lune': 1, 'genseng': 1, 'tresse': 1, 'dos': 1, 'parfumée': 1, 'huile': 1, 'camélia': 1, 'sous': 1, 'front': 1, 'obliques': 1, 'rangées': 1, 'comme': 1, 'clavier': 1, 'blanc': 1, 'pratique': 1, 'sourit': 1, 'montrant': 1, 'dents': 1, 'tandis': 1, 'doigts': 1, 'ongles': 1, 'longs': 1, 'plongent': 1, 'coffrets': 1, 'laque': 1, 'peints': 1, 'dragons': 1, 'serpents': 1, 'enroulés': 1, 'traquent': 1, 'tirer': 1, 'péko': 1, 'souchong': 1, 'hangkai': 1, 'encor': 1, 'hysong': 1, 'selon': 1, 'vert': 1, 'noir': 1, 'agrée': 1, 'client': 1, 'avoir': 1, 'long': 1, 'kimono': 1, 'bleu': 1, 'femme': 1, 'khôl': 1, 'autour': 1, 'disent': 1, 'feu': 1, 'jettent': 1, 'flammes': 1, 'soir': 1, 'ceux': 1, 'navires': 1, 'viennent': 1, 'prendre': 1, 'place': 1, 'tables': 1, 'boire': 1, 'dé

In [24]:

# NOTE(review): in the recorded run texts_data holds six texts (indices 0-5),
# so index 6 raises IndexError and Count_total_6 is never defined.
# Presumably the last text (index 5) was meant - confirm and remove this cell.
Count_total_6 = Counter(texts_data[6])

IndexError: list index out of range

In [25]:

# Token frequencies for the text at index 5 only
# ('wouters_memorial.txt' according to filelabels in the recorded run).
Count_total_5 = Counter(texts_data[5])

In [26]:

# Print the token frequencies of the text at index 5.
print(Count_total_5)

Counter({'plus': 8, 'comme': 7, 'sans': 6, 'faim': 5, 'dit': 5, 'non': 4, 'temps': 4, 'toujours': 4, 'tout': 4, 'fait': 4, 'rien': 4, 'chaque': 4, 'dire': 3, 'institutrice': 3, 'après': 3, 'dont': 3, 'ora': 3, 'labora': 3, 'pascal': 3, 'entre': 3, 'jour': 3, 'vent': 3, 'peu': 3, 'leurs': 3, 'gezelle': 3, 'dune': 3, 'quil': 3, 'celui': 3, 'joie': 3, 'jai': 3, 'lextase': 3, 'fois': 3, 'bel': 2, 'âge': 2, 'jours': 2, 'papier': 2, 'pieds': 2, 'dénude': 2, 'tête': 2, 'poète': 2, 'nul': 2, 'disait': 2, 'lêtre': 2, 'trace': 2, 'front': 2, 'cest': 2, 'faut': 2, 'jusqu': 2, 'nen': 2, 'bonnes': 2, 'beaucoup': 2, 'donc': 2, 'sœurs': 2, 'dieu': 2, 'mauvais': 2, 'jamais': 2, 'loin': 2, 'écrit': 2, 'seigneur': 2, 'livres': 2, 'champs': 2, 'terre': 2, 'toute': 2, 'grand': 2, 'jaimais': 2, 'quen': 2, 'langue': 2, 'doux': 2, 'parler': 2, 'clémence': 2, 'vrai': 2, 'charrue': 2, 'aussi': 2, 'javance': 2, 'main': 2, 'feu': 2, 'quatorze': 2, 'visage': 2, 'tel': 2, 'passe': 2, 'cent': 2, 'souffert': 2, 'cor