# PyMotifs

## Read all txt inside folder : 

In [5]:
import pandas as pd
import os 
import csv

In [6]:
# Loop over : https://stackoverflow.com/questions/69118811/how-to-read-all-txt-files-from-a-directory
def read_txts(path):
    """
    Read all txts inside a folder and put them into a dataframe.
    Folder must contain only txt files...
    """
    files_content = [] # create empty list to save content

    for filename in filter(lambda p: p.endswith("txt"), os.listdir(path)): # filtre les fichiers qui se terminent par txt
      # et liste les fichiers dans le path.
        filepath = os.path.join(path, filename)
        with open(filepath, mode='r') as f:
            files_content += [f.read()]

    print(f'There are {len(files_content)} texts in folder')
    all_files = os.listdir(path=path)
    df = pd.DataFrame()
    df['filename'] = all_files
    df['text'] = files_content
    return(df)

In [7]:
read_txts(path="/Users/adesacy/Desktop/PyMotifs/corpus")

There are 2 texts in folder


Unnamed: 0,filename,text
0,Huysmans.txt,À en juger par les quelques portraits conservé...
1,Balzac.txt,"Madame Vauquer, née de Conflans, est une vieil..."


In [8]:
# Save into object : 
df = read_txts(path="/Users/adesacy/Desktop/PyMotifs/corpus")

There are 2 texts in folder


In [9]:
df

Unnamed: 0,filename,text
0,Huysmans.txt,À en juger par les quelques portraits conservé...
1,Balzac.txt,"Madame Vauquer, née de Conflans, est une vieil..."


## Nettoyage :

In [10]:
# Change apostrophs : 

def clean_a_bit(df):
    """
    Function to clean differnt apostrophs and withdraw possible na
    values from df
    """
    df['text'] = df['text'].replace("’", "'")
    df['text'] = df['text'].replace("'", "'")
    # Retrait des NA dans la colonne mots : 
    df['text'] = df['text'].dropna(how = 'any', axis = 0)# Drop the row 
    
    return(df)

In [11]:
clean_a_bit(df)

Unnamed: 0,filename,text
0,Huysmans.txt,À en juger par les quelques portraits conservé...
1,Balzac.txt,"Madame Vauquer, née de Conflans, est une vieil..."


## Annotation : 

In [12]:
# !pip install spacy-udpipe

In [13]:
import spacy_udpipe

spacy_udpipe.download("fr") # Download french model : 

nlp = spacy_udpipe.load("fr")

Already downloaded a model for the 'fr' language


In [14]:
# Create a Tokenizer with the default settings for French
# including punctuation rules and exceptions

tokenizer = nlp.tokenizer

### Tokénisation

In [15]:
# Création d'une nouvelle dataframe pour accueillir les données de l'étiquetage : 
# On ne veut pas garder le texte intégral dans le nouveau tableau.

annotated_datas = pd.DataFrame()

# récupération de la colonne filename : 

annotated_datas['filename'] = df['filename']

In [16]:
# On tokenise les textes :

annotated_datas['words'] = df['text'].apply(lambda x: nlp.tokenizer(str(x)))
annotated_datas.head(10)

Unnamed: 0,filename,words
0,Huysmans.txt,"(À, en, juger, par, les, quelques, portraits, ..."
1,Balzac.txt,"(Madame, Vauquer, ,, née, de, Conflans, ,, est..."


In [17]:
# "Explostion" des données : un mot par ligne : 
annotated_datas = annotated_datas.explode("words", ignore_index=True)
annotated_datas.head(10)

Unnamed: 0,filename,words
0,Huysmans.txt,À
1,Huysmans.txt,en
2,Huysmans.txt,juger
3,Huysmans.txt,par
4,Huysmans.txt,les
5,Huysmans.txt,quelques
6,Huysmans.txt,portraits
7,Huysmans.txt,conservés
8,Huysmans.txt,à
9,Huysmans.txt,le


### Lemmatisation et étiquetage morphosyntaxique :

In [21]:
## Thx to Ed Rushton :
# Cf. https://stackoverflow.com/questions/44395656/applying-spacy-parser-to-pandas-dataframe-w-multiprocessing

## Spacy is highly optimised and does the multiprocessing for you. 
## As a result, I think your best bet is to take the data out of 
## the Dataframe and pass it to the Spacy pipeline as a list rather 
## than trying to use .apply directly.
## You then need to the collate the results of the parse, and put 
## this back into the Dataframe. 

lemma = []
pos = []
morph = []
dep = []

for doc in nlp.pipe(annotated_datas['words'].astype('unicode').values, batch_size=50):
    if doc.has_annotation:
        #tokens.append([n.text for n in doc])
        lemma.append([n.lemma_ for n in doc])
        pos.append([n.pos_ for n in doc])
        morph.append([n.morph for n in doc])
        dep.append([n.dep_ for n in doc])
    else:
        # We want to make sure that the lists of parsed results have the
        # same number of entries of the original Dataframe, so add some blanks in case the parse fails
        # tokens.append(None)
        lemma.append(None)
        pos.append(None)
        morph.append(None)
        
# corpus_test['tokens'] = tokens
annotated_datas['lemma'] = lemma
annotated_datas['pos'] = pos
annotated_datas['morph'] = morph

In [22]:
# Explosion des données : 
annotated_datas = annotated_datas.explode("words", ignore_index=True)

In [23]:
annotated_datas

Unnamed: 0,filename,words,lemma,pos,morph,dep
0,Huysmans.txt,À,[à],[ADP],[()],[ROOT]
1,Huysmans.txt,en,[en],[ADP],[()],[ROOT]
2,Huysmans.txt,juger,[juger],[VERB],[(VerbForm=Inf)],[ROOT]
3,Huysmans.txt,par,[par],[ADP],[()],[ROOT]
4,Huysmans.txt,les,[le],[DET],"[(Definite=Def, Number=Plur, PronType=Art)]",[ROOT]
...,...,...,...,...,...,...
287,Balzac.txt,une,[un],[DET],"[(Definite=Ind, Gender=Fem, Number=Sing, PronT...",[ROOT]
288,Balzac.txt,bien,[bien],[ADV],[()],[ROOT]
289,Balzac.txt,maigre,[maigre],[ADJ],"[(Gender=Masc, Number=Sing)]",[ROOT]
290,Balzac.txt,pension,[pension],[NOUN],"[(Gender=Fem, Number=Sing)]",[ROOT]


In [27]:
with open('/Users/adesacy/Desktop/PyMotifs/output/corpus_annotated.csv', 'w', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(annotated_datas)