In [1]:
import json
import re
from pathlib import Path
from io import StringIO
from multiprocessing import Pool, cpu_count

import pandas as pd
from tqdm.auto import tqdm

import spacy

# Bare annotation: records the intended type but does not bind a value here.
subfolder: Path

# Load spaCy's "fr_core_news_sm" pipeline once for the whole notebook.
nlp: spacy.language.Language = spacy.load("fr_core_news_sm")
# The ten field names of a CoNLL-U token line, in column order.
columns: list[str] = ["ID", "FORM", "LEMMA", "UPOS", "XPOS", "FEATS", "HEAD", "DEPREL", "DEPS", "MISC"]
# Directories used by the notebook; all are relative to the working directory.
ud_dir: Path = Path("UD")
exports_dir: Path = Path("exports")
exports_extended_dir: Path = Path("Xports")
# Create the extended-export directory up front; a pre-existing one is fine.
exports_extended_dir.mkdir(exist_ok=True, parents=True)

In [2]:
WAC_dir = ud_dir / "WAC"

# Concatenate every CoNLL-U file found in the WAC folder, then cut the text
# into sentences. The `with` block closes the buffer on exit, so no explicit
# close() call is needed.
with StringIO() as all_txt:
    # glob() yields paths in arbitrary order; sorting keeps `sents` (and thus
    # `sents[0]`, used as the sample sentence below) identical on every run.
    for connlu in sorted(WAC_dir.glob("*.conllu")):
        all_txt.write(connlu.read_text(encoding="utf-8"))

    # Sentences are separated by a blank line. Everything up to and including
    # the "# sent_id = " marker is dropped, so each entry starts with its id.
    sents = tuple(s.split("# sent_id = ")[-1] for s in all_txt.getvalue().split("\n\n") if s != "")


In [3]:
# Keep the first loaded sentence as a working example and display its raw text.
sent = sents[0]
sent

"120000\n# text = Au moment d'écrire ces lignes, M. Soucy mentionnait au Pharillon qu'il était certain que le mouvement de sol allait se poursuivre à cet endroit.\n1\tAu\tau\tADP\tADP\tDefinite=Def|Gender=Masc|Number=Sing|PronType=Art\t2\tcase\t2:case\t_\n2\tmoment\tmoment\tNOUN\tNOUN\tGender=Masc|Number=Sing\t4\tobl:mod\t4:obl:mod\t_\n3\td'\tde\tADP\tADP\t_\t4\tmark\t4:mark\tSpaceAfter=No\n4\técrire\técrire\tVERB\tVERB\tVerbForm=Inf\t10\tadvcl\t10:advcl\t_\n5\tces\tce\tDET\tDET\tNumber=Plur|PronType=Dem\t6\tdet\t6:det\t_\n6\tlignes\tligne\tNOUN\tNOUN\tGender=Fem|Number=Plur\t4\tobj\t4:obj\tSpaceAfter=No\n7\t,\t,\tPUNCT\tPUNCT\t_\t10\tpunct\t10:punct\t_\n8\tM.\tm.\tNOUN\tNOUN\tGender=Masc|Number=Sing\t10\tnsubj\t10:nsubj\t_\n9\tSoucy\tSoucy\tPROPN\tPROPN\t_\t8\tflat:name\t8:flat:name\t_\n10\tmentionnait\tmentionner\tVERB\tVERB\tMood=Ind|Number=Sing|Person=3|Tense=Imp|VerbForm=Fin\t0\troot\t0:root\t_\n11\tau\tau\tADP\tADP\tDefinite=Def|Gender=Masc|Number=Sing|PronType=Art\t12\tcase\t12:

In [4]:
class ConnluLine:
    """One token line of a CoNLL-U sentence.

    The line is split on tabs and each of the ten fields is stored as a string
    attribute named after its column (``ID``, ``FORM``, ..., ``MISC``).
    Fields can be read or written by attribute, by column position
    (``line[1]``) or by column name (``line["FORM"]``).

    Instances are hashable on their tab-joined text. Because fields can be
    reassigned, changing a field after the object has been put in a set or
    used as a dict key changes its hash.
    """

    columns: tuple[str, ...] = ("ID", "FORM", "LEMMA", "UPOS", "XPOS", "FEATS", "HEAD", "DEPREL", "DEPS", "MISC")
    len_col = len(columns)

    def __init__(self, line: str):
        """Parse ``line`` into its ten tab-separated fields.

        Raises:
            ValueError: if ``line`` does not contain exactly ``len_col`` fields.
        """
        fields = line.split("\t")
        if len(fields) != self.len_col:
            # Same exception type as a failed tuple unpacking, but the message
            # shows the offending line.
            raise ValueError(
                f"expected {self.len_col} tab-separated fields, got {len(fields)}: {line!r}"
            )
        for col, value in zip(self.columns, fields):
            setattr(self, col, value)

    def __repr__(self):
        return f"""ConnluLine({" ".join(f"{col}: {getattr(self, col)}" for col in self.columns)})"""

    def __str__(self):
        """Return the line in its tab-separated form."""
        return "\t".join(getattr(self, col) for col in self.columns)

    def __eq__(self, other):
        """Two lines are equal when all ten fields match."""
        if not isinstance(other, ConnluLine):
            # Let Python try the reflected comparison / fall back to False
            # instead of failing with AttributeError on foreign objects.
            return NotImplemented
        return all(getattr(self, col) == getattr(other, col) for col in self.columns)

    def __hash__(self):
        return hash(str(self))

    def __getitem__(self, key):
        """Return a field by column position (int) or attribute name (str).

        Raises:
            TypeError: if ``key`` is neither an int nor a str.
        """
        if isinstance(key, int):
            return getattr(self, self.columns[key])
        elif isinstance(key, str):
            # A string key is passed straight to getattr.
            return getattr(self, key)
        else:
            raise TypeError(f"ConnluLine indices must be integers or strings, not {type(key)}")

    def __setitem__(self, key, value):
        """Set a field by column position (int) or attribute name (str).

        Raises:
            TypeError: if ``key`` is neither an int nor a str.
        """
        if isinstance(key, int):
            setattr(self, self.columns[key], value)
        elif isinstance(key, str):
            setattr(self, key, value)
        else:
            raise TypeError(f"ConnluLine indices must be integers or strings, not {type(key)}")

    def __iter__(self):
        """Yield the field values in column order."""
        for col in self.columns:
            yield getattr(self, col)

    def __len__(self):
        return self.len_col
    

class ConnluSent:
    """A CoNLL-U sentence: an id plus a tuple of parsed token lines.

    The input is expected to start with the sentence id on its own line (the
    text that follows the ``# sent_id = `` marker), followed by comment lines
    (ignored) and token lines.
    """

    def __init__(self, string: str):
        """Parse ``string`` into ``self.id`` and ``self.lines``.

        Raises:
            ValueError: if ``string`` has no content line at all, or if a
                token line cannot be parsed by ``ConnluLine``.
        """
        # Strip before filtering so that whitespace-only lines, stray "\r"
        # characters and indented comments are dropped instead of being handed
        # to ConnluLine as bogus token lines.
        stripped = (line.strip() for line in string.split("\n"))
        decoupe = [line for line in stripped if line != "" and not line.startswith("#")]
        if not decoupe:
            raise ValueError("ConnluSent needs at least a sentence id line, got no content")
        self.id = decoupe[0]
        self.lines = tuple(ConnluLine(line) for line in decoupe[1:])

    def __repr__(self):
        return f"""ConnluSent({self.id}, {self.lines})"""

    def __str__(self):
        """Return the id followed by every token line, one per line."""
        return "\n".join((self.id, *map(str, self.lines)))

    def __eq__(self, other):
        """Two sentences are equal when their id and token lines match."""
        if not isinstance(other, ConnluSent):
            # Avoid AttributeError when compared with an unrelated object.
            return NotImplemented
        return self.id == other.id and self.lines == other.lines

    def __hash__(self):
        return hash(str(self))

    def __getitem__(self, key):
        """Return the token line at position ``key`` (int), or the tuple of
        that field across all token lines (str).

        Raises:
            TypeError: if ``key`` is neither an int nor a str.
        """
        if isinstance(key, int):
            return self.lines[key]
        elif isinstance(key, str):
            return tuple(line[key] for line in self.lines)
        else:
            raise TypeError(f"ConnluSent indices must be integers or strings, not {type(key)}")
        
        
# Parse the sample sentence, then collect the FORM field of every token line.
cs = ConnluSent(sent)

forms = cs["FORM"]
print(forms)
        
    
    

('Au', 'moment', "d'", 'écrire', 'ces', 'lignes', ',', 'M.', 'Soucy', 'mentionnait', 'au', 'Pharillon', "qu'", 'il', 'était', 'certain', 'que', 'le', 'mouvement', 'de', 'sol', 'allait', 'se', 'poursuivre', 'à', 'cet', 'endroit', '.')


In [5]:
# Parse the sample sentence's first token line; index 0 is the sentence id
# and index 1 the "# text = ..." comment, so the tokens start at index 2.
ConnluLine(sent.split("\n")[2])

ConnluLine(ID: 1 FORM: Au LEMMA: au UPOS: ADP XPOS: ADP FEATS: Definite=Def|Gender=Masc|Number=Sing|PronType=Art HEAD: 2 DEPREL: case DEPS: 2:case MISC: _)

In [6]:
# Raw tab-separated text of the sample sentence's first token line (index 2).
sent.split("\n")[2]

'1\tAu\tau\tADP\tADP\tDefinite=Def|Gender=Masc|Number=Sing|PronType=Art\t2\tcase\t2:case\t_'

In [16]:
import pandas as pd

In [17]:
# Load the WAC VERB export. The path is built from `exports_extended_dir`
# ("Xports", relative to the working directory) instead of a machine-specific
# absolute path, so the notebook also runs from another checkout of the project.
df = pd.read_csv(exports_extended_dir / "WAC" / "VERB.csv")


In [18]:
# Display only the FORM, LEMMA and pivot columns of the loaded export.
df[["FORM", "LEMMA", "pivot"]]

Unnamed: 0,FORM,LEMMA,pivot
0,marquait,marquer,marquait
1,critiqué,critiquer,critiqué
2,passerait,passer,passerait
3,appliquant,appliquer,appliquant
4,disposez,disposer,disposez
...,...,...,...
2041676,perdant,perdre,perdant
2041677,étoffé,étoffer,étoffé
2041678,renforcer,renforcer,renforcer
2041679,ouvrira,ouvrir,ouvrira


In [11]:
# List every column name available in the loaded export.
df.columns

Index(['sent_id', 'left_context', 'pivot', 'right_context', 'ID', 'FORM',
       'LEMMA', 'UPOS', 'XPOS', 'FEATS', 'HEAD', 'DEPREL', 'DEPS', 'MISC',
       'dist'],
      dtype='object')