In [1]:
import json
import re
from pathlib import Path
from io import StringIO
from multiprocessing import Pool, cpu_count

import pandas as pd
from tqdm.auto import tqdm

import spacy

# Annotation only: no value is bound to `subfolder` in this cell.
subfolder: Path

# spaCy pipeline loaded from the "fr_core_news_sm" package.
nlp: spacy.language.Language = spacy.load("fr_core_news_sm")
# The ten CoNLL-U column names, in file order.
columns: list[str] = ["ID", "FORM", "LEMMA", "UPOS", "XPOS", "FEATS", "HEAD", "DEPREL", "DEPS", "MISC"]
# Root folder of the treebank files (the WAC sub-folder is read from it below).
ud_dir: Path = Path("UD")
exports_dir: Path = Path("exports")
exports_extended_dir: Path = Path("Xports")
# Only the extended export folder is created here; `exports_dir` is not.
exports_extended_dir.mkdir(exist_ok=True, parents=True)

In [2]:
WAC_dir = ud_dir / "WAC"

# Read every .conllu file of the WAC folder into one text.
# - sorted(): glob order is filesystem-dependent, so sorting keeps `sents`
#   (and therefore `sents[0]`) reproducible across runs and machines.
# - "\n\n".join(): guarantees a blank-line sentence boundary between two files
#   even when a file does not end with one, so the last sentence of a file can
#   never be merged with the first sentence of the next file.
wac_text = "\n\n".join(
    conllu_path.read_text(encoding="utf-8")
    for conllu_path in sorted(WAC_dir.glob("*.conllu"))
)

# One entry per sentence. Everything up to and including "# sent_id = " is
# dropped, so each entry starts with the sentence id on its first line.
# Whitespace-only chunks (extra blank lines) are skipped.
sents = tuple(
    chunk.split("# sent_id = ")[-1]
    for chunk in wac_text.split("\n\n")
    if chunk.strip()
)


In [6]:
# Keep the first sentence chunk as a small sample and display it.
sent = sents[0]
sent

'300000\n# text = Une seule chose pourra toutefois encore être modifiée, la rencontre pourrait commencer à 19h30", affirmait ainsi le patron du Slavia, Vladimir Leska, en décembre 2003.\n1\tUne\tun\tDET\tDET\tDefinite=Ind|Gender=Fem|Number=Sing|PronType=Art\t3\tdet\t3:det\t_\n2\tseule\tseul\tADJ\tADJ\tGender=Fem|Number=Sing\t3\tamod\t3:amod\t_\n3\tchose\tchose\tNOUN\tNOUN\tGender=Fem|Number=Sing\t4\tnsubj\t4:nsubj\t_\n4\tpourra\tpouvoir\tVERB\tVERB\tMood=Ind|Number=Sing|Person=3|Tense=Fut|VerbForm=Fin\t0\troot\t0:root\t_\n5\ttoutefois\ttoutefois\tADV\tADV\t_\t4\tadvmod\t4:advmod\t_\n6\tencore\tencore\tADV\tADV\t_\t4\tadvmod\t4:advmod\t_\n7\têtre\têtre\tAUX\tAUX\tVerbForm=Inf\t8\taux:pass\t8:aux:pass\t_\n8\tmodifiée\tmodifier\tVERB\tVERB\tGender=Fem|Number=Sing|Tense=Past|VerbForm=Part|Voice=Pass\t4\txcomp\t4:xcomp\tSpaceAfter=No\n9\t,\t,\tPUNCT\tPUNCT\t_\t4\tpunct\t4:punct\t_\n10\tla\tle\tDET\tDET\tDefinite=Def|Gender=Fem|Number=Sing|PronType=Art\t11\tdet\t11:det\t_\n11\trencontre\tren

In [19]:
class ConnluLine:
    """One token line of a CoNLL-U sentence, split into its tab-separated fields.

    Every field is stored as a string attribute named after its column
    (``ID``, ``FORM``, ..., ``MISC``). Fields can also be read or written by
    position (``line[1]``) or by name (``line["FORM"]``).

    Instances are hashable (hash of the tab-joined line) but can still be
    modified through attribute assignment or ``line[key] = value``; do not
    modify a line while it is stored in a set or used as a dict key.
    """

    columns: tuple[str, ...] = ("ID", "FORM", "LEMMA", "UPOS", "XPOS", "FEATS", "HEAD", "DEPREL", "DEPS", "MISC")
    len_col = len(columns)

    def __init__(self, line: str):
        """Split ``line`` on tabs and store one attribute per column.

        A trailing line terminator is ignored so it cannot end up in ``MISC``.

        Raises:
            ValueError: if the line does not contain exactly ``len_col`` fields.
        """
        fields = line.rstrip("\r\n").split("\t")
        if len(fields) != self.len_col:
            raise ValueError(
                f"expected {self.len_col} tab-separated fields, got {len(fields)}: {line!r}"
            )
        for col, value in zip(self.columns, fields):
            setattr(self, col, value)

    def __repr__(self):
        return f"""ConnluLine({" ".join(f"{col}: {getattr(self, col)}" for col in self.columns)})"""

    def __str__(self):
        """Return the line in its tab-separated form."""
        return "\t".join(getattr(self, col) for col in self.columns)

    def __eq__(self, other):
        """Two lines are equal when all their columns are equal."""
        # Let Python handle unrelated types (-> False) instead of raising
        # AttributeError on the missing column attributes.
        if not isinstance(other, ConnluLine):
            return NotImplemented
        return all(getattr(self, col) == getattr(other, col) for col in self.columns)

    def __hash__(self):
        # Consistent with __eq__: equal columns give an equal tab-joined string.
        return hash(str(self))

    def __getitem__(self, key):
        """Return a field by column position (int) or attribute name (str)."""
        if isinstance(key, int):
            return getattr(self, self.columns[key])
        elif isinstance(key, str):
            return getattr(self, key)
        else:
            raise TypeError(f"ConnluLine indices must be integers or strings, not {type(key)}")

    def __setitem__(self, key, value):
        """Set a field by column position (int) or attribute name (str)."""
        if isinstance(key, int):
            setattr(self, self.columns[key], value)
        elif isinstance(key, str):
            setattr(self, key, value)
        else:
            raise TypeError(f"ConnluLine indices must be integers or strings, not {type(key)}")

    def __iter__(self):
        """Yield the field values in column order."""
        for col in self.columns:
            yield getattr(self, col)

    def __len__(self):
        """Return the number of columns."""
        return self.len_col
    

class ConnluSent:
    """A CoNLL-U sentence: an identifier plus its parsed token lines.

    The constructor takes the first non-empty, non-comment line of the input
    as the sentence id and parses every following non-comment line as a
    ``ConnluLine``.
    """

    def __init__(self, string: str):
        """Parse ``string`` into ``self.id`` and ``self.lines``.

        Lines are stripped first; empty lines and lines starting with ``#``
        are then ignored.

        Raises:
            IndexError: if ``string`` contains no usable line at all.
        """
        # Strip before filtering so whitespace-only lines and indented
        # comments are skipped instead of being parsed as token lines.
        stripped = (line.strip() for line in string.split("\n"))
        decoupe = [line for line in stripped if line and not line.startswith("#")]
        self.id = decoupe[0]
        self.lines = tuple(ConnluLine(line) for line in decoupe[1:])

    def __repr__(self):
        return f"""ConnluSent({self.id}, {self.lines})"""

    def __str__(self):
        """Return the id followed by one tab-separated line per token."""
        return "\n".join((self.id, *map(str, self.lines)))

    def __eq__(self, other):
        """Two sentences are equal when their ids and token lines are equal."""
        # Let Python handle unrelated types (-> False) instead of raising
        # AttributeError on the missing `id` / `lines` attributes.
        if not isinstance(other, ConnluSent):
            return NotImplemented
        return self.id == other.id and self.lines == other.lines

    def __hash__(self):
        return hash(str(self))

    def __getitem__(self, key):
        """Return the token line at position ``key`` (int), or the tuple of
        that column's value for every token line (str)."""
        if isinstance(key, int):
            return self.lines[key]
        elif isinstance(key, str):
            return tuple(line[key] for line in self.lines)
        else:
            raise TypeError(f"ConnluSent indices must be integers or strings, not {type(key)}")
        
        
# Parse the sample sentence, then collect the FORM column of every token line.
cs = ConnluSent(sent)

forms = cs["FORM"]
print(forms)
        
    
    

('Une', 'seule', 'chose', 'pourra', 'toutefois', 'encore', 'être', 'modifiée', ',', 'la', 'rencontre', 'pourrait', 'commencer', 'à', '19h30', '"', ',', 'affirmait', 'ainsi', 'le', 'patron', 'du', 'Slavia', ',', 'Vladimir', 'Leska', ',', 'en', 'décembre', '2003', '.')


In [12]:
# Parse the first token line of the sample: index 0 is the sentence id and
# index 1 the "# text = ..." comment, so the tokens start at index 2.
ConnluLine(sent.split("\n")[2])

ConnluLine(ID: 1 FORM: Une LEMMA: un UPOS: DET XPOS: DET FEATS: Definite=Ind|Gender=Fem|Number=Sing|PronType=Art HEAD: 3 DEPREL: det DEPS: 3:det MISC: _)

In [13]:
# Raw (unparsed) text of the first token line of the sample sentence.
sent.split("\n")[2]

'1\tUne\tun\tDET\tDET\tDefinite=Ind|Gender=Fem|Number=Sing|PronType=Art\t3\tdet\t3:det\t_'