In [1]:
#extraction adjectifs dépendants de pain(s)

In [2]:
# Directory that holds the CoNLL-U corpora; change it here to relocate all three inputs.
CONLLU_DIR = "/home/port-pret-etu01/Documents/LATTICE/conllu"

# One corpus file per period; the file name gives the years covered.
ep1 = f"{CONLLU_DIR}/1810-1840.conllu"
ep2 = f"{CONLLU_DIR}/1841-1913.conllu"
ep3 = f"{CONLLU_DIR}/1914-2009.conllu"

# Processing order defines the epoch number (1, 2, 3) used in the output file names.
docs = [ep1, ep2, ep3]



In [3]:
def _iter_conllu_sentences(file_path):
    """Yield each sentence of a CoNLL-U file as a list of its token lines.

    Blank lines separate sentences; lines starting with ``#`` are dropped.
    The file is streamed line by line instead of being loaded at once.
    """
    sentence = []
    with open(file_path, 'r', encoding='utf-8') as f:
        for raw_line in f:
            line = raw_line.strip()
            if line == "":
                if sentence:
                    yield sentence
                    sentence = []
            elif not line.startswith("#"):
                sentence.append(line)
    # The last sentence may not be followed by a blank line.
    if sentence:
        yield sentence


def _parse_conllu_token(line):
    """Parse one tab-separated token line into a dict, or return None to skip it.

    Skipped: lines with fewer than 8 columns, IDs containing ``-`` or ``.``,
    and IDs that are not integers. A HEAD that is not an integer (e.g. ``_``)
    is stored as ``None`` so the token can still act as a head but is never
    reported as a dependent.
    """
    parts = line.split('\t')
    if len(parts) < 8 or '-' in parts[0] or '.' in parts[0]:
        return None
    try:
        index = int(parts[0])
    except ValueError:
        return None
    try:
        head = int(parts[6])
    except ValueError:
        head = None
    return {
        "index": index,
        "word": parts[1],
        "lemma": parts[2].lower(),
        "upos": parts[3],  # Universal POS
        "xpos": parts[4],  # Language-specific POS (e.g. JJ)
        "head": head,
    }


def extract_dependents(file_path, output_all_path, output_adj_path, target_lemmas):
    """Collect the direct dependents of the target lemmas in a CoNLL-U file.

    For every sentence containing a token whose lower-cased lemma is in
    ``target_lemmas``, each token whose HEAD points at such a token (and whose
    own lemma is not a target) is recorded as ``word<TAB>UPOS/XPOS``.

    Args:
        file_path: Path of the CoNLL-U input file (read as UTF-8).
        output_all_path: File receiving one line per dependent.
        output_adj_path: File receiving only the dependents whose UPOS is
            ``ADJ`` or whose XPOS is ``JJ``.
        target_lemmas: Lemma or iterable of lemmas to look for; compared
            case-insensitively.

    Returns:
        None. Results are written to the two output files, whose parent
        directories are created when missing, and both paths are printed.
    """
    import os

    # A bare string would otherwise be matched by substring / split into characters.
    if isinstance(target_lemmas, str):
        target_lemmas = [target_lemmas]
    # Token lemmas are lower-cased, so the targets have to be as well.
    targets = {lemma.lower() for lemma in target_lemmas}

    all_results = []
    adj_results = []

    for sent in _iter_conllu_sentences(file_path):
        token_list = [
            tok for tok in (_parse_conllu_token(line) for line in sent)
            if tok is not None
        ]

        target_indices = {t["index"] for t in token_list if t["lemma"] in targets}
        if not target_indices:
            continue

        dependents = [
            t for t in token_list
            if t["head"] in target_indices and t["lemma"] not in targets
        ]

        for t in sorted(dependents, key=lambda x: x["index"]):
            word_pos = f"{t['word']}\t{t['upos']}/{t['xpos']}"
            all_results.append(word_pos)
            if t["upos"] == "ADJ" or t["xpos"] == "JJ":
                adj_results.append(word_pos)

    for out_path, results in ((output_all_path, all_results),
                              (output_adj_path, adj_results)):
        parent = os.path.dirname(out_path)
        if parent:
            # Avoid a FileNotFoundError after the whole corpus has been parsed.
            os.makedirs(parent, exist_ok=True)
        with open(out_path, 'w', encoding='utf-8') as f_out:
            f_out.writelines(line + "\n" for line in results)

    print(f"dep saved under: {output_all_path}")
    print(f"adj saved under: {output_adj_path}")


In [4]:
# Destination folder for the per-epoch dependent lists.
output_dir = "/home/port-pret-etu01/Documents/LATTICE/dependencies/dep (what is dep of pain)/adjectifs_s"

# Run the extraction once per corpus; the 1-based position in `docs` is the
# epoch number that ends up in the output file names.
for i, file_path in enumerate(docs, start=1):
    print(f"doc: {file_path}")

    output_all_path = f"{output_dir}/alle_dependenten_ep{i}.txt"
    output_adj_path = f"{output_dir}/nur_adjektive_ep{i}.txt"

    # Singular and plural are both passed as target lemmas.
    extract_dependents(file_path, output_all_path, output_adj_path, {"pain", "pains"})




doc: /home/port-pret-etu01/Documents/LATTICE/conllu/1810-1840.conllu
dep saved under: /home/port-pret-etu01/Documents/LATTICE/dependencies/dep (what is dep of pain)/adjectifs_s/alle_dependenten_ep1.txt
adj saved under: /home/port-pret-etu01/Documents/LATTICE/dependencies/dep (what is dep of pain)/adjectifs_s/nur_adjektive_ep1.txt
doc: /home/port-pret-etu01/Documents/LATTICE/conllu/1841-1913.conllu
dep saved under: /home/port-pret-etu01/Documents/LATTICE/dependencies/dep (what is dep of pain)/adjectifs_s/alle_dependenten_ep2.txt
adj saved under: /home/port-pret-etu01/Documents/LATTICE/dependencies/dep (what is dep of pain)/adjectifs_s/nur_adjektive_ep2.txt
doc: /home/port-pret-etu01/Documents/LATTICE/conllu/1914-2009.conllu
dep saved under: /home/port-pret-etu01/Documents/LATTICE/dependencies/dep (what is dep of pain)/adjectifs_s/alle_dependenten_ep3.txt
adj saved under: /home/port-pret-etu01/Documents/LATTICE/dependencies/dep (what is dep of pain)/adjectifs_s/nur_adjektive_ep3.txt


In [5]:
# Frequency dictionaries (freqdicts) for the adjectives

In [6]:
from collections import Counter

# Folder containing the adjective lists produced by the extraction step.
output_dir = "/home/port-pret-etu01/Documents/LATTICE/dependencies/dep (what is dep of pain)/adjectifs_s"

# One adjective list per epoch, in epoch order.
adj_files = [
    f"{output_dir}/nur_adjektive_ep1.txt",
    f"{output_dir}/nur_adjektive_ep2.txt",
    f"{output_dir}/nur_adjektive_ep3.txt"
]

result_file = f"{output_dir}/adjectifs_freqdict.txt"

with open(result_file, 'w', encoding='utf-8') as out_f:

    for i, adj_file in enumerate(adj_files, start=1):
        with open(adj_file, 'r', encoding='utf-8') as f:
            lines = f.read().splitlines()

        # Drop placeholder entries consisting of a single dash.
        adjectives = list(filter(lambda entry: entry != '-', lines))

        total_adj = len(adjectives)
        freq = Counter(adjectives)
        top_10 = freq.most_common(10)

        # Assemble the section for this epoch, then write it in one go.
        section = [
            f"=== epoque {i} ===\n",
            f"TOTAL adjectifs: {total_adj}\n\n",
            "top 10 adjectifs (adjectif | fréquence | pourcentage)\n",
            "-" * 60 + "\n",
        ]

        for adj, count in top_10:
            # Share of this entry among all adjectives of the epoch.
            if total_adj > 0:
                percent = count / total_adj * 100
            else:
                percent = 0
            section.append(f"{adj:20} | {count:7} | {percent:6.2f}%\n")
        section.append("\n")

        # Full listing, most frequent first.
        section.append("tous les adjectifs:\n")
        section.append("-" * 35 + "\n")
        for adj, count in freq.most_common():
            section.append(f"{adj:20} : {count}\n")
        section.append("\n\n")

        out_f.writelines(section)

print(f"Saved under: {result_file}")


Saved under: /home/port-pret-etu01/Documents/LATTICE/dependencies/dep (what is dep of pain)/adjectifs_s/adjectifs_freqdict.txt
