In [1]:
import os, spacy

nlp = spacy.load("fr_core_news_sm")
dossier = "CorpusARIA/TXT"

In [2]:
#etiquetage morpho syntaxique et lemmatisation

def analyse_fichier(path):
    text = open(path, encoding="utf-8").read()
    doc = nlp(text)
    infos = [(t.text, t.lemma_, t.pos_) for t in doc if not t.is_punct]
    return infos

if __name__ == "__main__":
    total_lemmes = set()
    fichiers = os.listdir(dossier)
    n_fichiers = len(fichiers)

    for f in fichiers:
        path = os.path.join(dossier, f)
        infos = analyse_fichier(path)
        lemmes = {lem.lower() for (_, lem, pos) in infos if lem.isalpha()}
        total_lemmes.update(lemmes)
        print(f"{f}: {len(lemmes)} lemmes distincts")

    print(f"\nTaille totale du vocabulaire (lemmes uniques) : {len(total_lemmes)}")


event-200.txt: 48 lemmes distincts
event-117.txt: 60 lemmes distincts
event-76.txt: 36 lemmes distincts
event-207.txt: 25 lemmes distincts
event-110.txt: 111 lemmes distincts
event-71.txt: 144 lemmes distincts
event-275.txt: 36 lemmes distincts
event-162.txt: 66 lemmes distincts
event-209.txt: 49 lemmes distincts
event-119.txt: 94 lemmes distincts
event-78.txt: 74 lemmes distincts
event-272.txt: 38 lemmes distincts
event-165.txt: 48 lemmes distincts
event-31.txt: 124 lemmes distincts
event-150.txt: 73 lemmes distincts
event-181.txt: 47 lemmes distincts
event-247.txt: 43 lemmes distincts
event-36.txt: 62 lemmes distincts
event-157.txt: 55 lemmes distincts
event-186.txt: 33 lemmes distincts
event-240.txt: 61 lemmes distincts
event-38.txt: 77 lemmes distincts
event-159.txt: 88 lemmes distincts
event-188.txt: 34 lemmes distincts
event-44.txt: 185 lemmes distincts
event-125.txt: 81 lemmes distincts
event-95.txt: 161 lemmes distincts
event-232.txt: 55 lemmes distincts
event-43.txt: 76 lemmes

event-170.txt: 50 lemmes distincts
event-267.txt: 55 lemmes distincts
event-16.txt: 125 lemmes distincts
event-177.txt: 73 lemmes distincts
event-260.txt: 51 lemmes distincts
event-64.txt: 161 lemmes distincts
event-105.txt: 51 lemmes distincts
event-212.txt: 35 lemmes distincts
event-18.txt: 94 lemmes distincts
event-179.txt: 65 lemmes distincts
event-269.txt: 42 lemmes distincts
event-63.txt: 125 lemmes distincts
event-102.txt: 78 lemmes distincts
event-215.txt: 66 lemmes distincts
event-278.txt: 56 lemmes distincts
event-204.txt: 44 lemmes distincts
event-72.txt: 98 lemmes distincts
event-113.txt: 72 lemmes distincts
event-203.txt: 56 lemmes distincts
event-75.txt: 31 lemmes distincts
event-114.txt: 56 lemmes distincts
event-168.txt: 50 lemmes distincts
event-271.txt: 33 lemmes distincts
event-166.txt: 46 lemmes distincts
event-276.txt: 41 lemmes distincts
event-161.txt: 57 lemmes distincts
event-154.txt: 67 lemmes distincts
event-35.txt: 203 lemmes distincts
event-243.txt: 86 lemme

In [3]:
#10 premiers mots analysés d'un fichier

path = os.path.join(dossier, fichiers[0])
for mot, lem, pos in analyse_fichier(path)[:10]:
    print(f"{mot:<15} → {lem:<15} ({pos})")


Impact          → impact          (NOUN)
de              → de              (ADP)
la              → le              (DET)
foudre          → foudre          (VERB)
sur             → sur             (ADP)
une             → un              (DET)
éolienne        → éolienne        (NOUN)

               → 
               (SPACE)
Dans            → dans            (ADP)
la              → le              (DET)


In [None]:
#extraction des lemmes pour chaque texte (pour preparer la classification)

texts_lemmatized = []
for f in fichiers:
    path = os.path.join(dossier, f)
    text = open(path, encoding="utf-8").read()
    doc = nlp(text)
    lemmes = [t.lemma_.lower() for t in doc if t.is_alpha]
    texts_lemmatized.append(" ".join(lemmes))

print(f"{len(texts_lemmatized)} textes lemmatisés.")


In [4]:
#on load les labels 

def lire_consequence_gros_grain(path_meta):
    with open(path_meta, encoding="utf-8") as f:
        ligne = f.readline().strip()
        champs = ligne.split(";")
        if len(champs) >= 16:
            return champs[15].strip()
        return None

labels = []
meta_dir = "CorpusARIA/METADATA3"

for meta in os.listdir(meta_dir):
    path_meta = os.path.join(meta_dir, meta)
    label = lire_consequence_gros_grain(path_meta)
    if label:
        labels.append(label)

print(f"{len(labels)} étiquettes chargées.")


285 étiquettes chargées.


In [6]:
dossier_flemm = "CorpusARIA/FLEMM"
texts_lemmatized = []

fichiers = sorted(os.listdir(dossier_flemm))  # même ordre que dans TXT

for f in fichiers:
    if f.endswith(".flemm"):
        path = os.path.join(dossier_flemm, f)
        with open(path, encoding="utf-8") as file:
            text = file.read()
            texts_lemmatized.append(text)

print(f"{len(texts_lemmatized)} textes lemmatisés chargés depuis Flemm.")


285 textes lemmatisés chargés depuis Flemm.


In [29]:
#vectorization

from scipy import sparse
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = CountVectorizer(ngram_range=(1,2), max_features=5000)
#vect = TfidfVectorizer(ngram_range=(1,2), max_features=5000)
X = vectorizer.fit_transform(texts_lemmatized)


print("Dimensions :", X.shape)
print("Type :", type(X))
print("matrice creuse ?", sparse.issparse(X))



Dimensions : (285, 5000)
Type : <class 'scipy.sparse._csr.csr_matrix'>
matrice creuse ? True


In [30]:
# validation croisée 

from sklearn.model_selection import StratifiedKFold, cross_val_score

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(vectorizer, X, labels, cv=cv, scoring='f1_macro')
print("F1-macro (5-fold) : {:.2f} ± {:.2f}".format(scores.mean(), scores.std()))


F1-macro (5-fold) : nan ± nan


Traceback (most recent call last):
  File "/usr/lib/python3/dist-packages/sklearn/model_selection/_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/lib/python3/dist-packages/sklearn/feature_extraction/text.py", line 1165, in fit
    self.fit_transform(raw_documents)
  File "/usr/lib/python3/dist-packages/sklearn/feature_extraction/text.py", line 1198, in fit_transform
    vocabulary, X = self._count_vocab(raw_documents,
  File "/usr/lib/python3/dist-packages/sklearn/feature_extraction/text.py", line 1110, in _count_vocab
    for feature in analyze(doc):
  File "/usr/lib/python3/dist-packages/sklearn/feature_extraction/text.py", line 104, in _analyze
    doc = preprocessor(doc)
  File "/usr/lib/python3/dist-packages/sklearn/feature_extraction/text.py", line 69, in _preprocess
    doc = doc.lower()
  File "/usr/lib/python3/dist-packages/scipy/sparse/_base.py", line 761, in __getattr__
    raise AttributeError(attr + " not found")


In [31]:
#classficiation

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

X_train, X_test, y_train, y_test = train_test_split(X, labels, test_size=0.2, random_state=42)

clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))


                                                                               precision    recall  f1-score   support

                                               CONSÉQUENCES ENVIRONNEMENTALES       0.36      0.30      0.33        27
                                                     CONSÉQUENCES ÉCONOMIQUES       0.18      0.29      0.22        14
                      CONSÉQUENCES ÉCONOMIQUES,CONSÉQUENCES ENVIRONNEMENTALES       0.00      0.00      0.00         1
                               CONSÉQUENCES ÉCONOMIQUES,CONSÉQUENCES SOCIALES       0.60      0.33      0.43         9
CONSÉQUENCES ÉCONOMIQUES,CONSÉQUENCES SOCIALES,CONSÉQUENCES ENVIRONNEMENTALES       0.00      0.00      0.00         0
                                                                     Inconnue       0.00      0.00      0.00         6

                                                                     accuracy                           0.26        57
                                              

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
