In [68]:
import csv
import os, glob, re

import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix, lil_matrix, hstack

from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report, precision_recall_fscore_support
from sklearn.model_selection import StratifiedKFold, cross_val_predict
from sklearn.svm import LinearSVC

# Directory names of the corpus layers; files in each are keyed by document id.
TXT="TXT"; TTG="TTG"; FLEMM="FLEMM"; REL="RELH2-TP"; HDT="HDT-TP"; META="METADATA3"


def find_valid_docs(txt_dir, ttg_dir, meta_dir):
    """Return the sorted ids of the documents usable for feature extraction.

    A document id is the base name of a ``*.txt`` file in ``txt_dir``. It is
    kept only if ``<id>.ttg`` exists in ``ttg_dir`` and ``<id>_md.txt`` exists
    in ``meta_dir``. The id ``termlist`` is always skipped.

    Args:
        txt_dir: directory holding the ``<id>.txt`` files.
        ttg_dir: directory holding the ``<id>.ttg`` files.
        meta_dir: directory holding the ``<id>_md.txt`` files.

    Returns:
        list[str]: matching document ids in ascending order.
    """
    found = []
    for path in glob.glob(os.path.join(txt_dir, "*.txt")):
        # splitext removes only the final extension; str.replace would also
        # delete a ".txt" occurring inside the name and yield an id whose
        # files cannot be found afterwards.
        doc = os.path.splitext(os.path.basename(path))[0]
        if doc == "termlist":
            continue
        has_ttg = os.path.exists(os.path.join(ttg_dir, f"{doc}.ttg"))
        has_meta = os.path.exists(os.path.join(meta_dir, f"{doc}_md.txt"))
        if has_ttg and has_meta:
            found.append(doc)
    return sorted(found)


valid = find_valid_docs(TXT, TTG, META)
print("Documents utilisés:",len(valid))


Documents utilisés: 285


In [69]:
# Load every document and collapse each run of whitespace into one space.
# NOTE(review): no encoding is passed, so the platform default applies, while
# the FLEMM and RELH2 readers in this notebook use UTF-8 — confirm the TXT
# files' encoding.
raw = []
for d in valid:
    # The context manager closes the handle immediately instead of leaving
    # one open file per document until garbage collection.
    with open(f"{TXT}/{d}.txt") as fh:
        raw.append(" ".join(fh.read().split()))

# Unigram and bigram counts; terms present in fewer than 2 documents are dropped.
bow = CountVectorizer(min_df=2, ngram_range=(1,2))
X_bow = bow.fit_transform(raw)
print("X_bow =",X_bow.shape)


X_bow = (285, 6528)


In [70]:
# Document x token count matrix built from the .ttg files, assembled directly
# in CSR form: `idx` holds column ids, `data` the per-occurrence counts and
# `ptr` the row boundaries. Repeated column ids within a row are summed when
# the matrix is put into canonical form.
vocab_ttg={}; data=[]; idx=[]; ptr=[0]

for d in valid:
    df = pd.read_csv(
        f"{TTG}/{d}.ttg",
        sep="\t",
        header=None,
        dtype=str,               # keep tokens verbatim (no numeric parsing)
        quoting=csv.QUOTE_NONE,  # a literal '"' token must not open a quoted field
        keep_default_na=False,   # tokens such as "NA" or "null" are real words
    ).astype(str)                # cells padded on short rows still become strings
    tokens = list(df[0]) + list(df[2])  # column 0 + column 2 (form + lemma)
    for t in tokens:
        # setdefault assigns the next free column id on first sight of a token
        j = vocab_ttg.setdefault(t, len(vocab_ttg))
        idx.append(j); data.append(1)
    ptr.append(len(idx))

X_ttg = csr_matrix((data,idx,ptr))
print("X_ttg =",X_ttg.shape)


X_ttg = (285, 5717)


In [71]:
# One space-joined string per document, made of the second tab-separated
# field of every .flemm line that has at least two fields.
# NOTE(review): the recorded run yields only 95 features, which looks small
# for a lemma vocabulary — confirm that field 1 really is the lemma column.
lemmas=[]
for d in valid:
    # The context manager closes each file as soon as it has been read.
    with open(f"{FLEMM}/{d}.flemm",encoding="utf-8") as fh:
        rows = (line.strip().split("\t") for line in fh)
        words = [cols[1] for cols in rows if len(cols)>=2]
    lemmas.append(" ".join(words))

# Counts over those strings; terms present in fewer than 2 documents are dropped.
fv = CountVectorizer(min_df=2)
X_flemm = fv.fit_transform(lemmas)
print("X_flemm =",X_flemm.shape)


X_flemm = (285, 95)


In [72]:
# Binary document x value matrix from the .relH2 files: for each line holding
# a tab, the second field (with "/term" removed and whitespace stripped) gets
# its own column, set to 1 for the documents where it occurs.
hv={}            # value -> column id, in order of first appearance
rel_cells=[]     # (row, column) pairs to switch on

for i,d in enumerate(valid):
    f=f"{REL}/{d}.relH2"
    if not os.path.exists(f):
        continue  # documents without a .relH2 file keep an all-zero row
    with open(f,encoding="utf-8") as fh:
        for line in fh:
            fields=line.split("\t")
            if len(fields)<2:
                continue
            # Index the second field instead of unpacking exactly two, so a
            # line with extra tab-separated fields no longer raises ValueError.
            H=fields[1].replace("/term","").strip()
            rel_cells.append((i,hv.setdefault(H,len(hv))))

# Size the matrix from the observed vocabulary rather than a fixed 4000-column
# buffer, which raised IndexError once more distinct values were seen.
col=len(hv)
X_rel=lil_matrix((len(valid),col))
for i,j in rel_cells:
    X_rel[i,j]=1

X_rel=X_rel.tocsr()
print("X_rel =",X_rel.shape)


X_rel = (285, 7)


In [73]:
# Binary document x marker matrix from the .hdt files. Each pair is
# (substring searched in a line, feature key); columns are allocated in order
# of first match, so only markers actually seen get a column.
# NOTE(review): the recorded run produced a single column — confirm the file
# extension and marker strings match the HDT annotation format.
HDT_MARKERS=(("TIMEX3","TIME"),("DIST","DIST"),("SPEED","SPEED"))

X_hdt=lil_matrix((len(valid),len(HDT_MARKERS)))
hdt_map={};cid=0

for i,d in enumerate(valid):
    f=f"{HDT}/{d}.hdt"    # adjust the extension if the files are .ann/.txt
    if not os.path.exists(f):
        continue  # documents without an .hdt file keep an all-zero row
    # The context manager closes the handle once the file has been scanned.
    with open(f) as fh:
        for line in fh:
            for needle,key in HDT_MARKERS:
                if needle in line:
                    if key not in hdt_map:
                        hdt_map[key]=cid;cid+=1
                    X_hdt[i,hdt_map[key]]=1

# Drop the columns that were never allocated.
X_hdt=X_hdt[:,:cid].tocsr()
print("X_hdt =",X_hdt.shape)


X_hdt = (285, 1)


In [74]:
# Three binary indicators per document, computed on the lower-cased text:
# an animal keyword, a distance in metres, a speed in km/h or m/s.
ani=[];dist=[];speed=[]

r_animal  = r"(rapace|aigle|chauve[- ]?souris|goéland|oiseau|mouette|corbeau)"
# NOTE(review): this also matches the "m" of a speed such as "10 m/s", because
# "/" creates a word boundary — confirm whether that overlap is intended.
r_dist    = r"\b\d{1,4} ?m\b"
r_speed   = r"\b\d{1,3} ?(km/h|m/s)\b"

for d in valid:
    # The context manager closes the handle right after reading.
    with open(f"{TXT}/{d}.txt") as fh:
        txt=fh.read().lower()
    ani.append(int(bool(re.search(r_animal,txt))))
    dist.append(int(bool(re.search(r_dist,txt))))
    speed.append(int(bool(re.search(r_speed,txt))))

# Columns are ordered animal, distance, speed.
X_ani = csr_matrix(np.vstack([ani,dist,speed]).T)
print("X_ani =",X_ani.shape)


X_ani = (285, 3)


In [77]:
from sklearn.feature_extraction.text import TfidfTransformer

# A single transformer instance is refitted on each count matrix in turn, so
# after this cell `tfidf` holds the statistics of the last one (X_ttg).
tfidf = TfidfTransformer(sublinear_tf=True)
X_bow_tfidf, X_flemm_tfidf, X_ttg_tfidf = (
    tfidf.fit_transform(counts) for counts in (X_bow, X_flemm, X_ttg)
)


In [79]:
from sklearn.decomposition import TruncatedSVD

def svd_reduce(X, n=200):
    """Project X onto at most ``n`` truncated-SVD components.

    The number of components is capped at ``X.shape[1] - 1`` and floored at 1,
    so a single-column matrix no longer asks for 0 components.

    Args:
        X: 2-D matrix accepted by ``TruncatedSVD.fit_transform``.
        n: requested number of components.

    Returns:
        The array returned by ``TruncatedSVD(...).fit_transform(X)``, with one
        row per row of X.
    """
    n_comp = max(1, min(n, X.shape[1]-1))   # automatic safety cap, never below 1
    return TruncatedSVD(n_components=n_comp, random_state=0).fit_transform(X)

# Requested component counts per view; svd_reduce lowers a request that
# exceeds (number of columns - 1).
X_svd_bow = svd_reduce(X_bow_tfidf, 250)      # large vocabulary: 250 fits
X_svd_flemm = svd_reduce(X_flemm_tfidf, 80)   # 95 columns recorded: 80 are kept
X_svd_ttg = svd_reduce(X_ttg_tfidf, 120)


In [80]:
# Final dense design matrix: the three SVD projections followed by the three
# sparse indicator blocks, densified, all stacked column-wise.
svd_blocks = [X_svd_bow, X_svd_flemm, X_svd_ttg]
indicator_blocks = [sparse_block.toarray() for sparse_block in (X_rel, X_hdt, X_ani)]
X_boost = np.hstack(svd_blocks + indicator_blocks)

print("X_boost:",X_boost.shape)


X_boost: (285, 461)


In [81]:
# Shuffled, seeded 10-fold stratified split shared by every evaluated model.
cv=StratifiedKFold(n_splits=10,shuffle=True,random_state=0)

def eval(model,name):
    """Cross-validate ``model`` on ``X_boost`` / ``y`` and print its scores.

    Prints a banner with ``name``, the per-class classification report and the
    macro-averaged F-score computed from the out-of-fold predictions.

    Note: the name shadows the built-in ``eval``; it is kept so existing calls
    keep working. ``y`` is read from the notebook namespace and is not defined
    in the cells visible here.
    NOTE(review): X_boost comes from TF-IDF/SVD transforms fitted on all
    documents before the split, so folds are not fully independent — confirm
    this is acceptable.

    Args:
        model: unfitted scikit-learn classifier.
        name: label printed in the banner.
    """
    pred=cross_val_predict(model,X_boost,y,cv=cv)
    print(f"\n======== {name} ========")
    # zero_division=0 scores never-predicted classes as 0, the same value as
    # the default, without emitting the undefined-metric warning.
    print(classification_report(y,pred,digits=3,zero_division=0))
    macro_f=precision_recall_fscore_support(y,pred,average='macro',zero_division=0)[2]
    print("Macro-F =",macro_f)

# Both estimators are seeded so that a Restart & Run All reproduces the same
# scores; the seed matches the one used for the CV split.
eval(LinearSVC(class_weight="balanced",random_state=0),"SVM optimized")
eval(RandomForestClassifier(n_estimators=500,class_weight="balanced",random_state=0),"RF optimized")





                                                                               precision    recall  f1-score   support

                                               CONSÉQUENCES ENVIRONNEMENTALES      0.972     0.955     0.963       110
                                                     CONSÉQUENCES ÉCONOMIQUES      0.516     0.653     0.576        75
                      CONSÉQUENCES ÉCONOMIQUES,CONSÉQUENCES ENVIRONNEMENTALES      0.333     0.231     0.273        13
                               CONSÉQUENCES ÉCONOMIQUES,CONSÉQUENCES SOCIALES      0.462     0.462     0.462        39
CONSÉQUENCES ÉCONOMIQUES,CONSÉQUENCES SOCIALES,CONSÉQUENCES ENVIRONNEMENTALES      0.000     0.000     0.000         6
                                                                     Inconnue      0.412     0.333     0.368        42

                                                                     accuracy                          0.663       285
                                             

  _warn_prf(average, modifier, msg_start, len(result))



                                                                               precision    recall  f1-score   support

                                               CONSÉQUENCES ENVIRONNEMENTALES      0.972     0.964     0.968       110
                                                     CONSÉQUENCES ÉCONOMIQUES      0.437     0.973     0.603        75
                      CONSÉQUENCES ÉCONOMIQUES,CONSÉQUENCES ENVIRONNEMENTALES      0.000     0.000     0.000        13
                               CONSÉQUENCES ÉCONOMIQUES,CONSÉQUENCES SOCIALES      0.556     0.128     0.208        39
CONSÉQUENCES ÉCONOMIQUES,CONSÉQUENCES SOCIALES,CONSÉQUENCES ENVIRONNEMENTALES      0.000     0.000     0.000         6
                                                                     Inconnue      0.000     0.000     0.000        42

                                                                     accuracy                          0.646       285
                                             

  _warn_prf(average, modifier, msg_start, len(result))
