In [1]:
import os
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [2]:
def carregar_ficheiros_MEMM(pasta):
    """Load labelled token sequences from every regular file in ``pasta``.

    Each file contributes at most one sequence. A line is split on
    whitespace and its tokens are consumed from right to left. Tokens that
    do not contain ``"::"`` are ignored; the others are split at the *last*
    ``"::"`` into an ``(observacao, estado)`` pair, so an observation may
    itself contain ``"::"``.

    Args:
        pasta: Path of the directory holding the annotated files.

    Returns:
        A list of sequences, one per file that produced at least one pair,
        ordered by file name. Each sequence is a list of
        ``(observacao, estado)`` string tuples.

    Raises:
        OSError: If ``pasta`` cannot be listed or a file cannot be opened.
    """
    sequencias = []
    # os.listdir() returns entries in arbitrary order; sorting keeps the
    # result reproducible across runs and machines.
    for nome_fich in sorted(os.listdir(pasta)):
        caminho = os.path.join(pasta, nome_fich)
        # Sub-directories (e.g. .ipynb_checkpoints) cannot be opened as text.
        if not os.path.isfile(caminho):
            continue
        seq = []
        with open(caminho, 'r') as f:
            for linha in f:
                # split() without arguments already discards surrounding
                # whitespace and returns [] for blank lines.
                # NOTE(review): tokens are read right-to-left within each
                # line, as in the original code - confirm this is intended.
                for token in reversed(linha.split()):
                    if "::" not in token:
                        continue
                    # Split at the last separator only: the label is the
                    # trailing part, and a plain split("::") would raise
                    # ValueError on tokens holding more than one "::".
                    observacao, estado = token.rsplit("::", 1)
                    seq.append((observacao, estado))
        if seq:
            sequencias.append(seq)
    return sequencias

In [3]:
def extrair_features_MEMM(sequencias):
    """Turn labelled sequences into per-token feature dicts and labels.

    For every ``(observacao, estado)`` pair a feature dict is produced with
    the observation itself and the label of the preceding pair in the same
    sequence; the first pair of each sequence uses ``"START"`` instead.

    Args:
        sequencias: Iterable of sequences, each a list of
            ``(observacao, estado)`` pairs.

    Returns:
        A tuple ``(X, y)`` where ``X`` is the flat list of feature dicts and
        ``y`` the flat list of labels, aligned index by index.
    """
    X, y = [], []
    for seq in sequencias:
        rotulos = [rotulo for _, rotulo in seq]
        # Labels shifted one position to the right: the "previous state"
        # seen by each token, with the START marker in front.
        anteriores = ["START"] + rotulos[:-1]
        X.extend(
            {'observacao': obs, 'estado_anterior': anterior}
            for (obs, _), anterior in zip(seq, anteriores)
        )
        y.extend(rotulos)
    return X, y

In [4]:
# Experiment settings, kept together so they are easy to find and tune.
SEED = 42        # fixes the train/test partition so the report is reproducible
TEST_SIZE = 0.2  # fraction of tokens held out for evaluation
MAX_ITER = 200   # iteration cap for the LogisticRegression solver

pasta_dados = "PHP2IL/hmm/vuln_grupo_alto_last_ANN"

sequencias = carregar_ficheiros_MEMM(pasta_dados)

X, y = extrair_features_MEMM(sequencias)
# Fail early with a clear message instead of a less obvious error from
# train_test_split when the folder holds no usable tokens.
if not X:
    raise ValueError(f"No labelled tokens found in {pasta_dados!r}")

# DictVectorizer one-hot encodes the string-valued features before they are
# fed to the classifier.
pipeline = Pipeline([
    ('vectorizer', DictVectorizer(sparse=False)),
    ('classifier', LogisticRegression(max_iter=MAX_ITER))
])

# NOTE(review): the split is per token, so tokens of one sequence can end up
# in both partitions, and 'estado_anterior' is the gold label at prediction
# time. A per-sequence split would give a stricter evaluation - confirm
# which one is intended.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=SEED
)
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

      ntaint       0.76      0.42      0.54        31
         san       1.00      1.00      1.00        17
       taint       0.92      0.54      0.68       501
         und       0.82      0.98      0.89      1120

    accuracy                           0.84      1669
   macro avg       0.87      0.73      0.78      1669
weighted avg       0.85      0.84      0.82      1669

