In [13]:
%pip install hmmlearn

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.0 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [14]:
import os
from hmmlearn import hmm
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
import random

In [15]:
def load_annotated_sequences_from_paths(filepaths):
    sequences = []
    for filepath in filepaths:
        with open(filepath, 'r', encoding='utf-8', errors='ignore') as f:
            sequence = []
            for line in f:
                tokens = line.strip().split()
                sequence.extend(tokens[::-1])
            sequences.append(sequence)
    return sequences

In [16]:
# Apenas para obter os ficheiros .il de uma pasta (path relativo)
def get_il_files(pasta):
    return [os.path.join(pasta, f) for f in os.listdir(pasta) if f.endswith(".il") and os.path.isfile(os.path.join(pasta, f))]

In [17]:
# Das pastas que tem todos os .il
#files_vuln = get_il_files("PHP2IL/data/similarity_data/vuln")
#files_nao_vuln = get_il_files("PHP2IL/data/similarity_data/nvuln")

# Das pastas divididas por similaridade
files_vuln = get_il_files("PHP2IL/hmm/vuln_grupo_alto")
files_nao_vuln = get_il_files("PHP2IL/hmm/nvuln_grupo_alto")

In [18]:
# Balancear os ficheiros vuln e nvuln
min_len = min(len(files_vuln), len(files_nao_vuln))
random.seed(42)
random.shuffle(files_vuln)
random.shuffle(files_nao_vuln)

files_vuln = files_vuln[:min_len]
files_nao_vuln = files_nao_vuln[:min_len]

In [19]:
# Dividir 80/20 para treino/teste
train_vuln, test_vuln = train_test_split(files_vuln, test_size=0.2, random_state=42)
train_nvuln, test_nvuln = train_test_split(files_nao_vuln, test_size=0.2, random_state=42)

In [20]:
# Ler os ficheiros e obter todos os tokens
train_seqs_vuln = load_annotated_sequences_from_paths(train_vuln)
train_seqs_nvuln = load_annotated_sequences_from_paths(train_nvuln)

all_sequences = train_seqs_vuln + train_seqs_nvuln
all_tokens = [token for seq in all_sequences for token in seq]

In [21]:
# Encoder porque o modelo apenas suporta números
encoder = LabelEncoder()
encoder.fit(all_tokens)

def encode_sequences(sequences):
    return [encoder.transform(seq).reshape(-1, 1) for seq in sequences]

X_vuln = encode_sequences(train_seqs_vuln)
X_nao_vuln = encode_sequences(train_seqs_nvuln)

In [22]:

def treinar_hmm(sequences, n_states=2, alpha=0.01): # alpha is the smoothing parameter
    X_concat = np.concatenate(sequences)
    lengths = [len(seq) for seq in sequences]

    model = hmm.CategoricalHMM(n_components=n_states, n_iter=100, random_state=42)
    model.fit(X_concat, lengths)

    
    n_observations = len(encoder.classes_)
    model.emissionprob_ = (model.emissionprob_ + alpha) / (1.0 + alpha * n_observations)

    # model.transmat_ = (model.transmat_ + alpha) / (1.0 + alpha * model.n_components)

    return model


modelo_vuln = treinar_hmm(X_vuln, alpha=0.01)
modelo_nao_vuln = treinar_hmm(X_nao_vuln, alpha=0.01)

def prever_sql_injection(filepath):
    with open(filepath, 'r') as f:
        tokens = [tok for line in f for tok in line.strip().split()][::-1]
    seq = encoder.transform(tokens).reshape(-1, 1)

    score_vuln = modelo_vuln.score(seq)
    score_nao_vuln = modelo_nao_vuln.score(seq)

    # Convert log-likelihoods to probabilities
    prob_vuln = np.exp(score_vuln)
    prob_nao_vuln = np.exp(score_nao_vuln)

    print(f"Log-likelihoods: Non-Vulnerable={score_nao_vuln}, Vulnerable={score_vuln}")
    print(f"Probabilities: Non-Vulnerable={prob_nao_vuln}, Vulnerable={prob_vuln}")

    return "vulnerável" if score_vuln > score_nao_vuln else "não vulnerável"


In [23]:
y_true = ['vulnerável'] * len(test_vuln) + ['não vulnerável'] * len(test_nvuln)
y_pred = [prever_sql_injection(f) for f in test_vuln + test_nvuln]

print("Classification Report:\n")
print(classification_report(y_true, y_pred, digits=5))

print("\nConfusion Matrix:")
print(confusion_matrix(y_true, y_pred))

Log-likelihoods: Non-Vulnerable=-46.13109605759768, Vulnerable=-44.98984305742532
Probabilities: Non-Vulnerable=9.236757418013681e-21, Vulnerable=2.8917411722590035e-20
Log-likelihoods: Non-Vulnerable=-45.84731305090672, Vulnerable=-44.751123937924326
Probabilities: Non-Vulnerable=1.226775041886554e-20, Vulnerable=3.6714178476079e-20
Log-likelihoods: Non-Vulnerable=-46.587731879711555, Vulnerable=-45.41789336990819
Probabilities: Non-Vulnerable=5.850663497353433e-21, Vulnerable=1.8847750746226723e-20
Log-likelihoods: Non-Vulnerable=-49.988243143835355, Vulnerable=-48.75941641370274
Probabilities: Non-Vulnerable=1.9515597058753997e-22, Vulnerable=6.668904690774915e-22
Log-likelihoods: Non-Vulnerable=-58.10005886896908, Vulnerable=-56.62769722350532
Probabilities: Non-Vulnerable=5.854166025344232e-26, Vulnerable=2.5521345854717907e-25
Log-likelihoods: Non-Vulnerable=-58.10005886896908, Vulnerable=-56.62769722350532
Probabilities: Non-Vulnerable=5.854166025344232e-26, Vulnerable=2.5521345

In [24]:
# Matrizes do modelo vulneravel
print("\nHMM Vulnerável Transition Matrix:")
print(modelo_vuln.transmat_)
print("\nHMM Vulnerável Emission Matrix:")
print(modelo_vuln.emissionprob_)
print("\nHMM Vulnerável Start Probabilities:")
print(modelo_vuln.startprob_)


HMM Vulnerável Transition Matrix:
[[9.99998076e-01 1.92440489e-06]
 [9.99999877e-01 1.22713627e-07]]

HMM Vulnerável Emission Matrix:
[[0.0100764  0.00793651 0.00793651 0.0414026  0.00793651 0.07486869
  0.01027135 0.01143877 0.01027135 0.00793651 0.01027135 0.0414026
  0.0414026  0.0414026  0.0414026  0.0414026  0.00910393 0.0414026
  0.01027135 0.00793651 0.0414026  0.00793651 0.01474647 0.01182791
  0.01066049 0.43735207]
 [0.10022637 0.17404218 0.09560339 0.00793651 0.05869102 0.00793651
  0.00793651 0.00793651 0.00793651 0.06330507 0.00793651 0.00793651
  0.00793651 0.00793651 0.00793651 0.00793651 0.00793651 0.00793651
  0.00793651 0.06330507 0.00793651 0.00793651 0.00793651 0.00793651
  0.00793651 0.29403325]]

HMM Vulnerável Start Probabilities:
[5.84421177e-09 9.99999994e-01]


In [25]:
# Matrizes do modelo não vulneravel
print("\nHMM Não Vulnerável Transition Matrix:")
print(modelo_nao_vuln.transmat_)
print("\nHMM Não Vulnerável Emission Matrix:")
print(modelo_nao_vuln.emissionprob_)
print("\nHMM Não Vulnerável Start Probabilities:")
print(modelo_nao_vuln.startprob_)


HMM Não Vulnerável Transition Matrix:
[[9.99999593e-01 4.07215050e-07]
 [9.99999953e-01 4.66320487e-08]]

HMM Não Vulnerável Emission Matrix:
[[0.01054085 0.00793651 0.00793651 0.03780043 0.00793651 0.06766435
  0.01036729 0.01088817 0.01036729 0.00793651 0.00880465 0.03780043
  0.03780043 0.03780043 0.03780043 0.03780043 0.03780043 0.03780043
  0.00880465 0.00793651 0.03780043 0.00793651 0.02825092 0.01158268
  0.01088817 0.43401805]
 [0.0771514  0.16481933 0.09099212 0.00793651 0.07714952 0.00793651
  0.00793651 0.00793651 0.00793651 0.07253532 0.00793651 0.00793651
  0.00793651 0.00793651 0.00793651 0.00793651 0.00793651 0.00793651
  0.00793651 0.03100751 0.00793651 0.08637792 0.00793651 0.00793651
  0.00793651 0.25710975]]

HMM Não Vulnerável Start Probabilities:
[6.84336049e-09 9.99999993e-01]
