In [3]:
import collections
from nltk import ngrams

# Binary whose 4-gram frequencies are printed at the end of this cell.
# NOTE(review): the /content/drive prefix looks like a mounted Colab Drive —
# confirm the drive is mounted before running this cell.
file_to_analyze = "/content/drive/MyDrive/python-3.10.0-amd64.exe"

def read_file(file_path):
    """Load a file in binary mode and hand back its raw contents.

    Args:
        file_path: Location of the file to open.

    Returns:
        The complete contents of the file as a ``bytes`` object.
    """
    with open(file_path, "rb") as handle:
        return handle.read()

def byte_sequence_to_Ngrams(byte_sequence, N):
    """Slide a window of length N over a byte sequence and collect each window.

    Args:
        byte_sequence: The sequence to window over; callers in this notebook
            pass the bytes returned by ``read_file``.
        N: Length of every N-gram.

    Returns:
        A list with every N-gram yielded by nltk's ``ngrams``, in order.
    """
    return list(ngrams(byte_sequence, N))

def binary_file_to_Ngram_counts(file, N):
    """Count how often each N-gram occurs in the byte content of a file.

    The N-grams are fed to the counter straight from nltk's lazy ``ngrams``
    iterator. Building an intermediate list first would keep one tuple per
    byte of the file alive at once, which is wasteful for multi-megabyte
    executables and does not change the resulting counts.

    Args:
        file: Path of the file to read (opened in binary mode by ``read_file``).
        N: Length of every N-gram.

    Returns:
        A ``collections.Counter`` mapping each N-gram to its number of
        occurrences in the file.
    """
    return collections.Counter(ngrams(read_file(file), N))

# Count all 4-grams of the sample file, then display the ten most frequent ones.
extracted_Ngrams = binary_file_to_Ngram_counts(file=file_to_analyze, N=4)
print(extracted_Ngrams.most_common(n=10))

[((0, 0, 0, 0), 24290), ((139, 240, 133, 246), 1920), ((32, 116, 111, 32), 1791), ((255, 255, 255, 255), 1671), ((108, 101, 100, 32), 1522), ((100, 32, 116, 111), 1519), ((97, 105, 108, 101), 1513), ((105, 108, 101, 100), 1513), ((70, 97, 105, 108), 1505), ((101, 100, 32, 116), 1503)]


In [14]:
from os import listdir
from os.path import isfile, join
# Folders holding the benign and the malicious PE samples.
directories = [
    "/content/drive/MyDrive/(Lab2)Dataset/Benign PE Samples",
    "/content/drive/MyDrive/(Lab2)Dataset/Malicious PE Samples",
]
N = 2  # N-gram length used for the corpus-wide counts

# Accumulate N-gram counts over every regular file in both folders.
Ngram_counts_all_files = collections.Counter()
for dataset_path in directories:
    # listdir() returns entries in arbitrary order. Sorting fixes the order in
    # which N-grams are first seen, which is what most_common() uses to break
    # ties, so the selection below is reproducible.
    all_samples = sorted(
        f for f in listdir(dataset_path) if isfile(join(dataset_path, f))
    )
    for sample in all_samples:
        file_path = join(dataset_path, sample)
        # update() adds the counts in place; `+=` would additionally rescan the
        # whole accumulated counter after every file to drop non-positive
        # entries, which can never exist here.
        Ngram_counts_all_files.update(binary_file_to_Ngram_counts(file_path, N))

# Keep the K1 most frequent N-grams overall, ordered by descending count.
K1 = 1000
K1_most_frequent_Ngrams = Ngram_counts_all_files.most_common(K1)
K1_most_frequent_Ngrams_list = [ngram for ngram, _count in K1_most_frequent_Ngrams]

def featurize_sample(sample, K1_most_frequent_Ngrams_list, N=None):
    """Turn one file into a vector of counts for the selected N-grams.

    Args:
        sample: Path of the binary file to featurize.
        K1_most_frequent_Ngrams_list: The selected N-grams, one per feature.
            The output follows this order.
        N: N-gram length used when counting the file. When omitted it is
            inferred from the first selected N-gram, so the features cannot
            silently become all zeros because a notebook-level ``N`` was
            reassigned after the N-gram list was built.

    Returns:
        A list of ints with one entry per selected N-gram: the number of times
        that N-gram occurs in the file (0 when it does not occur). An empty
        selection yields an empty list without reading the file.
    """
    if len(K1_most_frequent_Ngrams_list) == 0:
        return []
    if N is None:
        N = len(K1_most_frequent_Ngrams_list[0])
    file_Ngrams = binary_file_to_Ngram_counts(sample, N)
    # Counter lookups return 0 for N-grams that are absent from the file.
    return [file_Ngrams[ngram] for ngram in K1_most_frequent_Ngrams_list]

# Each sample folder paired with its class label (0 = benign, 1 = malicious).
directories_with_labels = [
    ("/content/drive/MyDrive/(Lab2)Dataset/Benign PE Samples", 0),
    ("/content/drive/MyDrive/(Lab2)Dataset/Malicious PE Samples", 1),
]

# Build the feature rows (X) and their labels (y) in lockstep, one per file.
X = []
y = []
for dataset_path, label in directories_with_labels:
    # listdir() returns entries in arbitrary order; sorting keeps the row
    # order of X and y identical from run to run.
    all_samples = sorted(
        f for f in listdir(dataset_path) if isfile(join(dataset_path, f))
    )
    for sample in all_samples:
        file_path = join(dataset_path, sample)
        X.append(featurize_sample(file_path, K1_most_frequent_Ngrams_list))
        y.append(label)

from functools import partial

import numpy as np
from sklearn.feature_selection import SelectKBest, mutual_info_classif, chi2

K2 = 10

# Select by frequency: the columns of X follow K1_most_frequent_Ngrams_list,
# which is ordered by descending corpus count, so the first K2 columns are the
# K2 most frequent N-grams.
X = np.array(X)
X_top_K2_freq = X[:,:K2]
print("N-grams phổ biến: ", X_top_K2_freq)

# Select the N-grams ranked highest by mutual information.
# mutual_info_classif uses random numbers internally, so the seed is pinned to
# make the selected columns reproducible across runs.
mi_selector = SelectKBest(partial(mutual_info_classif, random_state=0), k=K2)
X_top_K2_mi = mi_selector.fit_transform(X, y)
print("N-grams có xếp hạng cao theo thuật toán mutual information: ", X_top_K2_mi)

# Select the N-grams ranked highest by the chi-squared statistic.
chi2_selector = SelectKBest(chi2, k=K2)
X_top_K2_ch2 = chi2_selector.fit_transform(X, y)
print("N-grams có xếp hạng cao theo thuật toán chi squared: ", X_top_K2_ch2)

N-grams phổ biến:  [[  10935    4673       7     610      26       0      13     565     248
       14]
 [  15237    2604     866    1848    1332    1343     630      22      17
      630]
 [   4963     282      88     129     222     573     120       6       0
       67]
 [  36882    1921    6191    6527    2784       0    1435     413    2770
     1319]
 [ 140825   11831       7    1350      25       2       3     527     670
        8]
 [   7159    2219       3    1136      29       0       0     350     949
        5]
 [  18655    1518       3    1176     173       1       1     265    1621
        5]
 [  17320    1851      34     770      43       1       3     295     402
       21]
 [  71210    5507       2     259      10       1       1     210     225
        2]
 [  15222     699       0     186       9       1       0     202     101
        2]
 [  19249     788    2849    2681    2903       1    1366     319     548
      531]
 [  10478     469     732     338     479     