In [3]:
import pefile
import os
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt
import numpy as np


In [None]:
def extract_features(file_path):
    """Read the COFF file header and optional header fields of a PE file.

    Parameters
    ----------
    file_path : str
        Path to the candidate PE file (.exe / .dll).

    Returns
    -------
    dict or None
        Mapping of header field name to its integer value, in a fixed key
        order. ``None`` is returned (after printing the error) when the file
        cannot be parsed.

    Notes
    -----
    * PE32+ (64-bit) images have no ``BaseOfData`` field in their optional
      header; it is reported as 0 so that 64-bit samples are kept instead of
      being discarded by the error handler.
    * Only headers are needed, so the file is opened with ``fast_load=True``
      to skip parsing of the data directories.
    * The ``PE`` object is always closed to release the underlying file
      mapping.
    """
    pe = None
    try:
        pe = pefile.PE(file_path, fast_load=True)
        file_header = pe.FILE_HEADER
        optional_header = pe.OPTIONAL_HEADER
        return {
            "Machine": file_header.Machine,
            "SizeOfOptionalHeader": file_header.SizeOfOptionalHeader,
            "Characteristics": file_header.Characteristics,
            "Magic": optional_header.Magic,
            "MajorLinkerVersion": optional_header.MajorLinkerVersion,
            "MinorLinkerVersion": optional_header.MinorLinkerVersion,
            "SizeOfCode": optional_header.SizeOfCode,
            "SizeOfInitializedData": optional_header.SizeOfInitializedData,
            "SizeOfUninitializedData": optional_header.SizeOfUninitializedData,
            "AddressOfEntryPoint": optional_header.AddressOfEntryPoint,
            "BaseOfCode": optional_header.BaseOfCode,
            # Absent in the PE32+ optional header; default to 0 for 64-bit images.
            "BaseOfData": getattr(optional_header, "BaseOfData", 0),
            "ImageBase": optional_header.ImageBase,
            "SectionAlignment": optional_header.SectionAlignment,
            "FileAlignment": optional_header.FileAlignment,
            "MajorOperatingSystemVersion": optional_header.MajorOperatingSystemVersion,
            "MinorOperatingSystemVersion": optional_header.MinorOperatingSystemVersion,
            "MajorImageVersion": optional_header.MajorImageVersion,
            "MinorImageVersion": optional_header.MinorImageVersion,
            "MajorSubsystemVersion": optional_header.MajorSubsystemVersion,
            "MinorSubsystemVersion": optional_header.MinorSubsystemVersion,
            "SizeOfImage": optional_header.SizeOfImage,
            "SizeOfHeaders": optional_header.SizeOfHeaders,
            "CheckSum": optional_header.CheckSum,
            "Subsystem": optional_header.Subsystem,
            "DllCharacteristics": optional_header.DllCharacteristics,
            "SizeOfStackReserve": optional_header.SizeOfStackReserve,
            "SizeOfStackCommit": optional_header.SizeOfStackCommit,
            "SizeOfHeapReserve": optional_header.SizeOfHeapReserve,
            "SizeOfHeapCommit": optional_header.SizeOfHeapCommit,
            "LoaderFlags": optional_header.LoaderFlags,
            "NumberOfRvaAndSizes": optional_header.NumberOfRvaAndSizes,
        }
    except Exception as e:
        # Best-effort extraction: report the problem and let the caller skip the file.
        print(f"Error processing file {file_path}: {str(e)}")
        return None
    finally:
        # pe stays None when the constructor itself failed.
        if pe is not None:
            pe.close()

In [None]:
# Build the feature table: one row per parsable PE file found in MALWARE_DIR.
MALWARE_DIR = 'MALWARE'
PE_EXTENSIONS = ('.exe', '.dll')

data = []
sample_names = []
# Sort the listing: os.listdir order is arbitrary, and row order influences
# the (seeded) KMeans result, so a fixed order keeps runs reproducible.
for file in sorted(os.listdir(MALWARE_DIR)):
    file_path = os.path.join(MALWARE_DIR, file)
    # Compare the extension case-insensitively so SAMPLE.EXE / Lib.DLL are not skipped.
    if not (os.path.isfile(file_path) and file.lower().endswith(PE_EXTENSIONS)):
        continue
    features = extract_features(file_path)
    if features:
        data.append(features)
        sample_names.append(file)

# Index rows by file name so cluster assignments can be traced back to samples;
# the columns stay purely numeric.
df = pd.DataFrame(data, index=pd.Index(sample_names, name='File'))

In [None]:
# Standardize the numeric header features (zero mean, unit variance) for KMeans.
scaler = StandardScaler()
# Select numeric columns only, and exclude a 'Cluster' column left over from a
# previous run of the clustering cell so labels never leak back in as a feature.
feature_frame = df.select_dtypes(include=[np.number]).drop(columns=['Cluster'], errors='ignore')
data_scaled = scaler.fit_transform(feature_frame)

In [None]:
# Candidate cluster counts: 2..10, capped at n_samples - 1 because KMeans needs
# n_clusters <= n_samples and silhouette_score needs at most n_samples - 1 labels.
n_samples = data_scaled.shape[0]
k_values = range(2, min(10, n_samples - 1) + 1)
if len(k_values) == 0:
    raise ValueError(
        f"Need at least 3 samples to compare cluster counts, got {n_samples}"
    )

inertias = []
silhouettes = []

for k in k_values:  # Try each candidate value of k
    kmeans = KMeans(n_clusters=k, random_state=42)
    kmeans.fit(data_scaled)
    inertias.append(kmeans.inertia_)
    silhouettes.append(silhouette_score(data_scaled, kmeans.labels_))

# Elbow method plot
fig, ax = plt.subplots(figsize=(8, 4))
ax.plot(k_values, inertias, 'bo-')
ax.set_xlabel('Número de clusters, k')
ax.set_ylabel('Inercia')
ax.set_title('Método del Codo')
plt.show()

# Silhouette coefficient plot
fig, ax = plt.subplots(figsize=(8, 4))
ax.plot(k_values, silhouettes, 'bo-')
ax.set_xlabel('Número de clusters, k')
ax.set_ylabel('Coeficiente de Silhouette')
ax.set_title('Coeficiente de Silhouette por k')
plt.show()

In [None]:
# Final model: 5 is assumed to be the best k according to the previous analysis.
optimal_k = 5
kmeans = KMeans(random_state=42, n_clusters=optimal_k)
kmeans.fit(data_scaled)
# Attach each sample's cluster label to its row of header features.
df['Cluster'] = kmeans.labels_

df.head()