In [1]:
pip install pandas numpy scikit-learn nltk hpelm


Collecting hpelm
  Downloading hpelm-1.0.10-py3-none-any.whl.metadata (2.9 kB)
Collecting fasteners (from hpelm)
  Downloading fasteners-0.19-py3-none-any.whl.metadata (4.9 kB)
Collecting nose (from hpelm)
  Downloading nose-1.3.7-py3-none-any.whl.metadata (1.7 kB)
Downloading hpelm-1.0.10-py3-none-any.whl (50 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.0/50.0 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fasteners-0.19-py3-none-any.whl (18 kB)
Downloading nose-1.3.7-py3-none-any.whl (154 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.7/154.7 kB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: nose, fasteners, hpelm
Successfully installed fasteners-0.19 hpelm-1.0.10 nose-1.3.7


In [2]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.model_selection import StratifiedKFold
from hpelm import ELM
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
import nltk
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.decomposition import TruncatedSVD
from scipy.sparse import hstack, csr_matrix

# Download NLTK data
nltk.download('punkt')

# Load the dataset
file_path = '/content/PDFMalware2022.csv'
df = pd.read_csv(file_path)

# Handle missing values
# Fill missing categorical values with the mode value of each column
categorical_cols = ['File name', 'text', 'header']
df[categorical_cols] = df[categorical_cols].fillna(df[categorical_cols].mode().iloc[0])

# Check for and handle missing values in the target column
if df['Class'].isnull().any():
    print("Found NaN values in the target column. Filling with mode value.")
    df['Class'] = df['Class'].fillna(df['Class'].mode()[0])

# Encode target variable
df['Class'] = df['Class'].replace({1: 1, 2: 2})

# Tokenization and Hashing Vectorizer for text columns
text_cols = ['File name', 'text', 'header']
hashing_vectorizer = HashingVectorizer(n_features=2**16, alternate_sign=False)

# Apply tokenization and vectorization
text_features = [hashing_vectorizer.transform(df[col].apply(lambda x: ' '.join(word_tokenize(x)))) for col in text_cols]

# Combine text features into a single sparse matrix
text_features_combined = hstack(text_features)

# Separate features and target
y = df['Class'].values

# Combine text features (no numerical features in this dataset)
X = text_features_combined

# Reduce dimensionality with Truncated SVD
svd = TruncatedSVD(n_components=100, random_state=42)
X_reduced = svd.fit_transform(X)

# Adjust labels for one-hot encoding
y_adjusted = y - 1

# One-hot encode the target labels
one_hot_encoder = OneHotEncoder(sparse_output=False)
y_one_hot = one_hot_encoder.fit_transform(y_adjusted.reshape(-1, 1))

# 10-fold cross-validation
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
accuracy_scores = []
precision_scores = []
recall_scores = []
f1_scores = []
classification_reports = []

for fold, (train_index, test_index) in enumerate(skf.split(X_reduced, y)):
    X_train, X_test = X_reduced[train_index], X_reduced[test_index]
    y_train, y_test = y_one_hot[train_index], y_one_hot[test_index]

    # Print training and testing sizes
    print(f"Fold {fold + 1}:")
    print(f"Training size: {X_train.shape[0]}")
    print(f"Testing size: {X_test.shape[0]}")

    # Train an ELM model
    # Initialize ELM
    elm_model = ELM(X_train.shape[1], y_train.shape[1], classification="c", batch=1, accelerator="CPU")

    # Add hidden neurons, e.g., 100 neurons with sigmoid activation function
    elm_model.add_neurons(100, "sigm")

    # Train the ELM model
    elm_model.train(X_train, y_train, "c")

    # Predict using the trained model
    y_pred_train = elm_model.predict(X_train)
    y_pred_test = elm_model.predict(X_test)

    # Convert predictions to class labels
    y_pred_train_labels = y_pred_train.argmax(axis=1)
    y_pred_test_labels = y_pred_test.argmax(axis=1)

    # Convert one-hot encoded y_test back to class labels for evaluation
    y_test_labels = y_test.argmax(axis=1)

    # Adjust predicted labels back to original range (1 and 2)
    y_pred_test_labels_adjusted = y_pred_test_labels + 1
    y_test_labels_adjusted = y_test_labels + 1

    # Evaluate the model
    accuracy = accuracy_score(y_test_labels_adjusted, y_pred_test_labels_adjusted)
    precision = precision_score(y_test_labels_adjusted, y_pred_test_labels_adjusted, average='weighted')
    recall = recall_score(y_test_labels_adjusted, y_pred_test_labels_adjusted, average='weighted')
    f1 = f1_score(y_test_labels_adjusted, y_pred_test_labels_adjusted, average='weighted')
    report = classification_report(y_test_labels_adjusted, y_pred_test_labels_adjusted, output_dict=True)

    accuracy_scores.append(accuracy)
    precision_scores.append(precision)
    recall_scores.append(recall)
    f1_scores.append(f1)
    classification_reports.append(report)

    print(f"Fold {fold + 1} Results:")
    print(f"Accuracy: {accuracy}")
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print(f"F1 Score: {f1}\n")

# Aggregate the classification reports
average_classification_report = {}
for key in classification_reports[0].keys():
    if isinstance(classification_reports[0][key], dict):
        average_classification_report[key] = {}
        for sub_key in classification_reports[0][key].keys():
            average_classification_report[key][sub_key] = np.mean([report[key][sub_key] for report in classification_reports])
    else:
        average_classification_report[key] = np.mean([report[key] for report in classification_reports])

# Calculate average metrics
average_accuracy = np.mean(accuracy_scores)
average_precision = np.mean(precision_scores)
average_recall = np.mean(recall_scores)
average_f1_score = np.mean(f1_scores)

print(f"Average Model Accuracy: {average_accuracy}")
print(f"Average Precision: {average_precision}")
print(f"Average Recall: {average_recall}")
print(f"Average F1 Score: {average_f1_score}")
print("Average Classification Report:")
for key, value in average_classification_report.items():
    if isinstance(value, dict):
        print(f"  {key}:")
        for sub_key, sub_value in value.items():
            print(f"    {sub_key}: {sub_value}")
    else:
        print(f"  {key}: {value}")


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Found NaN values in the target column. Filling with mode value.
Fold 1:
Training size: 9023
Testing size: 1003
Fold 1 Results:
Accuracy: 0.7876370887337986
Precision: 0.7914064081292582
Recall: 0.7876370887337986
F1 Score: 0.7843877555878361

Fold 2:
Training size: 9023
Testing size: 1003
Fold 2 Results:
Accuracy: 0.7666999002991027
Precision: 0.7693964348395147
Recall: 0.7666999002991027
F1 Score: 0.7631954975409905

Fold 3:
Training size: 9023
Testing size: 1003
Fold 3 Results:
Accuracy: 0.7676969092721835
Precision: 0.773708605691218
Recall: 0.7676969092721835
F1 Score: 0.7627282153352976

Fold 4:
Training size: 9023
Testing size: 1003
Fold 4 Results:
Accuracy: 0.7666999002991027
Precision: 0.7704927347383724
Recall: 0.7666999002991027
F1 Score: 0.7626580538719054

Fold 5:
Training size: 9023
Testing size: 1003
Fold 5 Results:
Accuracy: 0.7796610169491526
Precision: 0.7836588559314747
Recall: 0.7796610169491526
F1 Score: 0.7760379037577331

Fold 6:
Training size: 9023
Testing size: 