In [1]:
pip install pandas numpy scikit-learn nltk scipy tensorflow




In [2]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, classification_report, precision_score, recall_score, f1_score
import nltk
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.decomposition import TruncatedSVD
from scipy.sparse import hstack, csr_matrix
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.utils import to_categorical

# Download NLTK data
nltk.download('punkt')

# Load the dataset
file_path = '/content/PDFMalware2022.csv'
df = pd.read_csv(file_path)

# Handle missing values
# Fill missing categorical values with the mode value of each column
categorical_cols = ['File name', 'text', 'header']
df[categorical_cols] = df[categorical_cols].fillna(df[categorical_cols].mode().iloc[0])

# Check for and handle missing values in the target column
if df['Class'].isnull().any():
    print("Found NaN values in the target column. Filling with mode value.")
    df['Class'] = df['Class'].fillna(df['Class'].mode()[0])

# Encode target variable
df['Class'] = df['Class'].replace({1: 1, 2: 2})

# Tokenization and Hashing Vectorizer for text columns
text_cols = ['File name', 'text', 'header']
hashing_vectorizer = HashingVectorizer(n_features=2**16, alternate_sign=False)

# Apply tokenization and vectorization
text_features = [hashing_vectorizer.transform(df[col].apply(lambda x: ' '.join(word_tokenize(x)))) for col in text_cols]

# Combine text features into a single sparse matrix
text_features_combined = hstack(text_features)

# Separate features and target
y = df['Class'].values

# Combine text features (no numerical features in this dataset)
X = text_features_combined

# Reduce dimensionality with Truncated SVD
svd = TruncatedSVD(n_components=100, random_state=42)
X_reduced = svd.fit_transform(X)

# Adjust labels for one-hot encoding
y_adjusted = y - 1

# One-hot encode the target labels
y_one_hot = to_categorical(y_adjusted)

# Reshape the data for LSTM
X_reduced = X_reduced.reshape((X_reduced.shape[0], 1, X_reduced.shape[1]))

# 10-fold cross-validation
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
accuracy_scores = []
precision_scores = []
recall_scores = []
f1_scores = []
classification_reports = []

for fold, (train_index, test_index) in enumerate(skf.split(X_reduced, y)):
    X_train, X_test = X_reduced[train_index], X_reduced[test_index]
    y_train, y_test = y_one_hot[train_index], y_one_hot[test_index]

    # Define the LSTM model
    model = Sequential()
    model.add(LSTM(32, input_shape=(X_train.shape[1], X_train.shape[2]), return_sequences=True))
    model.add(Dropout(0.2))
    model.add(LSTM(32))
    model.add(Dropout(0.2))
    model.add(Dense(y_train.shape[1], activation='softmax'))

    # Compile the model
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

    # Implement early stopping
    early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

    # Train the model
    model.fit(X_train, y_train, epochs=10, batch_size=128, validation_data=(X_test, y_test), callbacks=[early_stopping], verbose=0)

    # Predict using the trained model
    y_pred_test = model.predict(X_test)

    # Convert predictions to class labels
    y_pred_test_labels = y_pred_test.argmax(axis=1)
    y_test_labels = y_test.argmax(axis=1)

    # Adjust predicted labels back to original range (1 and 2)
    y_pred_test_labels_adjusted = y_pred_test_labels + 1
    y_test_labels_adjusted = y_test_labels + 1

    # Evaluate the model
    accuracy = accuracy_score(y_test_labels_adjusted, y_pred_test_labels_adjusted)
    precision = precision_score(y_test_labels_adjusted, y_pred_test_labels_adjusted, average='weighted')
    recall = recall_score(y_test_labels_adjusted, y_pred_test_labels_adjusted, average='weighted')
    f1 = f1_score(y_test_labels_adjusted, y_pred_test_labels_adjusted, average='weighted')
    report = classification_report(y_test_labels_adjusted, y_pred_test_labels_adjusted, output_dict=True)

    accuracy_scores.append(accuracy)
    precision_scores.append(precision)
    recall_scores.append(recall)
    f1_scores.append(f1)
    classification_reports.append(report)

    print(f"Fold {fold + 1} Results:")
    print(f"Training Data Size: {len(train_index)}")
    print(f"Testing Data Size: {len(test_index)}")
    print(f"Accuracy: {accuracy}")
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print(f"F1 Score: {f1}\n")

# Aggregate the classification reports
average_classification_report = {}
for key in classification_reports[0].keys():
    if isinstance(classification_reports[0][key], dict):
        average_classification_report[key] = {}
        for sub_key in classification_reports[0][key].keys():
            average_classification_report[key][sub_key] = np.mean([report[key][sub_key] for report in classification_reports])
    else:
        average_classification_report[key] = np.mean([report[key] for report in classification_reports])

# Calculate average metrics
average_accuracy = np.mean(accuracy_scores)
average_precision = np.mean(precision_scores)
average_recall = np.mean(recall_scores)
average_f1_score = np.mean(f1_scores)

print(f"Average Model Accuracy: {average_accuracy}")
print(f"Average Precision: {average_precision}")
print(f"Average Recall: {average_recall}")
print(f"Average F1 Score: {average_f1_score}")
print("Average Classification Report:")
for key, value in average_classification_report.items():
    if isinstance(value, dict):
        print(f"  {key}:")
        for sub_key, sub_value in value.items():
            print(f"    {sub_key}: {sub_value}")
    else:
        print(f"  {key}: {value}")


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Found NaN values in the target column. Filling with mode value.
Fold 1 Results:
Training Data Size: 9023
Testing Data Size: 1003
Accuracy: 0.7886340977068793
Precision: 0.792600058717136
Recall: 0.7886340977068793
F1 Score: 0.7853403823948565

Fold 2 Results:
Training Data Size: 9023
Testing Data Size: 1003
Accuracy: 0.7686939182452642
Precision: 0.7714785422399673
Recall: 0.7686939182452642
F1 Score: 0.7652194676474777

Fold 3 Results:
Training Data Size: 9023
Testing Data Size: 1003
Accuracy: 0.7647058823529411
Precision: 0.7714618314858411
Recall: 0.7647058823529411
F1 Score: 0.7592800877846141

Fold 4 Results:
Training Data Size: 9023
Testing Data Size: 1003
Accuracy: 0.7686939182452642
Precision: 0.7728956420415046
Recall: 0.7686939182452642
F1 Score: 0.7645480102290317

Fold 5 Results:
Training Data Size: 9023
Testing Data Size: 1003
Accuracy: 0.7806580259222333
Precision: 0.784859883160436
Recall: 0.7806580259222333
F1 Score: 0.7769873829520976

Fold 6 Results:
Training Data Siz