In [1]:
pip install pandas numpy scikit-learn nltk scipy



In [5]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, classification_report, precision_score, recall_score, f1_score
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.tree import DecisionTreeClassifier
from scipy.sparse import hstack, csr_matrix
import gc
import nltk
from nltk.tokenize import word_tokenize
import time

# Download NLTK data
nltk.download('punkt')

# Load the dataset
file_path = '/content/PDFMalware2022.csv'
df = pd.read_csv(file_path)

# Handle missing values
# Fill missing categorical values with the mode value of each column
categorical_cols = ['File name', 'text', 'header']
df[categorical_cols] = df[categorical_cols].fillna(df[categorical_cols].mode().iloc[0])

# Check for and handle missing values in the target column
if df['Class'].isnull().any():
    print("Found NaN values in the target column. Filling with mode value.")
    df['Class'] = df['Class'].fillna(df['Class'].mode()[0])

# Tokenization and Hashing Vectorizer for text columns
text_cols = ['File name', 'text', 'header']
hashing_vectorizer = HashingVectorizer(n_features=2**10, alternate_sign=False)  # Reduced number of features for efficiency

# Apply tokenization and vectorization
text_features = [hashing_vectorizer.transform(df[col].apply(lambda x: ' '.join(word_tokenize(x)))) for col in text_cols]

# Combine text features into a single sparse matrix
text_features_combined = hstack(text_features)

# Separate target
y = df['Class'].values

# Combine text features (no numerical features in this dataset)
X = text_features_combined

# Reduce dimensionality with Truncated SVD
svd = TruncatedSVD(n_components=100, random_state=42)
X_reduced = svd.fit_transform(X)

# Adjust labels for one-hot encoding
y_adjusted = y - 1

# 10-fold cross-validation
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
accuracy_scores = []
precision_scores = []
recall_scores = []
f1_scores = []
classification_reports = []

# Measure total execution time
start_time = time.time()

for fold, (train_index, test_index) in enumerate(skf.split(X_reduced, y)):
    X_train, X_test = X_reduced[train_index], X_reduced[test_index]
    y_train, y_test = y_adjusted[train_index], y_adjusted[test_index]

    print(f"Fold {fold + 1}:")
    print(f"Training data size: {len(train_index)}")
    print(f"Testing data size: {len(test_index)}")

    # Initialize the Decision Tree model
    model = DecisionTreeClassifier(criterion='entropy')

    # Train the model
    model.fit(X_train, y_train)

    # Predict using the trained model
    y_pred_test = model.predict(X_test)

    # Adjust predicted labels back to original range (1 and 2)
    y_pred_test_labels_adjusted = y_pred_test + 1
    y_test_labels_adjusted = y_test + 1

    # Evaluate the model
    accuracy = accuracy_score(y_test_labels_adjusted, y_pred_test_labels_adjusted)
    precision = precision_score(y_test_labels_adjusted, y_pred_test_labels_adjusted, average='weighted')
    recall = recall_score(y_test_labels_adjusted, y_pred_test_labels_adjusted, average='weighted')
    f1 = f1_score(y_test_labels_adjusted, y_pred_test_labels_adjusted, average='weighted')
    report = classification_report(y_test_labels_adjusted, y_pred_test_labels_adjusted, output_dict=True)

    accuracy_scores.append(accuracy)
    precision_scores.append(precision)
    recall_scores.append(recall)
    f1_scores.append(f1)
    classification_reports.append(report)

    print(f"Fold {fold + 1} Results:")
    print(f"Accuracy: {accuracy}")
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print(f"F1 Score: {f1}\n")

    # Free up memory
    del X_train, X_test, y_train, y_test, model, y_pred_test
    gc.collect()

# Measure total execution time
end_time = time.time()
execution_time = end_time - start_time

print(f"Total execution time: {execution_time} seconds")

# Aggregate the classification reports
average_classification_report = {}
for key in classification_reports[0].keys():
    if isinstance(classification_reports[0][key], dict):
        average_classification_report[key] = {}
        for sub_key in classification_reports[0][key].keys():
            average_classification_report[key][sub_key] = np.mean([report[key][sub_key] for report in classification_reports])
    else:
        average_classification_report[key] = np.mean([report[key] for report in classification_reports])

# Calculate average metrics
average_accuracy = np.mean(accuracy_scores)
average_precision = np.mean(precision_scores)
average_recall = np.mean(recall_scores)
average_f1_score = np.mean(f1_scores)

print(f"Average Model Accuracy: {average_accuracy}")
print(f"Average Precision: {average_precision}")
print(f"Average Recall: {average_recall}")
print(f"Average F1 Score: {average_f1_score}")
print("Average Classification Report:")
for key, value in average_classification_report.items():
    if isinstance(value, dict):
        print(f"  {key}:")
        for sub_key, sub_value in value.items():
            print(f"    {sub_key}: {sub_value}")
    else:
        print(f"  {key}: {value}")


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Found NaN values in the target column. Filling with mode value.
Fold 1:
Training data size: 9023
Testing data size: 1003
Fold 1 Results:
Accuracy: 0.6939182452642074
Precision: 0.6938455670129441
Recall: 0.6939182452642074
F1 Score: 0.688706113022196

Fold 2:
Training data size: 9023
Testing data size: 1003
Fold 2 Results:
Accuracy: 0.6959122632103689
Precision: 0.69542321330275
Recall: 0.6959122632103689
F1 Score: 0.691428954292019

Fold 3:
Training data size: 9023
Testing data size: 1003
Fold 3 Results:
Accuracy: 0.6979062811565304
Precision: 0.6983493545175395
Recall: 0.6979062811565304
F1 Score: 0.6924000047993957

Fold 4:
Training data size: 9023
Testing data size: 1003
Fold 4 Results:
Accuracy: 0.7168494516450648
Precision: 0.7178559363232616
Recall: 0.7168494516450648
F1 Score: 0.7121109852654349

Fold 5:
Training data size: 9023
Testing data size: 1003
Fold 5 Results:
Accuracy: 0.7008973080757727
Precision: 0.6995416736462547
Recall: 0.7008973080757727
F1 Score: 0.6992000431051