In [1]:
pip install pandas numpy scikit-learn nltk scipy



In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, classification_report, precision_score, recall_score, f1_score
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.ensemble import RandomForestClassifier
from scipy.sparse import hstack, csr_matrix
import nltk
from nltk.tokenize import word_tokenize

# Download NLTK data
nltk.download('punkt')

# Load the dataset
file_path = '/content/PDFMalware2022.csv'
df = pd.read_csv(file_path)

# Handle missing values
# Fill missing numerical values with the median value of each column
numerical_cols = df.select_dtypes(include=['float64', 'int64']).columns
numerical_cols = numerical_cols.drop('Class')  # Exclude the target column
df[numerical_cols] = df[numerical_cols].fillna(df[numerical_cols].median())

# Fill missing categorical values with the mode value of each column
categorical_cols = df.select_dtypes(include=['object']).columns
df[categorical_cols] = df[categorical_cols].fillna(df[categorical_cols].mode().iloc[0])

# Check for and handle missing values in the target column
if df['Class'].isnull().any():
    print("Found NaN values in the target column. Filling with mode value.")
    df['Class'] = df['Class'].fillna(df['Class'].mode()[0])

# Tokenization and Hashing Vectorizer for text columns
text_cols = ['File name', 'text', 'header']
hashing_vectorizer = HashingVectorizer(n_features=2**14, alternate_sign=False)  # Reduced number of features for efficiency

# Apply tokenization and vectorization
text_features = [hashing_vectorizer.transform(df[col].apply(lambda x: ' '.join(word_tokenize(x)))) for col in text_cols]

# Combine text features into a single sparse matrix
text_features_combined = hstack(text_features)

# Separate numerical features and target
X_numerical = df[numerical_cols].values
y = df['Class'].values

# Standardize numerical features
scaler = StandardScaler()
X_numerical = scaler.fit_transform(X_numerical)

# Combine numerical and text features
X = hstack([csr_matrix(X_numerical), text_features_combined])

# Reduce dimensionality with Truncated SVD
svd = TruncatedSVD(n_components=30, random_state=42)  # Further reduced number of components for efficiency
X_reduced = svd.fit_transform(X)

# Adjust labels for one-hot encoding
y_adjusted = y - 1

# One-hot encode the target labels
one_hot_encoder = OneHotEncoder(sparse_output=False)
y_one_hot = one_hot_encoder.fit_transform(y_adjusted.reshape(-1, 1))

# Flatten the one-hot encoded labels for Random Forest
y_flat = np.argmax(y_one_hot, axis=1)

# 10-fold cross-validation
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
accuracy_scores = []
precision_scores = []
recall_scores = []
f1_scores = []
classification_reports = []

for fold, (train_index, test_index) in enumerate(skf.split(X_reduced, y)):
    X_train, X_test = X_reduced[train_index], X_reduced[test_index]
    y_train, y_test = y_flat[train_index], y_flat[test_index]

    # Define the Random Forest model with fewer estimators and enable parallel processing
    model = RandomForestClassifier(n_estimators=50, random_state=42, n_jobs=-1)  # Reduced number of trees

    # Train the Random Forest model
    model.fit(X_train, y_train)

    # Predict using the trained model
    y_pred_test = model.predict(X_test)

    # Adjust predicted labels back to original range (1 and 2)
    y_pred_test_labels_adjusted = y_pred_test + 1
    y_test_labels_adjusted = y_test + 1

    # Evaluate the model
    accuracy = accuracy_score(y_test_labels_adjusted, y_pred_test_labels_adjusted)
    precision = precision_score(y_test_labels_adjusted, y_pred_test_labels_adjusted, average='weighted')
    recall = recall_score(y_test_labels_adjusted, y_pred_test_labels_adjusted, average='weighted')
    f1 = f1_score(y_test_labels_adjusted, y_pred_test_labels_adjusted, average='weighted')
    report = classification_report(y_test_labels_adjusted, y_pred_test_labels_adjusted, output_dict=True)

    accuracy_scores.append(accuracy)
    precision_scores.append(precision)
    recall_scores.append(recall)
    f1_scores.append(f1)
    classification_reports.append(report)

    print(f"Fold {fold + 1} Results:")
    print(f"Accuracy: {accuracy}")
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print(f"F1 Score: {f1}\n")

# Aggregate the classification reports
average_classification_report = {}
for key in classification_reports[0].keys():
    if isinstance(classification_reports[0][key], dict):
        average_classification_report[key] = {}
        for sub_key in classification_reports[0][key].keys():
            average_classification_report[key][sub_key] = np.mean([report[key][sub_key] for report in classification_reports])
    else:
        average_classification_report[key] = np.mean([report[key] for report in classification_reports])

# Calculate average metrics
average_accuracy = np.mean(accuracy_scores)
average_precision = np.mean(precision_scores)
average_recall = np.mean(recall_scores)
average_f1_score = np.mean(f1_scores)

print(f"Average Model Accuracy: {average_accuracy}")
print(f"Average Precision: {average_precision}")
print(f"Average Recall: {average_recall}")
print(f"Average F1 Score: {average_f1_score}")
print("Average Classification Report:")
for key, value in average_classification_report.items():
    if isinstance(value, dict):
        print(f"  {key}:")
        for sub_key, sub_value in value.items():
            print(f"    {sub_key}: {sub_value}")
    else:
        print(f"  {key}: {value}")


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Found NaN values in the target column. Filling with mode value.
Fold 1 Results:
Accuracy: 0.9750747756729811
Precision: 0.9751808702876702
Recall: 0.9750747756729811
F1 Score: 0.9750480938985008

Fold 2 Results:
Accuracy: 0.9780658025922233
Precision: 0.9782863648006926
Recall: 0.9780658025922233
F1 Score: 0.9780336768279901

Fold 3 Results:
Accuracy: 0.9690927218344965
Precision: 0.9691060508314628
Recall: 0.9690927218344965
F1 Score: 0.9690749807288369

Fold 4 Results:
Accuracy: 0.9680957128614157
Precision: 0.9680892484240324
Recall: 0.9680957128614157
F1 Score: 0.9680885851193631

Fold 5 Results:
Accuracy: 0.9830508474576272
Precision: 0.9831202943728157
Recall: 0.9830508474576272
F1 Score: 0.983036981485871

Fold 6 Results:
Accuracy: 0.9810568295114656
Precision: 0.9810617945515577
Recall: 0.9810568295114656
F1 Score: 0.9810504227379163

Fold 7 Results:
Accuracy: 0.9730538922155688
Precision: 0.9730717424573411
Recall: 0.9730538922155688
F1 Score: 0.9730382540036405

Fold 8 Result