In [1]:
import os
import pickle

import numpy as np
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf
from pyspark.sql.types import ArrayType, IntegerType
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import StandardScaler

In [2]:
# Build (or reuse) the Spark session for this notebook, registered under the
# application name 'MalwareDetection'.
spark = (
    SparkSession.builder
    .appName('MalwareDetection')
    .getOrCreate()
)

def extract_features(file_path):
    """Return the byte-value histogram of a file.

    Args:
        file_path: Path of the file to read in binary mode.

    Returns:
        A list of 256 Python ints; element ``i`` is the number of bytes in
        the file whose value is ``i``. If the file cannot be opened or read,
        the error is printed and an all-zero histogram is returned instead of
        raising, so a single unreadable file does not abort a whole batch.
    """
    try:
        with open(file_path, 'rb') as file:
            content = file.read()
    except (OSError, TypeError, ValueError) as e:
        # Best effort: these are the errors open()/read() raise for a missing,
        # unreadable or malformed path. Report and fall back to zeros.
        print(f"Error processing file {file_path}: {e}")
        return [0] * 256

    if not content:
        # Empty file: nothing to count.
        return [0] * 256

    # Count all 256 byte values in one vectorised pass rather than a per-byte
    # Python loop; tolist() converts the counts back to plain Python ints.
    byte_values = np.frombuffer(content, dtype=np.uint8)
    return np.bincount(byte_values, minlength=256).tolist()

In [3]:
# Expose extract_features to Spark as a UDF whose declared return type is an
# array of integers.
extract_features_udf = udf(
    lambda path: extract_features(path),
    ArrayType(IntegerType()),
)

# Dataset locations. Each can be overridden with an environment variable so
# the notebook can run on another machine without editing this cell; when the
# variable is unset the original paths are used.
train_folder = os.environ.get('MALWARE_TRAIN_DIR', 'G:/Projects/Cyber_ai/train')
test_folder = os.environ.get('MALWARE_TEST_DIR', 'G:/Projects/Cyber_ai/test')

In [4]:
def _regular_files(folder):
    """Return the sorted paths of the regular files directly inside ``folder``.

    Sub-directories are skipped: they cannot be opened as files, and would
    otherwise end up as all-zero feature rows. Sorting makes the row order
    stable between runs.
    """
    with os.scandir(folder) as entries:
        return sorted(entry.path for entry in entries if entry.is_file())


train_files = _regular_files(train_folder)
test_files = _regular_files(test_folder)

In [5]:
# One-column Spark DataFrames ("file_path"), one row per listed file.
train_df = spark.createDataFrame(
    [(path,) for path in train_files],
    ["file_path"],
)
test_df = spark.createDataFrame(
    [(path,) for path in test_files],
    ["file_path"],
)

In [6]:
# Add a 'features' column holding the UDF result for each row's file_path.
train_df = train_df.withColumn(
    'features',
    extract_features_udf(train_df['file_path']),
)
test_df = test_df.withColumn(
    'features',
    extract_features_udf(test_df['file_path']),
)

In [8]:
# Pull the 'features' column of each split back to the driver as a list of rows.
train_features = (
    train_df
    .select('features')
    .collect()
)
test_features = (
    test_df
    .select('features')
    .collect()
)

In [9]:
# Every collected sample, in both splits, is given the label 1.
# NOTE(review): with a single label value the classifier never sees a second
# class, so scores computed from these labels cannot show any ability to
# discriminate — confirm where samples of the other class and their labels
# are supposed to come from.
train_labels = [1 for _ in train_features]
test_labels = [1 for _ in test_features]

In [10]:
# One pandas row per sample: the histogram values spread across columns, with
# the labels appended as a trailing 'label' column.
train_data = pd.DataFrame(
    [row['features'] for row in train_features]
).assign(label=train_labels)

test_data = pd.DataFrame(
    [row['features'] for row in test_features]
).assign(label=test_labels)

In [11]:
# Separate the feature columns from the 'label' column for each split.
X_train, y_train = train_data.drop(columns='label'), train_data['label']
X_test, y_test = test_data.drop(columns='label'), test_data['label']

In [12]:
# Fit the standardiser on the training features only, then apply that same
# fitted transform to both splits.
# NOTE(review): elsewhere in this notebook the classifier is fitted on X_train
# and predicts on X_test, not on these scaled arrays — confirm whether the
# scaled versions are meant to be used.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [13]:
# Give both feature frames the same string column names *before* fitting.
# Previously only X_test was renamed, and only after the fit, so the model was
# trained on integer-named columns and later asked to predict on string-named
# ones (recent scikit-learn versions flag that with a feature-name warning).
X_train.columns = X_train.columns.astype(str)
X_test.columns = X_test.columns.astype(str)

# Fixed seed so that re-running the notebook rebuilds the same forest.
# The forest is fitted on the unscaled feature frame.
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

In [18]:
# Persist the fitted classifier and the scaler together as one pickled tuple.
with open('malware_model.pkl', 'wb') as fh:
    pickle.dump((model, scaler), fh)

In [19]:
# Reload the (model, scaler) tuple from disk, rebinding both names.
# Unpickling can execute arbitrary code, so only load files this notebook
# wrote itself or that come from a trusted source.
with open('malware_model.pkl', 'rb') as fh:
    model, scaler = pickle.load(fh)

# Predict on the test feature frame.
y_pred = model.predict(X_test)

# Score the predictions against the test labels. zero_division=1 is passed to
# the three ratio metrics; accuracy_score is called without it.
accuracy = accuracy_score(y_test, y_pred)
precision, recall, f1 = (
    metric(y_test, y_pred, zero_division=1)
    for metric in (precision_score, recall_score, f1_score)
)



In [20]:
# Report each metric as a percentage with two decimals, one per line.
print(
    f'Accuracy: {accuracy * 100:.2f}%',
    f'Precision: {precision * 100:.2f}%',
    f'Recall: {recall * 100:.2f}%',
    f'F1 Score: {f1 * 100:.2f}%',
    sep='\n',
)

Accuracy: 100.00%
Precision: 100.00%
Recall: 100.00%
F1 Score: 100.00%


In [21]:
# Shut down the Spark session created at the top of the notebook.
spark.stop()