# 1. Preparing the Zip File

## i. Mount Google Drive in Colab

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive so that the
# /content/drive/MyDrive/... paths used in the following cells resolve.
from google.colab import drive
drive.mount('/content/drive')

## ii. Access the Shared Google Drive Folder

In [None]:
# List the lab folder on the mounted Drive to confirm it is visible.
!ls /content/drive/MyDrive/ML_Cybersec-Lab01/

## iii. Download and Unzip the File in Colab

In [None]:
# Install gdown, then use it to download the shared Drive folder into the lab
# folder on the mounted Drive (the -O argument is the output location).
!pip install gdown
!gdown --folder https://drive.google.com/drive/folders/1rFIuARy-FwO9j_43Fvp_YFCM5i5v32dB -O /content/drive/MyDrive/ML_Cybersec-Lab01/

In [None]:
import zipfile

# Extract every member of the downloaded Hw1.zip archive into /content/Hw1,
# a directory outside the mounted Drive tree. The `with` block closes the
# archive once extraction finishes.
with zipfile.ZipFile('/content/drive/MyDrive/ML_Cybersec-Lab01/NYU Malware Homework-1/Hw1.zip', 'r') as zip_ref:
    zip_ref.extractall('/content/Hw1')

# 2. Extracting Data from JSON Files

## i. Load and Parse Multiple JSON Files and Extract Features

In [None]:
# Delete the __MACOSX directory from the extracted tree so the os.walk in the
# next cell does not visit it (presumably macOS archive metadata - confirm).
!rm -rf /content/Hw1/__MACOSX

In [None]:
import os
import json
import pandas as pd

# Root of the extracted dataset, expected to contain the 'Benign' and 'Malware'
# folders. Labelling below is path-based: a .json file whose directory path
# contains 'Benign' is labelled 0, every other .json file is labelled 1.
root_folder = '/content/Hw1'  # Modify this to match your folder path

def _as_dict(value):
    """Return *value* unchanged if it is a dict, otherwise an empty dict."""
    return value if isinstance(value, dict) else {}


def _safe_len(value):
    """Return ``len(value)``, or 0 when *value* has no length (e.g. ``None``)."""
    try:
        return len(value)
    except TypeError:
        return 0


# Function to extract features from JSON files
def extract_features_from_json(json_data):
    """Flatten one parsed JSON report into a flat dict of features.

    ``dict.get(key, default)`` only falls back to the default when the key is
    absent. A section that is present but ``null`` (or of an unexpected type)
    would otherwise raise, so every nested section is normalised before it is
    read and simply contributes its default instead.

    Args:
        json_data: Top-level report object as produced by ``json.loads``.
            Must be a dict.

    Returns:
        dict with the keys, in this order: ``file_size``, ``score``,
        ``duration``, ``udp_count``, ``tcp_count``, ``icmp_count``,
        ``dlls_loaded_count``, ``reg_keys_opened_count``.

    Raises:
        AttributeError: If ``json_data`` is not a dict (it has no ``.get``).
    """
    file_info = _as_dict(_as_dict(json_data.get('target')).get('file'))
    info = _as_dict(json_data.get('info'))
    network_data = _as_dict(json_data.get('network'))

    features = {
        # General file information (0 when the field is absent)
        'file_size': file_info.get('size', 0),
        'score': info.get('score', 0),
        'duration': info.get('duration', 0),
        # Network features: number of entries recorded per protocol
        'udp_count': _safe_len(network_data.get('udp')),
        'tcp_count': _safe_len(network_data.get('tcp')),
        'icmp_count': _safe_len(network_data.get('icmp')),
    }

    # Behavior features: total DLL loads and registry-key opens summed over
    # every entry of behavior.generic.
    behavior_data = _as_dict(json_data.get('behavior')).get('generic')
    if not isinstance(behavior_data, list):
        behavior_data = []

    dlls_loaded_count = 0
    reg_keys_opened_count = 0
    for behavior in behavior_data:
        summary = _as_dict(_as_dict(behavior).get('summary'))
        # Only the totals are needed, so count directly rather than
        # concatenating every list first.
        dlls_loaded_count += _safe_len(summary.get('dll_loaded'))
        reg_keys_opened_count += _safe_len(summary.get('regkey_opened'))

    features['dlls_loaded_count'] = dlls_loaded_count
    features['reg_keys_opened_count'] = reg_keys_opened_count

    return features

def _read_json_text(file_path):
    """Return the stripped text of *file_path*, decoded as UTF-8 or Latin-1."""
    with open(file_path, 'rb') as f:
        raw = f.read()
    try:
        # 'utf-8-sig' is plain UTF-8 that also drops a leading BOM; json.loads
        # rejects text that still starts with one.
        text = raw.decode('utf-8-sig')
    except UnicodeDecodeError:
        # ISO-8859-1 (Latin-1) maps every byte to a character, so this
        # fallback cannot fail.
        text = raw.decode('ISO-8859-1')
    return text.strip()


# Function to load all JSON files and extract features
def load_and_extract_features(folder_path):
    """Walk *folder_path*, parse every ``.json`` file and build a feature table.

    Files are read once as bytes and decoded as UTF-8 (falling back to
    ISO-8859-1). Files that are empty, are not valid JSON, or whose top-level
    value is not a JSON object are reported on stdout and skipped. Directories
    and files are visited in sorted order so the row order does not depend on
    how the filesystem enumerates entries.

    Args:
        folder_path: Root directory to search recursively.

    Returns:
        A ``(DataFrame, labels)`` tuple. The DataFrame has one row per
        successfully parsed file (columns come from
        ``extract_features_from_json``). ``labels`` is a list aligned with the
        rows: 0 when the file's directory path contains ``'Benign'``,
        otherwise 1.
    """
    data = []
    labels = []

    for root, dirs, files in os.walk(folder_path):
        # Sorting dirs in place steers os.walk's descent order.
        dirs.sort()
        for file_name in sorted(files):
            if not file_name.endswith('.json'):
                continue
            file_path = os.path.join(root, file_name)

            content = _read_json_text(file_path)
            if not content:
                print(f"Skipping empty file: {file_path}")
                continue

            try:
                json_data = json.loads(content)
            except json.JSONDecodeError as e:
                print(f"Skipping invalid JSON file: {file_path} (Error: {e})")
                continue

            # A bare list, string or number parses fine but carries no report
            # fields; skip it like any other unusable file.
            if not isinstance(json_data, dict):
                print(f"Skipping non-object JSON file: {file_path}")
                continue

            # Append features and label (0 for benign, 1 for malware)
            data.append(extract_features_from_json(json_data))
            labels.append(0 if 'Benign' in root else 1)

    return pd.DataFrame(data), labels

# Walk root_folder and build the feature table: `df` has one row per parsed
# JSON file, `labels` is a list aligned with those rows (0 benign, 1 malware).
df, labels = load_and_extract_features(root_folder)

# Attach the labels as a 'label' column so features and target travel together
df['label'] = labels

# Destination on the mounted Drive (the same lab folder used in the cells above)
output_csv_path = '/content/drive/MyDrive/ML_Cybersec-Lab01/malware_detection_data.csv'

# Save the extracted features to a CSV file; index=False leaves the DataFrame
# index out of the file.
df.to_csv(output_csv_path, index=False)
print(f"Features extracted and saved to '{output_csv_path}'")


Features extracted and saved to '/content/drive/MyDrive/ML_Cybersec-Lab01/malware_detection_data.csv'
