In [1]:
# Mount Google Drive into the Colab runtime at /content/drive so that files
# stored under that path can be read and written by later cells.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the dataset in chunks, leaving out the columns listed in
# columns_to_remove, combine the chunks into one DataFrame and save a copy.

# File path to the dataset
file_path = "/content/drive/MyDrive/GUIDE_Train.csv.zip"

# Number of rows parsed per chunk
chunk_size = 100000

# List of columns to remove
columns_to_remove = [
    'MitreTechniques', 'ActionGrouped', 'ActionGranular', 'EmailClusterId',
    'ThreatFamily', 'ResourceType', 'Roles', 'AntispamDirection',
    'SuspicionLevel', 'LastVerdict'
]

# Chunks are collected here and concatenated once after the loop
cleaned_chunks = []

# `usecols` accepts a callable that is evaluated against each column name, so
# the unwanted columns are never parsed or held in memory. Names listed in
# columns_to_remove that are absent from the file simply never match, which
# mirrors the previous drop(..., errors='ignore') behaviour.
# The chunked reader is used as a context manager so the underlying file
# handle is closed even if parsing fails part-way through.
with pd.read_csv(
    file_path,
    chunksize=chunk_size,
    low_memory=False,
    usecols=lambda name: name not in columns_to_remove,
) as reader:
    for chunk_cleaned in reader:
        # Every chunk has the same columns, so report them for the first
        # chunk only instead of printing an identical line per chunk.
        if not cleaned_chunks:
            print(f"Remaining columns in the current chunk: {list(chunk_cleaned.columns)}")

        cleaned_chunks.append(chunk_cleaned)

# Concatenate all cleaned chunks into a single DataFrame
final_cleaned_df = pd.concat(cleaned_chunks, ignore_index=True)

# Print the shape of the final cleaned DataFrame
print(f"Shape of the final DataFrame: {final_cleaned_df.shape}")

# Save the combined DataFrame to a new CSV file on Drive
final_cleaned_df.to_csv("/content/drive/MyDrive/GUIDE_Train_Cleaned.csv", index=False)
print("Final cleaned file saved successfully.")


Remaining columns in the current chunk: ['Id', 'OrgId', 'IncidentId', 'AlertId', 'Timestamp', 'DetectorId', 'AlertTitle', 'Category', 'IncidentGrade', 'EntityType', 'EvidenceRole', 'DeviceId', 'Sha256', 'IpAddress', 'Url', 'AccountSid', 'AccountUpn', 'AccountObjectId', 'AccountName', 'DeviceName', 'NetworkMessageId', 'RegistryKey', 'RegistryValueName', 'RegistryValueData', 'ApplicationId', 'ApplicationName', 'OAuthApplicationId', 'FileName', 'FolderPath', 'ResourceIdName', 'OSFamily', 'OSVersion', 'CountryCode', 'State', 'City']
Remaining columns in the current chunk: ['Id', 'OrgId', 'IncidentId', 'AlertId', 'Timestamp', 'DetectorId', 'AlertTitle', 'Category', 'IncidentGrade', 'EntityType', 'EvidenceRole', 'DeviceId', 'Sha256', 'IpAddress', 'Url', 'AccountSid', 'AccountUpn', 'AccountObjectId', 'AccountName', 'DeviceName', 'NetworkMessageId', 'RegistryKey', 'RegistryValueName', 'RegistryValueData', 'ApplicationId', 'ApplicationName', 'OAuthApplicationId', 'FileName', 'FolderPath', 'Reso

In [4]:
# Display the (rows, columns) of final_cleaned_df.
final_cleaned_df.shape

(9516837, 35)

In [3]:
# Count missing values per column in a single pass over the frame, then keep
# only the columns that actually contain nulls (previously the whole frame
# was scanned with isna() twice).
null_counts = final_cleaned_df.isna().sum()
null_counts = null_counts[null_counts > 0]

# Labels of the columns that contain at least one missing value
null_col = null_counts.index
print(null_counts)

IncidentGrade    51340
dtype: int64


In [4]:
# Remove the rows whose IncidentGrade is missing. The result is rebound to the
# same name rather than using inplace=True, which pandas discourages and which
# gives no memory benefit here.
final_cleaned_df = final_cleaned_df.dropna(subset=['IncidentGrade'])

In [8]:
# Re-check the (rows, columns) of final_cleaned_df.
final_cleaned_df.shape

(9465497, 35)

In [5]:
# Parse the Timestamp column with pd.to_datetime and store the result back
# into the same column, replacing the original values.
final_cleaned_df['Timestamp'] = pd.to_datetime(final_cleaned_df['Timestamp'])

In [6]:
# Preview the first rows of the Timestamp column.
final_cleaned_df['Timestamp'].head()

Unnamed: 0,Timestamp
0,2024-06-04 06:05:15+00:00
1,2024-06-14 03:01:25+00:00
2,2024-06-13 04:52:55+00:00
3,2024-06-10 16:39:36+00:00
4,2024-06-15 01:08:07+00:00


In [7]:
# Derive Day, Month, Hour and Year columns from Timestamp, then remove the
# Timestamp column itself.
# The work is guarded on the column still being present, so re-running this
# cell is a no-op instead of raising KeyError on the already-dropped column.
if 'Timestamp' in final_cleaned_df.columns:
    final_cleaned_df['Day'] = final_cleaned_df['Timestamp'].dt.day
    final_cleaned_df['Month'] = final_cleaned_df['Timestamp'].dt.month
    final_cleaned_df['Hour'] = final_cleaned_df['Timestamp'].dt.hour
    final_cleaned_df['Year'] = final_cleaned_df['Timestamp'].dt.year

    # Rebind the result rather than dropping with inplace=True
    final_cleaned_df = final_cleaned_df.drop(columns='Timestamp')

print(final_cleaned_df.head())

              Id  OrgId  IncidentId  AlertId  DetectorId  AlertTitle  \
0   180388628218      0         612   123247           7           6   
1   455266534868     88         326   210035          58          43   
2  1056561957389    809       58352   712507         423         298   
3  1279900258736     92       32992   774301           2           2   
4   214748368522    148        4359   188041           9          74   

            Category   IncidentGrade EntityType EvidenceRole  ...  \
0      InitialAccess    TruePositive         Ip      Related  ...   
1       Exfiltration   FalsePositive       User     Impacted  ...   
2      InitialAccess   FalsePositive        Url      Related  ...   
3  CommandAndControl  BenignPositive        Url      Related  ...   
4          Execution    TruePositive       User     Impacted  ...   

   ResourceIdName  OSFamily  OSVersion  CountryCode  State   City  Day  Month  \
0            3586         5         66           31      6      3    4 

In [8]:
# Display the current column labels of final_cleaned_df.
final_cleaned_df.columns

Index(['Id', 'OrgId', 'IncidentId', 'AlertId', 'DetectorId', 'AlertTitle',
       'Category', 'IncidentGrade', 'EntityType', 'EvidenceRole', 'DeviceId',
       'Sha256', 'IpAddress', 'Url', 'AccountSid', 'AccountUpn',
       'AccountObjectId', 'AccountName', 'DeviceName', 'NetworkMessageId',
       'RegistryKey', 'RegistryValueName', 'RegistryValueData',
       'ApplicationId', 'ApplicationName', 'OAuthApplicationId', 'FileName',
       'FolderPath', 'ResourceIdName', 'OSFamily', 'OSVersion', 'CountryCode',
       'State', 'City', 'Day', 'Month', 'Hour', 'Year'],
      dtype='object')

In [9]:
# Hand-written set of the column names currently expected in the DataFrame.
x_set = {
    'Id', 'OrgId', 'IncidentId', 'AlertId',
    'DetectorId', 'AlertTitle', 'Category', 'IncidentGrade',
    'EntityType', 'EvidenceRole',
    'DeviceId', 'Sha256', 'IpAddress', 'Url',
    'AccountSid', 'AccountUpn', 'AccountObjectId', 'AccountName',
    'DeviceName', 'NetworkMessageId',
    'RegistryKey', 'RegistryValueName', 'RegistryValueData',
    'ApplicationId', 'ApplicationName', 'OAuthApplicationId',
    'FileName', 'FolderPath', 'ResourceIdName',
    'OSFamily', 'OSVersion', 'CountryCode', 'State', 'City',
    'Day', 'Month', 'Hour', 'Year',
}

# Set of the column names that should be retained.
columns_to_keep_set = {
    'Id', 'OrgId', 'IncidentId', 'AlertId',
    'DetectorId', 'AlertTitle', 'Category', 'IncidentGrade',
    'EntityType', 'EvidenceRole',
    'DeviceId', 'Sha256', 'IpAddress', 'Url',
    'AccountSid', 'AccountUpn',
    'DeviceName', 'NetworkMessageId',
    'RegistryKey', 'RegistryValueName',
    'ApplicationId', 'OAuthApplicationId',
    'ResourceIdName', 'OSFamily', 'CountryCode',
    'Usage',
    'Day', 'Month', 'Hour', 'Year',
}

# Names present in x_set that do not appear in the keep-set.
missing_in_columns_to_keep = x_set.difference(columns_to_keep_set)

# Report which columns fall outside the keep-set.
print("Columns in x but missing in columns_to_keep:", missing_in_columns_to_keep)


Columns in x but missing in columns_to_keep: {'State', 'AccountName', 'City', 'AccountObjectId', 'FolderPath', 'ApplicationName', 'OSVersion', 'FileName', 'RegistryValueData'}


In [10]:
# Build a new frame without the columns collected in
# missing_in_columns_to_keep, then display its (rows, columns).
final_cleaned_col_df = final_cleaned_df.drop(list(missing_in_columns_to_keep), axis=1)
final_cleaned_col_df.shape

(9465497, 29)

In [11]:
# Write the reduced frame to CSV without the index column.
# NOTE(review): this is a relative path, so the file lands in the kernel's
# current working directory rather than the /content/drive/MyDrive folder
# used for the earlier save — confirm that is intended.
final_cleaned_col_df.to_csv("df_train_not_encoded.csv",index=False)