In [1]:
# Importing necessary libraries
import pandas as pd
import numpy as np

In [2]:
# Load the dataset
df = pd.read_csv('train.csv')

In [3]:
# Inspect the first few rows
print(df.head())

              Id  OrgId  IncidentId  AlertId                 Timestamp  \
0   180388628218      0         612   123247  2024-06-04T06:05:15.000Z   
1   455266534868     88         326   210035  2024-06-14T03:01:25.000Z   
2  1056561957389    809       58352   712507  2024-06-13T04:52:55.000Z   
3  1279900258736     92       32992   774301  2024-06-10T16:39:36.000Z   
4   214748368522    148        4359   188041  2024-06-15T01:08:07.000Z   

   DetectorId  AlertTitle           Category MitreTechniques   IncidentGrade  \
0           7           6      InitialAccess             NaN    TruePositive   
1          58          43       Exfiltration             NaN   FalsePositive   
2         423         298      InitialAccess           T1189   FalsePositive   
3           2           2  CommandAndControl             NaN  BenignPositive   
4           9          74          Execution             NaN    TruePositive   

   ... ResourceType Roles OSFamily OSVersion  AntispamDirection  \
0  ... 

In [4]:
# Summary statistics
print(df.describe())

                 Id         OrgId    IncidentId       AlertId    DetectorId  \
count  9.516837e+06  9.516837e+06  9.516837e+06  9.516837e+06  9.516837e+06   
mean   8.425494e+11  1.815800e+02  7.066349e+04  4.065188e+05  1.106724e+02   
std    4.962499e+11  3.867784e+02  1.208369e+05  4.592827e+05  4.351038e+02   
min    0.000000e+00  0.000000e+00  0.000000e+00  0.000000e+00  0.000000e+00   
25%    4.123169e+11  1.000000e+01  5.040000e+02  2.324200e+04  2.000000e+00   
50%    8.418136e+11  4.500000e+01  1.033600e+04  2.166520e+05  9.000000e+00   
75%    1.271310e+12  1.710000e+02  8.432900e+04  6.715770e+05  4.500000e+01   
max    1.709397e+12  6.147000e+03  5.997060e+05  1.721456e+06  9.522000e+03   

         AlertTitle      DeviceId        Sha256     IpAddress           Url  \
count  9.516837e+06  9.516837e+06  9.516837e+06  9.516837e+06  9.516837e+06   
mean   2.947315e+03  9.566476e+04  1.287191e+05  2.857506e+05  1.503317e+05   
std    1.146150e+04  1.635288e+04  3.399208e+04  1.

In [5]:
# Check data types and missing values
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9516837 entries, 0 to 9516836
Data columns (total 45 columns):
 #   Column              Dtype  
---  ------              -----  
 0   Id                  int64  
 1   OrgId               int64  
 2   IncidentId          int64  
 3   AlertId             int64  
 4   Timestamp           object 
 5   DetectorId          int64  
 6   AlertTitle          int64  
 7   Category            object 
 8   MitreTechniques     object 
 9   IncidentGrade       object 
 10  ActionGrouped       object 
 11  ActionGranular      object 
 12  EntityType          object 
 13  EvidenceRole        object 
 14  DeviceId            int64  
 15  Sha256              int64  
 16  IpAddress           int64  
 17  Url                 int64  
 18  AccountSid          int64  
 19  AccountUpn          int64  
 20  AccountObjectId     int64  
 21  AccountName         int64  
 22  DeviceName          int64  
 23  NetworkMessageId    int64  
 24  EmailClusterId      floa

In [6]:
import matplotlib.pyplot as plt
import seaborn as sns

In [7]:
# 1. Data Exploration and Understanding
def load_and_explore_data(file_path):
    df = pd.read_csv(file_path)
    print(df.info())
    print(df.describe())
    print(df['target'].value_counts(normalize=True))
    
    # Visualize target distribution
    plt.figure(figsize=(10, 6))
    sns.countplot(x='target', data=df)
    plt.title('Distribution of Target Variable')
    plt.show()
    
    return df

In [8]:
# 2. Data Preprocessing
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

def preprocess_data(df):
    # Identify numeric and categorical columns
    numeric_features = df.select_dtypes(include=['int64', 'float64']).columns
    categorical_features = df.select_dtypes(include=['object']).columns
    
    # Create preprocessing pipelines
    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ])
    
    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])
    
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_features),
            ('cat', categorical_transformer, categorical_features)
        ])
    
    return preprocessor

In [9]:
# 3. Data Splitting and Stratification
from sklearn.model_selection import train_test_split, cross_val_score

def split_data(df, target_column):
    X = df.drop(target_column, axis=1)
    y = df[target_column]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
    return X_train, X_test, y_train, y_test


In [10]:
# 4. Model Selection and Training
from sklearn.ensemble import RandomForestClassifier

def train_model(X_train, y_train, preprocessor):
    # Use Random Forest as an example
    model = Pipeline([
        ('preprocessor', preprocessor),
        ('classifier', RandomForestClassifier(n_estimators=100, random_state=42))
    ])
    
    model.fit(X_train, y_train)
    return model

In [11]:
# 5. Model Evaluation and Tuning
from sklearn.metrics import classification_report, confusion_matrix


def evaluate_model(model, X_test, y_test):
    y_pred = model.predict(X_test)
    print(classification_report(y_test, y_pred))
    
    cm = confusion_matrix(y_test, y_pred)
    plt.figure(figsize=(10, 8))
    sns.heatmap(cm, annot=True, fmt='d')
    plt.title('Confusion Matrix')
    plt.ylabel('Actual')
    plt.xlabel('Predicted')
    plt.show()

In [12]:
# 6. Model Interpretation
def interpret_model(model, X):
    feature_importance = model.named_steps['classifier'].feature_importances_
    feature_names = model.named_steps['preprocessor'].get_feature_names_out()
    
    importance_df = pd.DataFrame({'feature': feature_names, 'importance': feature_importance})
    importance_df = importance_df.sort_values('importance', ascending=False)
    
    plt.figure(figsize=(12, 8))
    sns.barplot(x='importance', y='feature', data=importance_df.head(20))
    plt.title('Top 20 Feature Importances')
    plt.show()

#### 7 and 8 

In [13]:
!pip install imbalanced-learn

Defaulting to user installation because normal site-packages is not writeable



[notice] A new release of pip is available: 24.1.2 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [14]:
df = pd.read_csv('train.csv')
print(df.columns)


Index(['Id', 'OrgId', 'IncidentId', 'AlertId', 'Timestamp', 'DetectorId',
       'AlertTitle', 'Category', 'MitreTechniques', 'IncidentGrade',
       'ActionGrouped', 'ActionGranular', 'EntityType', 'EvidenceRole',
       'DeviceId', 'Sha256', 'IpAddress', 'Url', 'AccountSid', 'AccountUpn',
       'AccountObjectId', 'AccountName', 'DeviceName', 'NetworkMessageId',
       'EmailClusterId', 'RegistryKey', 'RegistryValueName',
       'RegistryValueData', 'ApplicationId', 'ApplicationName',
       'OAuthApplicationId', 'ThreatFamily', 'FileName', 'FolderPath',
       'ResourceIdName', 'ResourceType', 'Roles', 'OSFamily', 'OSVersion',
       'AntispamDirection', 'SuspicionLevel', 'LastVerdict', 'CountryCode',
       'State', 'City'],
      dtype='object')


In [15]:
def load_and_explore_data(file_path, target_column=None):
    df = pd.read_csv(file_path)
    print(df.info())
    print(df.describe())
    
    if target_column and target_column in df.columns:
        print(df[target_column].value_counts(normalize=True))
        
        # Visualize target distribution
        plt.figure(figsize=(10, 6))
        sns.countplot(x=target_column, data=df)
        plt.title('Distribution of Target Variable')
        plt.show()
    else:
        print(f"Column '{target_column}' not found in the dataset.")

    return df


In [16]:
# 2. Data Preprocessing
def preprocess_data(df):
    # Identify numeric and categorical columns
    numeric_features = df.select_dtypes(include=['int64', 'float64']).columns
    categorical_features = df.select_dtypes(include=['object']).columns
    
    # Create preprocessing pipelines
    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ])
    
    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])
    
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_features),
            ('cat', categorical_transformer, categorical_features)
        ])
    
    return preprocessor



In [17]:
# 3. Data Splitting and Stratification
def split_data(df, target_column):
    X = df.drop(target_column, axis=1)
    y = df[target_column]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
    return X_train, X_test, y_train, y_test

In [18]:
# 4. Model Selection and Training
def train_model(X_train, y_train, preprocessor):
    # Use Random Forest as an example
    model = Pipeline([
        ('preprocessor', preprocessor),
        ('classifier', RandomForestClassifier(n_estimators=100, random_state=42))
    ])
    
    model.fit(X_train, y_train)
    return model

In [19]:
# 5. Model Evaluation and Tuning
def evaluate_model(model, X_test, y_test):
    y_pred = model.predict(X_test)
    print(classification_report(y_test, y_pred))
    
    cm = confusion_matrix(y_test, y_pred)
    plt.figure(figsize=(10, 8))
    sns.heatmap(cm, annot=True, fmt='d')
    plt.title('Confusion Matrix')
    plt.ylabel('Actual')
    plt.xlabel('Predicted')
    plt.show()

In [20]:
# 6. Model Interpretation
def interpret_model(model, X):
    feature_importance = model.named_steps['classifier'].feature_importances_
    feature_names = model.named_steps['preprocessor'].get_feature_names_out()
    
    importance_df = pd.DataFrame({'feature': feature_names, 'importance': feature_importance})
    importance_df = importance_df.sort_values('importance', ascending=False)
    
    plt.figure(figsize=(12, 8))
    sns.barplot(x='importance', y='feature', data=importance_df.head(20))
    plt.title('Top 20 Feature Importances')
    plt.show()

In [21]:
# if __name__ == "__main__":
#     # Load and explore data
#     df = load_and_explore_data('train.csv')
    
#     # Print column names
#     print("\nAvailable columns in the dataset:")
#     for i, col in enumerate(df.columns):
#         print(f"{i}: {col}")
    
#     # Ask user to input the index of the target column
#     target_column_index = int(input("\nPlease enter the index number of the target column (triage grade): "))
#     target_column = df.columns[target_column_index]
    
#     print(f"Selected target column: {target_column}")
    
#     # Load and explore data with the specified target column
#     df = load_and_explore_data('train.csv', target_column=target_column)
    
#     # Preprocess data
#     preprocessor = preprocess_data(df)
    
#     # Split data
#     X_train, X_test, y_train, y_test = split_data(df, target_column)
    
#     # Train model
#     model = train_model(X_train, y_train, preprocessor)
    
#     # Evaluate model
#     evaluate_model(model, X_test, y_test)
    
#     # Interpret model
#     interpret_model(model, X_train)


In [22]:

# # Main execution
# if __name__ == "__main__":
#     # Load and explore data without specifying target column
#     df = load_and_explore_data('train.csv')
    
#     # Print column names
#     print("\nAvailable columns in the dataset:")
#     for i, col in enumerate(df.columns):
#         print(f"{i}: {col}")
    
#     # Ask user to input the index of the target column
#     target_column_index = int(input("\nPlease enter the index number of the target column (triage grade): "))
#     target_column = df.columns[target_column_index]
    
#     print(f"Selected target column: {target_column}")
    
#     # Now proceed with the analysis using the selected target column
#     df = load_and_explore_data('train.csv', target_column=target_column)
    
#     # Preprocess data
#     preprocessor = preprocess_data(df)
    
#     # Split data
#     X_train, X_test, y_train, y_test = split_data(df, target_column)
    
#     # Train model
#     model = train_model(X_train, y_train, preprocessor)
    
#     # Evaluate model
#     evaluate_model(model, X_test, y_test)
    
#     # Interpret model
#     interpret_model(model, X_train)
    
#     # Final evaluation on test set
#     test_df = pd.read_csv('test.csv')
#     if target_column in test_df.columns:
#         X_test_final = test_df.drop(target_column, axis=1)
#         y_test_final = test_df[target_column]
        
#         print("Final Test Set Evaluation:")
#         evaluate_model(model, X_test_final, y_test_final)
#     else:
#         print(f"Target column '{target_column}' not found in test set. Unable to perform final evaluation.")

In [23]:
# 7. Final Evaluation on Test Set
from sklearn.metrics import classification_report, confusion_matrix, f1_score, precision_score, recall_score


def final_evaluation_on_test_set(model, test_df, target_column):
    # Check if target column exists in test set
    if target_column in test_df.columns:
        # Prepare test data
        X_test_final = test_df.drop(target_column, axis=1)
        y_test_final = test_df[target_column]
        
        # Make predictions
        y_pred_final = model.predict(X_test_final)
        
        # Compute metrics
        f1 = f1_score(y_test_final, y_pred_final, average='macro')
        precision = precision_score(y_test_final, y_pred_final, average='macro')
        recall = recall_score(y_test_final, y_pred_final, average='macro')
        
        print(f"Final Test Set Evaluation:")
        print(f"Macro-F1 Score: {f1:.4f}")
        print(f"Precision: {precision:.4f}")
        print(f"Recall: {recall:.4f}")
        
        # Detailed classification report
        print("\nClassification Report:")
        print(classification_report(y_test_final, y_pred_final))
        
        return f1, precision, recall
    else:
        print(f"Target column '{target_column}' not found in test set. Unable to perform final evaluation.")
        return None, None, None

In [24]:
# Baseline Model Evaluation
from sklearn.dummy import DummyClassifier


def evaluate_baseline_model(X_train, y_train, X_test, y_test):
    # Define a baseline model
    baseline_model = DummyClassifier(strategy='most_frequent')
    
    # Train the baseline model
    baseline_model.fit(X_train, y_train)
    
    # Make predictions
    y_pred_baseline = baseline_model.predict(X_test)
    
    # Compute metrics
    baseline_f1 = f1_score(y_test, y_pred_baseline, average='macro')
    baseline_precision = precision_score(y_test, y_pred_baseline, average='macro')
    baseline_recall = recall_score(y_test, y_pred_baseline, average='macro')
    
    print("Baseline Model Evaluation:")
    print(f"Macro-F1 Score: {baseline_f1:.4f}")
    print(f"Precision: {baseline_precision:.4f}")
    print(f"Recall: {baseline_recall:.4f}")
    
    return baseline_f1, baseline_precision, baseline_recall

In [25]:
# Main execution
if __name__ == "__main__":
    # Load and explore data
    df = load_and_explore_data('train.csv')

    # Print column names
    print("\nAvailable columns in the dataset:")
    for i, col in enumerate(df.columns):
        print(f"{i}: {col}")

    # Ask user to input the index of the target column
    target_column_index = int(input("\nPlease enter the index number of the target column: "))
    target_column = df.columns[target_column_index]

    print(f"Selected target column: {target_column}")

    # Load and explore data with the specified target column
    df = load_and_explore_data('train.csv', target_column=target_column)

    # Preprocess data
    preprocessor = preprocess_data(df)

    # Split data
    X_train, X_test, y_train, y_test = split_data(df, target_column)

    # Train model
    model = train_model(X_train, y_train, preprocessor)

    # Evaluate model
    evaluate_model(model, X_test, y_test)

    # Interpret model
    interpret_model(model, X_train)

    # Evaluate baseline model
    baseline_f1, baseline_precision, baseline_recall = evaluate_baseline_model(X_train, y_train, X_test, y_test)

    # Final evaluation on test set
    test_df = pd.read_csv('test.csv')
    final_evaluation_on_test_set(model, test_df, target_column)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9516837 entries, 0 to 9516836
Data columns (total 45 columns):
 #   Column              Dtype  
---  ------              -----  
 0   Id                  int64  
 1   OrgId               int64  
 2   IncidentId          int64  
 3   AlertId             int64  
 4   Timestamp           object 
 5   DetectorId          int64  
 6   AlertTitle          int64  
 7   Category            object 
 8   MitreTechniques     object 
 9   IncidentGrade       object 
 10  ActionGrouped       object 
 11  ActionGranular      object 
 12  EntityType          object 
 13  EvidenceRole        object 
 14  DeviceId            int64  
 15  Sha256              int64  
 16  IpAddress           int64  
 17  Url                 int64  
 18  AccountSid          int64  
 19  AccountUpn          int64  
 20  AccountObjectId     int64  
 21  AccountName         int64  
 22  DeviceName          int64  
 23  NetworkMessageId    int64  
 24  EmailClusterId      floa

In [None]:
# df = pd.read_csv('train.csv')
# print(df.head())  # Check the first few rows to confirm correct loading
