<a href="https://colab.research.google.com/github/LiyangPang/Berkeley-CS61B/blob/main/Microsoft_Malware_Prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
! pip install kaggle



In [None]:
! mkdir ~/.kaggle
! cp kaggle.json ~/.kaggle/
! chmod 600 ~/.kaggle/kaggle.json

In [None]:
# Download the microsoft-malware-prediction competition archive with the
# Kaggle CLI (the recorded output shows a ~1.5 GB zip saved to /content).
! kaggle competitions download -c microsoft-malware-prediction

Downloading microsoft-malware-prediction.zip to /content
 99% 1.53G/1.54G [00:17<00:00, 114MB/s]
100% 1.54G/1.54G [00:17<00:00, 95.7MB/s]


In [None]:
! unzip microsoft-malware-prediction.zip

Archive:  microsoft-malware-prediction.zip
  inflating: sample_submission.csv   
  inflating: test.csv                
  inflating: train.csv               


In [None]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from scipy.sparse import vstack, csr_matrix, save_npz, load_npz
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import StratifiedKFold, cross_validate

import gc
gc.enable()

# Read Data

In [None]:
from sklearn.metrics import classification_report


In [None]:
# Column -> dtype mapping handed to pd.read_csv so the frames are parsed with
# compact types instead of pandas' 64-bit / object defaults.
#
# NOTE(review): float16 has an 11-bit significand, so integers above 2048 are
# not all exactly representable, and int8 tops out at 127 -- confirm that the
# identifier-style columns stored with these types stay within range.
dtypes = {
    'MachineIdentifier': 'category',
    'ProductName': 'category',
    'EngineVersion': 'category',
    'AppVersion': 'category',
    'AvSigVersion': 'category',
    'IsBeta': 'int8',
    'RtpStateBitfield': 'float16',
    'IsSxsPassiveMode': 'int8',
    'DefaultBrowsersIdentifier': 'float16',
    'AVProductStatesIdentifier': 'float32',
    'AVProductsInstalled': 'float16',
    'AVProductsEnabled': 'float16',
    'HasTpm': 'int8',
    'CountryIdentifier': 'int16',
    'CityIdentifier': 'float32',
    'OrganizationIdentifier': 'float16',
    'GeoNameIdentifier': 'float16',
    'LocaleEnglishNameIdentifier': 'int8',
    'Platform': 'category',
    'Processor': 'category',
    'OsVer': 'category',
    'OsBuild': 'int16',
    'OsSuite': 'int16',
    'OsPlatformSubRelease': 'category',
    'OsBuildLab': 'category',
    'SkuEdition': 'category',
    'IsProtected': 'float16',
    'AutoSampleOptIn': 'int8',
    'PuaMode': 'category',
    'SMode': 'float16',
    'IeVerIdentifier': 'float16',
    'SmartScreen': 'category',
    'Firewall': 'float16',
    'UacLuaenable': 'float32',
    'Census_MDC2FormFactor': 'category',
    'Census_DeviceFamily': 'category',
    'Census_OEMNameIdentifier': 'float16',
    'Census_OEMModelIdentifier': 'float32',
    'Census_ProcessorCoreCount': 'float16',
    'Census_ProcessorManufacturerIdentifier': 'float16',
    'Census_ProcessorModelIdentifier': 'float16',
    'Census_ProcessorClass': 'category',
    'Census_PrimaryDiskTotalCapacity': 'float32',
    'Census_PrimaryDiskTypeName': 'category',
    'Census_SystemVolumeTotalCapacity': 'float32',
    'Census_HasOpticalDiskDrive': 'int8',
    'Census_TotalPhysicalRAM': 'float32',
    'Census_ChassisTypeName': 'category',
    'Census_InternalPrimaryDiagonalDisplaySizeInInches': 'float16',
    'Census_InternalPrimaryDisplayResolutionHorizontal': 'float16',
    'Census_InternalPrimaryDisplayResolutionVertical': 'float16',
    'Census_PowerPlatformRoleName': 'category',
    'Census_InternalBatteryType': 'category',
    'Census_InternalBatteryNumberOfCharges': 'float32',
    'Census_OSVersion': 'category',
    'Census_OSArchitecture': 'category',
    'Census_OSBranch': 'category',
    'Census_OSBuildNumber': 'int16',
    'Census_OSBuildRevision': 'int32',
    'Census_OSEdition': 'category',
    'Census_OSSkuName': 'category',
    'Census_OSInstallTypeName': 'category',
    'Census_OSInstallLanguageIdentifier': 'float16',
    'Census_OSUILocaleIdentifier': 'int16',
    'Census_OSWUAutoUpdateOptionsName': 'category',
    'Census_IsPortableOperatingSystem': 'int8',
    'Census_GenuineStateName': 'category',
    'Census_ActivationChannel': 'category',
    'Census_IsFlightingInternal': 'float16',
    'Census_IsFlightsDisabled': 'float16',
    'Census_FlightRing': 'category',
    'Census_ThresholdOptIn': 'float16',
    'Census_FirmwareManufacturerIdentifier': 'float16',
    'Census_FirmwareVersionIdentifier': 'float32',
    'Census_IsSecureBootEnabled': 'int8',
    'Census_IsWIMBootEnabled': 'float16',
    'Census_IsVirtualDevice': 'float16',
    'Census_IsTouchEnabled': 'int8',
    'Census_IsPenCapable': 'int8',
    'Census_IsAlwaysOnAlwaysConnectedCapable': 'float16',
    'Wdft_IsGamer': 'float16',
    'Wdft_RegionIdentifier': 'float16',
    'HasDetections': 'int8',
}

In [None]:

def _read_with_row_ids(csv_path):
    """Read one competition CSV using `dtypes`, then overwrite the
    MachineIdentifier column with the frame's row index cast to uint32."""
    frame = pd.read_csv(csv_path, dtype=dtypes, low_memory=True)
    frame['MachineIdentifier'] = frame.index.astype('uint32')
    return frame


print('Download Train and Test Data.\n')
# Train is fully processed before test is read, matching the original order.
train = _read_with_row_ids('train.csv')
test = _read_with_row_ids('test.csv')

Download Train and Test Data.



In [None]:
# Print the index range, column list and dtypes of the loaded test frame.
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7853253 entries, 0 to 7853252
Data columns (total 82 columns):
 #   Column                                             Dtype   
---  ------                                             -----   
 0   MachineIdentifier                                  uint64  
 1   ProductName                                        category
 2   EngineVersion                                      category
 3   AppVersion                                         category
 4   AvSigVersion                                       category
 5   IsBeta                                             int8    
 6   RtpStateBitfield                                   float16 
 7   IsSxsPassiveMode                                   int8    
 8   DefaultBrowsersIdentifier                          float16 
 9   AVProductStatesIdentifier                          float32 
 10  AVProductsInstalled                                float16 
 11  AVProductsEnabled                    

In [None]:
# Print the index range, column list and dtypes of the loaded training frame.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8921483 entries, 0 to 8921482
Data columns (total 83 columns):
 #   Column                                             Dtype   
---  ------                                             -----   
 0   MachineIdentifier                                  uint64  
 1   ProductName                                        category
 2   EngineVersion                                      category
 3   AppVersion                                         category
 4   AvSigVersion                                       category
 5   IsBeta                                             int8    
 6   RtpStateBitfield                                   float16 
 7   IsSxsPassiveMode                                   int8    
 8   DefaultBrowsersIdentifier                          float16 
 9   AVProductStatesIdentifier                          float32 
 10  AVProductsInstalled                                float16 
 11  AVProductsEnabled                    

# LightGBM Gradient Boosted Decision Tree

In [None]:
# RAM is limited, so the models below are fitted on a fixed-seed random sample
# of 500,000 training rows rather than on the full frame.
subset = train.sample(n=500000, random_state=42)

# Target column first, then every remaining column as a feature.
# NOTE(review): X_train still contains MachineIdentifier, which the loading
# cell replaced with the row index -- confirm it is meant to be a feature.
Y_train = subset['HasDetections']
X_train = subset.drop(columns='HasDetections')

In [None]:
# Preprocessing: replace every category-typed feature column with integer
# labels so the gradient-boosting estimators receive purely numeric input.
le = LabelEncoder()

category_columns = X_train.select_dtypes(include='category').columns
category_columns_array = [*category_columns]

# The single encoder is re-fitted per column, so after the loop `le` only
# remembers the classes of the last column it processed.
for category in category_columns_array:
    encoded_labels = le.fit_transform(X_train[category])
    X_train[category] = encoded_labels

In [None]:
# Metric names passed to cross_validate, and the keys under which
# cross_validate reports the per-fold test scores for them.
scoring_metrics = ['accuracy', 'precision', 'recall', 'f1']
metrics = [f'test_{name}' for name in scoring_metrics]


def print_metrics(results, metrics=metrics):
  """Print the per-fold scores stored in a cross_validate result.

  Args:
    results: mapping from result key (e.g. 'test_accuracy') to an iterable
      holding one score per fold.
    metrics: keys of `results` to print, in order.

  For each key the function prints a header line, then every fold score
  rounded to three decimals and followed by a tab, then a blank line.
  """
  for metric_key in metrics:
    print(f"{metric_key}:")
    fold_line = "".join(
        f"Fold {fold_no}: {round(fold_score, 3)}\t"
        for fold_no, fold_score in enumerate(results[metric_key], start=1)
    )
    print(fold_line, end="")
    print("\n")


In [None]:
# Base LightGBM settings shared by the cross-validation cells that follow.
parameters = dict(objective='binary', boosting_type='gbdt')

# Stratified 5-fold cross validation with fixed-seed shuffling, so every
# experiment is scored on the same folds.
folds = 5
skf = StratifiedKFold(shuffle=True, random_state=42, n_splits=folds)



In [None]:
# No regularization
#
# Both penalties are pinned to 0 on a copy of `parameters`. The regularization
# cells further down write lambda_l1 / lambda_l2 into that shared dict, so
# reading it directly would silently turn this baseline into a regularized
# run whenever the cell is executed again after them.
baseline_parameters = {**parameters, 'lambda_l1': 0.0, 'lambda_l2': 0.0}

cv_results = cross_validate(
    lgb.LGBMClassifier(**baseline_parameters),
    X_train, Y_train,
    cv=skf,
    scoring=scoring_metrics,
    verbose=1,
)

print_metrics(cv_results)

[LightGBM] [Info] Number of positive: 199962, number of negative: 200038
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.200240 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 5327
[LightGBM] [Info] Number of data points in the train set: 400000, number of used features: 80
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.499905 -> initscore=-0.000380
[LightGBM] [Info] Start training from score -0.000380
[LightGBM] [Info] Number of positive: 199962, number of negative: 200038
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.196920 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 5281
[LightGBM] [Info] Number of data points in the train set: 400000, number of used features: 80
[LightGBM] [

In [None]:
# L1 regularization
#
# The penalties are set on a copy of `parameters` instead of on the shared
# dict: the experiment no longer leaks lambda_l1 into other cells, and pinning
# lambda_l2 to 0 keeps it L1-only even if an L2 cell was executed earlier.
l1_parameters = {**parameters, 'lambda_l1': 0.1, 'lambda_l2': 0.0}

cv_results_l1 = cross_validate(
    lgb.LGBMClassifier(**l1_parameters),
    X_train, Y_train,
    cv=skf,
    scoring=scoring_metrics,
    verbose=1,
)

print_metrics(cv_results_l1)

[LightGBM] [Info] Number of positive: 199962, number of negative: 200038
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.196050 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 5327
[LightGBM] [Info] Number of data points in the train set: 400000, number of used features: 80
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.499905 -> initscore=-0.000380
[LightGBM] [Info] Start training from score -0.000380
[LightGBM] [Info] Number of positive: 199962, number of negative: 200038
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.298405 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 5281
[LightGBM] [Info] Number of data points in the train set: 400000, number of used features: 80
[LightGBM] [

In [None]:
# L2 regularization
# Switch the shared parameter dict to an L2-only configuration: the L1 penalty
# is reset to 0 and the L2 penalty is set to 0.1.
parameters.update(lambda_l1=0, lambda_l2=0.1)

l2_estimator = lgb.LGBMClassifier(**parameters)
cv_results_l2 = cross_validate(
    l2_estimator,
    X_train,
    Y_train,
    scoring=scoring_metrics,
    cv=skf,
    verbose=1,
)

print_metrics(cv_results_l2)

[LightGBM] [Info] Number of positive: 199962, number of negative: 200038
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.194105 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 5327
[LightGBM] [Info] Number of data points in the train set: 400000, number of used features: 80
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.499905 -> initscore=-0.000380
[LightGBM] [Info] Start training from score -0.000380
[LightGBM] [Info] Number of positive: 199962, number of negative: 200038
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.283249 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 5281
[LightGBM] [Info] Number of data points in the train set: 400000, number of used features: 80
[LightGBM] [

In [None]:
# l1 and l2 regularization
#
# Both penalties are stated explicitly on a copy of `parameters`, so the
# experiment does not depend on lambda_l2 having been left in the shared dict
# by whichever cell happened to run before this one.
l1_l2_parameters = {**parameters, 'lambda_l1': 0.1, 'lambda_l2': 0.1}

cv_results_l1_l2 = cross_validate(
    lgb.LGBMClassifier(**l1_l2_parameters),
    X_train, Y_train,
    cv=skf,
    scoring=scoring_metrics,
    verbose=1,
)

print_metrics(cv_results_l1_l2)

[LightGBM] [Info] Number of positive: 199962, number of negative: 200038
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.190087 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 5327
[LightGBM] [Info] Number of data points in the train set: 400000, number of used features: 80
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.499905 -> initscore=-0.000380
[LightGBM] [Info] Start training from score -0.000380
[LightGBM] [Info] Number of positive: 199962, number of negative: 200038
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.236498 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 5281
[LightGBM] [Info] Number of data points in the train set: 400000, number of used features: 80
[LightGBM] [

# scikit-learn Gradient Boosting Classifier

In [None]:
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Fixed-seed, shuffled stratified folds for the scikit-learn experiments.
stratified_kfold = StratifiedKFold(
    shuffle=True,
    random_state=42,
    n_splits=folds,
)

# Histogram-based gradient boosting with default hyper-parameters and a fixed
# seed; the cells below cross-validate this estimator.
clf = HistGradientBoostingClassifier(random_state=42)

In [None]:
# No regularization
#
# A dedicated estimator with l2_regularization pinned to 0 is scored here
# instead of the shared `clf`: the L2 cell below calls clf.set_params(...), so
# re-running this cell after it would otherwise report regularized scores
# under the "no regularization" heading.
baseline_clf = HistGradientBoostingClassifier(random_state=42, l2_regularization=0.0)
results = cross_validate(baseline_clf, X_train, Y_train, cv=stratified_kfold, scoring=scoring_metrics)
print_metrics(results)

test_accuracy:
Fold 1: 0.651	Fold 2: 0.65	Fold 3: 0.65	Fold 4: 0.651	Fold 5: 0.65	

test_precision:
Fold 1: 0.654	Fold 2: 0.655	Fold 3: 0.653	Fold 4: 0.652	Fold 5: 0.652	

test_recall:
Fold 1: 0.64	Fold 2: 0.637	Fold 3: 0.642	Fold 4: 0.645	Fold 5: 0.643	

test_f1:
Fold 1: 0.647	Fold 2: 0.645	Fold 3: 0.647	Fold 4: 0.649	Fold 5: 0.647	



In [None]:
# Find best value for L2 regularization

from sklearn.model_selection import GridSearchCV

param_grid = {'l2_regularization': [0.01, 0.1, 0.5, 1.0]}

# The search uses the same seed and the same shuffled stratified folds as the
# cross-validation cells, so the selected value is tuned under the conditions
# it is later evaluated in and the search is repeatable across runs.
grid_search = GridSearchCV(
    HistGradientBoostingClassifier(random_state=42),
    param_grid,
    cv=stratified_kfold,
)
grid_search.fit(X_train, Y_train)

best_l2_regularization = grid_search.best_params_['l2_regularization']
print(best_l2_regularization)


0.5


In [None]:
# L2 regularization
# Apply the strength chosen by the grid search to the shared estimator, then
# cross-validate it on the same folds as the unregularized run.
new_params = dict(l2_regularization=best_l2_regularization)
clf.set_params(**new_params)

results_l2 = cross_validate(
    estimator=clf,
    X=X_train,
    y=Y_train,
    scoring=scoring_metrics,
    cv=stratified_kfold,
)

print_metrics(results_l2)

test_accuracy:
Fold 1: 0.651	Fold 2: 0.65	Fold 3: 0.65	Fold 4: 0.651	Fold 5: 0.65	

test_precision:
Fold 1: 0.654	Fold 2: 0.654	Fold 3: 0.653	Fold 4: 0.653	Fold 5: 0.652	

test_recall:
Fold 1: 0.641	Fold 2: 0.635	Fold 3: 0.641	Fold 4: 0.646	Fold 5: 0.645	

test_f1:
Fold 1: 0.648	Fold 2: 0.644	Fold 3: 0.647	Fold 4: 0.649	Fold 5: 0.648	



# Logistic Regression

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
import pandas as pd
import numpy as np

# Fixed-seed random sample of 100,000 training rows.
train_sample = train.sample(n=100000, random_state=1)

# Target column, and every other column as features.
y_train = train_sample['HasDetections']
X_train = train_sample.drop(columns='HasDetections')

# Route columns to a transformer by dtype; columns whose dtype is in neither
# list are not handed to any transformer.
numerical_cols = X_train.select_dtypes(
    include=['int8', 'int16', 'int32', 'float16', 'float32', 'float64']
).columns
categorical_cols = X_train.select_dtypes(include=['category', 'object']).columns

# Numeric features: mean-impute missing values, then standardize.
numerical_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler()),
    ]
)

# Categorical features: fill gaps with the literal 'missing', then one-hot
# encode; categories unseen during fit are ignored at transform time.
categorical_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ]
)

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols),
    ]
)

# Fit the preprocessing on the sample and transform it in one pass.
X_train_preprocessed = preprocessor.fit_transform(X_train)


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.model_selection import train_test_split

# Split the raw sample BEFORE fitting the preprocessing, so the imputation
# means, scaling statistics and one-hot vocabulary are learned from the
# training rows only and the validation rows stay unseen. Splitting from
# `train_sample` (instead of the `y_train` name this cell rebinds) also keeps
# the cell re-runnable.
X_train_raw, X_val_raw, y_train, y_val = train_test_split(
    train_sample.drop(columns='HasDetections'),
    train_sample['HasDetections'],
    test_size=0.2,
    random_state=0,
)
X_train = preprocessor.fit_transform(X_train_raw)
X_val = preprocessor.transform(X_val_raw)

# Building the Logistic Regression model. max_iter is raised above the
# default because the solver previously stopped at the iteration limit before
# converging (see the warning recorded with this cell's output).
model = LogisticRegression(max_iter=1000, random_state=0)
model.fit(X_train, y_train)

# Hard class predictions for the report; positive-class probabilities for
# ROC-AUC, which ranks scores and is not meaningful on 0/1 labels.
y_pred = model.predict(X_val)
y_val_proba = model.predict_proba(X_val)[:, 1]

# Evaluating the model
print(classification_report(y_val, y_pred))
print("ROC-AUC Score:", roc_auc_score(y_val, y_val_proba))


              precision    recall  f1-score   support

           0       0.62      0.63      0.62     10020
           1       0.62      0.61      0.61      9980

    accuracy                           0.62     20000
   macro avg       0.62      0.62      0.62     20000
weighted avg       0.62      0.62      0.62     20000

ROC-AUC Score: 0.6181321725286902


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


# Deep Learning Models (Neural Networks)

In [None]:
from sklearn.preprocessing import FunctionTransformer

# Assuming you have already imported necessary libraries and defined categorical_cols and numerical_cols

# A custom transformer function to convert sparse arrays to dense
def to_dense(tensor):
    """Return `tensor` as a dense array when it is a SciPy sparse container.

    Args:
        tensor: a SciPy sparse matrix/array, or any other object.

    Returns:
        The result of `tensor.toarray()` for sparse input; otherwise the very
        same object that was passed in.
    """
    # Imported locally: no earlier cell binds the name `sparse`, so relying on
    # a global here would raise NameError the first time the function runs.
    import scipy.sparse as sparse

    # issparse() recognises every SciPy sparse container, not only the
    # spmatrix family.
    if sparse.issparse(tensor):
        return tensor.toarray()
    return tensor

# Rebuild the categorical transformer with a final densifying step:
# fill missing values with the literal 'missing', one-hot encode (sparse
# output, unknown categories ignored), then pass the result through to_dense.
# NOTE(review): in the cells visible here this transformer is not placed into
# a ColumnTransformer before `categorical_transformer` is reassigned further
# down -- confirm whether this definition is still needed.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=True)),
    ('to_dense', FunctionTransformer(to_dense, accept_sparse=True))
])

# Proceed with the ColumnTransformer as before


In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
import scipy.sparse as sparse

# Seed Python, NumPy and TensorFlow in one call so weight initialisation,
# dropout masks and batch shuffling are repeatable between runs.
tf.keras.utils.set_random_seed(42)


def _dense_float32(features):
    """Return `features` as a dense float32 NumPy array.

    Sparse input is cast while still sparse and only then densified, so no
    dense float64 copy of the one-hot matrix is ever allocated.
    """
    if sparse.issparse(features):
        return features.astype(np.float32).toarray()
    return np.asarray(features, dtype=np.float32)


# Define the model: two ReLU hidden layers, each followed by 50% dropout, and
# a sigmoid unit that outputs the positive-class probability.
model = Sequential()
model.add(Dense(64, activation='relu', input_shape=(X_train.shape[1],)))
model.add(Dropout(0.5))
model.add(Dense(32, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Convert the features and labels to dense NumPy arrays for Keras
X_train_np = _dense_float32(X_train)
y_train_np = np.array(y_train)

X_val_np = _dense_float32(X_val)
y_val_np = np.array(y_val)

# Train the model
history = model.fit(X_train_np, y_train_np, epochs=10, batch_size=256, validation_data=(X_val_np, y_val_np))


In [None]:
import scipy.sparse as sparse

# Densify the validation features when they are a SciPy sparse matrix;
# any other input is used as it is.
X_val_dense = X_val.toarray() if isinstance(X_val, sparse.spmatrix) else X_val


In [None]:
from sklearn.metrics import classification_report, roc_auc_score

# Predict on the dense validation set: the sigmoid output is the predicted
# probability of the positive class, one row per sample.
y_val_proba = model.predict(X_val_dense)
y_pred = (y_val_proba > 0.5).astype("int32")  # Convert probabilities to binary predictions

# Generate classification report
print(classification_report(y_val, y_pred))
# ROC-AUC is a ranking metric, so it is computed from the probabilities rather
# than from the thresholded 0/1 labels.
print("ROC-AUC Score:", roc_auc_score(y_val, y_val_proba.ravel()))


              precision    recall  f1-score   support

           0       0.61      0.71      0.66      4973
           1       0.66      0.55      0.60      5027

    accuracy                           0.63     10000
   macro avg       0.63      0.63      0.63     10000
weighted avg       0.63      0.63      0.63     10000

ROC-AUC Score: 0.6307131315949173


# XGBoost (Extreme Gradient Boosting)

In [None]:

import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.impute import SimpleImputer


# Load a subset of your dataset
n_rows = 100000  # Adjust this number based on your RAM capacity
# The subset gets its own name: rebinding `train` to this 100k-row frame would
# make the later `train.sample(n=500000, ...)` cell fail when the notebook is
# run top to bottom.
train_head = pd.read_csv('train.csv', dtype=dtypes, nrows=n_rows, low_memory=True)

# Separate features and target variable.
# MachineIdentifier is excluded as well: it is parsed as a category here and
# would be one-hot encoded like any other categorical feature.
# NOTE(review): presumably it holds one distinct value per row -- confirm.
X = train_head.drop(columns=['HasDetections', 'MachineIdentifier'])
y = train_head['HasDetections']

# Preprocess data. 'int32' and 'float64' are listed so the numeric selection
# matches the one used for the logistic-regression preprocessing; without
# 'int32', columns declared with that dtype are silently dropped.
numerical_cols = X.select_dtypes(
    include=['int8', 'int16', 'int32', 'float16', 'float32', 'float64']
).columns
categorical_cols = X.select_dtypes(include=['category', 'object']).columns

# Create transformers for preprocessing
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),  # Impute missing values in numerical data
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),  # Impute missing values in categorical data
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

# Reduce the number of features using feature selection
# Adjust 'k' to select the number of top features you want to keep
feature_selector = SelectKBest(f_classif, k=100)

# Splitting the dataset into the Training set and Test set
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a pipeline: preprocessing and feature selection are fitted inside the
# pipeline, i.e. on the training split only.
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('feature_selector', feature_selector),
                           ('classifier', xgb.XGBClassifier(objective='binary:logistic', eval_metric='logloss'))])

# Train the model
pipeline.fit(X_train, y_train)

# Evaluate the model
val_accuracy = pipeline.score(X_val, y_val)
print(f"Validation Accuracy: {val_accuracy:.2f}")


  f = msb / msw


Validation Accuracy: 0.64


In [None]:
# Class predictions of the fitted pipeline for the held-out rows.
y_val_pred = pipeline.predict(X_val)

# Per-class precision / recall / F1 table with readable class labels.
class_labels = ['Class 0', 'Class 1']
report = classification_report(y_val, y_val_pred, target_names=class_labels)
print(report)


              precision    recall  f1-score   support

     Class 0       0.64      0.62      0.63      9969
     Class 1       0.63      0.65      0.64     10031

    accuracy                           0.64     20000
   macro avg       0.64      0.64      0.64     20000
weighted avg       0.64      0.64      0.64     20000



# scikit-learn Gradient Boosting Classifier with PCA and feature selection

In [None]:
# import libraries
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.decomposition import PCA
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.feature_selection import SelectKBest, f_classif

In [None]:
# prepare data: fixed-seed random sample of 500,000 training rows
subset = train.sample(n=500000, random_state=42)
X = subset.drop(columns='HasDetections')
Y = subset['HasDetections']

# Preprocess data. 'int32' and 'float64' are listed so the numeric selection
# matches the one used for the logistic-regression preprocessing; without
# 'int32', columns declared with that dtype are silently dropped. Columns whose
# dtype is in neither list are not routed to any transformer.
numerical_cols = X.select_dtypes(
    include=['int8', 'int16', 'int32', 'float16', 'float32', 'float64']
).columns
categorical_cols = X.select_dtypes(include=['category', 'object']).columns

# Splitting the dataset into the Training set and Test set
X_train, X_val, y_train, y_val = train_test_split(X, Y, test_size=0.2, random_state=42)

# Create transformers for preprocessing
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),  # Impute missing values in numerical data
    ('scaler', StandardScaler())
])

# Categories are mapped to integer codes; categories not seen during fit are
# encoded as -1 at transform time.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),  # Impute missing values in categorical data
    ('encoder', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

In [None]:
# No PCA or feature selection

# Create a pipeline; the classifier is seeded so repeated runs give the same
# model.
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('classifier', HistGradientBoostingClassifier(class_weight='balanced', random_state=42))])

# Train the model
pipeline.fit(X_train, y_train)

# Evaluate the model
val_accuracy = pipeline.score(X_val, y_val)
print(f"Validation Accuracy: {val_accuracy:.2f}")

# Predict on the validation set: hard labels for the report, positive-class
# probabilities for ROC AUC (a ranking metric that needs scores, not labels).
y_val_pred = pipeline.predict(X_val)
y_val_proba = pipeline.predict_proba(X_val)[:, 1]

# Generate the classification report
report = classification_report(y_val, y_val_pred, target_names=['Class 0', 'Class 1'])
print(report)
print("ROC AUC score: ", round(roc_auc_score(y_val, y_val_proba), 3))

Validation Accuracy: 0.65
              precision    recall  f1-score   support

     Class 0       0.65      0.66      0.65     49902
     Class 1       0.65      0.64      0.65     50098

    accuracy                           0.65    100000
   macro avg       0.65      0.65      0.65    100000
weighted avg       0.65      0.65      0.65    100000

ROC AUC score:  0.648


In [None]:
# With PCA
n_components = 70

# Create a pipeline: preprocessing, projection onto the leading principal
# components, then a seeded classifier so repeated runs give the same model.
pipeline_pca = Pipeline(steps=[('preprocessor', preprocessor),
                           ('pca', PCA(n_components=n_components)),
                           ('classifier', HistGradientBoostingClassifier(class_weight='balanced', random_state=42))])

# Train the model
pipeline_pca.fit(X_train, y_train)

# Evaluate the model
val_accuracy_pca = pipeline_pca.score(X_val, y_val)
print(f"Validation Accuracy: {val_accuracy_pca:.2f}")

# Predict on the validation set: hard labels for the report, positive-class
# probabilities for ROC AUC (a ranking metric that needs scores, not labels).
y_val_pred_pca = pipeline_pca.predict(X_val)
y_val_proba_pca = pipeline_pca.predict_proba(X_val)[:, 1]

# Generate the classification report
report_pca = classification_report(y_val, y_val_pred_pca, target_names=['Class 0', 'Class 1'])
print(report_pca)
print("ROC AUC score: ", round(roc_auc_score(y_val, y_val_proba_pca), 3))

Validation Accuracy: 0.64
              precision    recall  f1-score   support

     Class 0       0.63      0.65      0.64     49902
     Class 1       0.64      0.62      0.63     50098

    accuracy                           0.64    100000
   macro avg       0.64      0.64      0.64    100000
weighted avg       0.64      0.64      0.64    100000

ROC AUC score:  0.636


In [None]:
# With feature selection: keep the 50 features with the highest ANOVA F-score.
feature_selector = SelectKBest(f_classif, k=50)
# The classifier is seeded so repeated runs give the same model.
pipeline_with_feature_selector = Pipeline(steps=[('preprocessor', preprocessor),
                           ('feature_selector', feature_selector),
                           ('classifier', HistGradientBoostingClassifier(class_weight='balanced', random_state=42))])
pipeline_with_feature_selector.fit(X_train, y_train)

# Evaluate the model
val_accuracy_feature_selection = pipeline_with_feature_selector.score(X_val, y_val)
print(f"Validation Accuracy: {val_accuracy_feature_selection:.2f}")

# Predict on the validation set: hard labels for the report, positive-class
# probabilities for ROC AUC (a ranking metric that needs scores, not labels).
y_val_pred_feature_selection = pipeline_with_feature_selector.predict(X_val)
y_val_proba_feature_selection = pipeline_with_feature_selector.predict_proba(X_val)[:, 1]

# Generate the classification report
report_feature_selection = classification_report(y_val, y_val_pred_feature_selection, target_names=['Class 0', 'Class 1'])
print(report_feature_selection)
print("ROC AUC score: ", round(roc_auc_score(y_val, y_val_proba_feature_selection), 3))

  f = msb / msw


Validation Accuracy: 0.65
              precision    recall  f1-score   support

     Class 0       0.64      0.65      0.65     49902
     Class 1       0.65      0.64      0.64     50098

    accuracy                           0.65    100000
   macro avg       0.65      0.65      0.65    100000
weighted avg       0.65      0.65      0.65    100000

ROC AUC score:  0.646
