In [1]:
import pandas as pd
from xgboost import XGBClassifier
import numpy as np

In [2]:
# Per-column dtypes handed to pd.read_csv when the test file is loaded.
# Keys appear in the same order as in the original literal.
dtypes = dict(
    MachineIdentifier='category',
    ProductName='category',
    EngineVersion='category',
    AppVersion='category',
    AvSigVersion='category',
    IsBeta='int8',
    RtpStateBitfield='float32',
    IsSxsPassiveMode='int8',
    DefaultBrowsersIdentifier='float32',
    AVProductStatesIdentifier='float32',
    AVProductsInstalled='float32',
    AVProductsEnabled='float32',
    HasTpm='int8',
    CountryIdentifier='int16',
    CityIdentifier='float32',
    OrganizationIdentifier='float32',
    GeoNameIdentifier='float32',
    LocaleEnglishNameIdentifier='int8',
    Platform='category',
    Processor='category',
    OsVer='category',
    OsBuild='int16',
    OsSuite='int16',
    OsPlatformSubRelease='category',
    OsBuildLab='category',
    SkuEdition='category',
    IsProtected='float32',
    AutoSampleOptIn='int8',
    PuaMode='category',
    SMode='float32',
    IeVerIdentifier='float32',
    SmartScreen='category',
    Firewall='float32',
    UacLuaenable='float32',
    # Census_* columns
    Census_MDC2FormFactor='category',
    Census_DeviceFamily='category',
    Census_OEMNameIdentifier='float32',
    Census_OEMModelIdentifier='float32',
    Census_ProcessorCoreCount='float32',
    Census_ProcessorManufacturerIdentifier='float32',
    Census_ProcessorModelIdentifier='float32',
    Census_ProcessorClass='category',
    Census_PrimaryDiskTotalCapacity='float32',
    Census_PrimaryDiskTypeName='category',
    Census_SystemVolumeTotalCapacity='float32',
    Census_HasOpticalDiskDrive='int8',
    Census_TotalPhysicalRAM='float32',
    Census_ChassisTypeName='category',
    Census_InternalPrimaryDiagonalDisplaySizeInInches='float32',
    Census_InternalPrimaryDisplayResolutionHorizontal='float32',
    Census_InternalPrimaryDisplayResolutionVertical='float32',
    Census_PowerPlatformRoleName='category',
    Census_InternalBatteryType='category',
    Census_InternalBatteryNumberOfCharges='float32',
    Census_OSVersion='category',
    Census_OSArchitecture='category',
    Census_OSBranch='category',
    Census_OSBuildNumber='int16',
    Census_OSBuildRevision='int32',
    Census_OSEdition='category',
    Census_OSSkuName='category',
    Census_OSInstallTypeName='category',
    Census_OSInstallLanguageIdentifier='float32',
    Census_OSUILocaleIdentifier='int16',
    Census_OSWUAutoUpdateOptionsName='category',
    Census_IsPortableOperatingSystem='int8',
    Census_GenuineStateName='category',
    Census_ActivationChannel='category',
    Census_IsFlightingInternal='float32',
    Census_IsFlightsDisabled='float32',
    Census_FlightRing='category',
    Census_ThresholdOptIn='float32',
    Census_FirmwareManufacturerIdentifier='float32',
    Census_FirmwareVersionIdentifier='float32',
    Census_IsSecureBootEnabled='int8',
    Census_IsWIMBootEnabled='float32',
    Census_IsVirtualDevice='float32',
    Census_IsTouchEnabled='int8',
    Census_IsPenCapable='int8',
    Census_IsAlwaysOnAlwaysConnectedCapable='float32',
    # Wdft_* columns
    Wdft_IsGamer='float32',
    Wdft_RegionIdentifier='float32',
)

In [3]:
# Absolute path to the test CSV on the author's machine; adjust before running elsewhere.
test_dir=r'C:\Users\jithi\OneDrive\Desktop\ML project\microsoft-malware-prediction\test.csv'
# Parse the file with the explicit per-column dtypes declared in `dtypes`.
test = pd.read_csv(test_dir,dtype=dtypes)

In [4]:
# Keep the machine identifiers for the submission file; every other column becomes model input.
y = test['MachineIdentifier']
X = test.drop(columns=['MachineIdentifier'])
# Remove the notebook-level name for the raw frame; only X and y are used from here on.
del test

In [5]:
# NOTE(review): this list is not referenced by any other cell visible in this notebook;
# confirm whether a binning step is still expected before inference.
cols_to_bin=['Census_PrimaryDiskTotalCapacity','Census_SystemVolumeTotalCapacity', 'Census_TotalPhysicalRAM']

In [6]:
import pandas as pd
import numpy as np
import pickle
import category_encoders as ce

# Locations of the fitted preprocessing objects that were pickled earlier
scaler_dir = r'C:\Users\jithi\OneDrive\Desktop\ML project\scaler.pkl'
freq_encoder_dir = r'C:\Users\jithi\OneDrive\Desktop\ML project\freq_encoder.pkl'
label_encoders_dir = r'C:\Users\jithi\OneDrive\Desktop\ML project\label_encoders.pkl'


def _load_pickle(path):
    """Unpickle and return the object stored at ``path``, closing the file afterwards."""
    # SECURITY: pickle.load can execute arbitrary code embedded in the file;
    # only load artifacts that you produced yourself.
    with open(path, 'rb') as fh:
        return pickle.load(fh)


# Load the saved objects (the file handles are closed as soon as each object is read)
scaler = _load_pickle(scaler_dir)
freq_encoder = _load_pickle(freq_encoder_dir)
label_encoders = _load_pickle(label_encoders_dir)

# The test dataset is expected to already exist as `X` (built in an earlier cell)

# Apply frequency encoding using the loaded encoder
columns_to_freq_encode = ['AppVersion', 'AvSigVersion', 'Census_OSVersion', 'EngineVersion', 'OsBuildLab']
X[columns_to_freq_encode] = freq_encoder.transform(X[columns_to_freq_encode])

# Apply label encoding through a plain dict lookup built from each encoder's classes_
# NOTE(review): 'OsBuildLab' is frequency-encoded above and then label-encoded here as well;
# confirm this matches what was done to the training data.
columns_for_LE = ['ProductName', 'RtpStateBitfield', 'Platform', 'Processor', 'OsVer', 'OsBuild', 'OsSuite',
                  'OsPlatformSubRelease', 'OsBuildLab', 'SkuEdition', 'PuaMode', 'SmartScreen', 'Census_MDC2FormFactor',
                  'Census_DeviceFamily', 'Census_ProcessorClass', 'Census_PrimaryDiskTypeName', 'Census_ChassisTypeName',
                  'Census_PowerPlatformRoleName', 'Census_InternalBatteryType', 'Census_OSArchitecture',
                  'Census_OSBranch', 'Census_OSEdition', 'Census_OSSkuName', 'Census_OSInstallTypeName',
                  'Census_OSWUAutoUpdateOptionsName', 'Census_GenuineStateName', 'Census_ActivationChannel',
                  'Census_FlightRing']

for col in columns_for_LE:
    if col in X.columns and col in label_encoders:
        le = label_encoders[col]
        # class value -> integer position in the fitted encoder
        le_dict = {cls: idx for idx, cls in enumerate(le.classes_)}

        # Values absent from the encoder map to NaN and are then replaced by -1.
        # NOTE(review): int8 cannot represent indices above 127; confirm no encoder has more classes.
        # The isinstance check replaces the deprecated pd.api.types.is_categorical_dtype.
        if isinstance(X[col].dtype, pd.CategoricalDtype):  # If categorical, add '-1' category first
            X[col] = X[col].cat.add_categories([-1]).map(le_dict).fillna(-1).astype('int8')
        else:
            X[col] = X[col].map(le_dict).fillna(-1).astype('int8')

# Handle missing values: 'NA' for text-like columns, -1 for everything else
for col in X.columns:
    if X[col].dtype == 'object':
        X[col] = X[col].fillna('NA')
    elif X[col].dtype == 'category':
        # a categorical can only be filled with a value that is one of its categories
        X[col] = X[col].cat.add_categories('NA').fillna('NA')
    else:
        X[col] = X[col].fillna(-1)
# Optimize integer and float columns
def optimize_dataframe(df):
    """Ask pandas to downcast every numeric column of ``df`` to a smaller dtype.

    Columns whose non-missing values are all whole numbers go through
    ``pd.to_numeric(..., downcast="integer")``; every other numeric column goes
    through ``pd.to_numeric(..., downcast="float")``. Non-numeric columns
    (object, category, bool, ...) are left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to optimise. Its columns are reassigned in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    for col in df.columns:
        col_data = df[col]

        # Skip non-numeric columns. The pandas predicates understand extension dtypes
        # such as ``category``, for which ``np.issubdtype`` raises TypeError.
        # Booleans are excluded explicitly so they keep being skipped.
        if (not pd.api.types.is_numeric_dtype(col_data.dtype)
                or pd.api.types.is_bool_dtype(col_data.dtype)):
            continue

        # Check if all non-missing values are integers (i.e., no decimal part)
        non_null = col_data.dropna()
        if np.all(non_null == non_null.astype(int)):
            df[col] = pd.to_numeric(col_data, downcast="integer")
        else:
            df[col] = pd.to_numeric(col_data, downcast="float")

    return df

# optimize_dataframe reassigns the columns of X in place and returns the same object,
# so this rebinding simply keeps the name X pointing at the optimised frame.
X = optimize_dataframe(X)


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
  if pd.api.types.is_categorical_dtype(X[col]):  # If categorical, add '-1' category first
  if pd.api.types.is_categorical_dtype(X[col]):  # If categorical, add '-1' category first
  if pd.api.types.is_categorical_dtype(X[col]):  # If categorical, add '-1' category first
  if pd.api.types.is_categorical_dtype(X[col]):  # If categorical, add '-1' category first
  if pd.api.types.is_categorical_dtype(X[col]):  # If categorical, add '-1' category first
  if pd.api.types.is_categorical_dtype(X[col]):  # If categorical, add '-1' category first
  if pd.api.types.is_categorical_dtype(X[col]):  # If categorical, add '-1' category first
  if pd.api.types.is_categorical_dtype(X[col]):  # If categorical, add '-1' category first
  if pd.api.types.is_categorical_dtype(X[col]):  # If categorical, add '-1' category fir

In [14]:
# xgboost_dir=r'C:\Users\jithi\OneDrive\Desktop\ML project\xgb_model.json'
# xgb_loaded = XGBClassifier(enable_categorical=True)
# xgb_loaded.load_model(xgboost_dir)
# predictions = xgb_loaded.predict(X)
# probs = xgb_loaded.predict_proba(X_scaled)
# probs = probs[:, 1]

In [15]:
# probs = np.ravel(probs)

# submission = pd.DataFrame(
#     {'MachineIdentifier': y, 'HasDetections': probs})
# submission.to_csv('submission.csv', index=False)

In [8]:
#test loader on X
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
# Number of rows sent through the network per forward pass
batch_size = 2048
# One float32 tensor holding every column of X.
# NOTE(review): X is used here without scaler.transform (the scaler is only applied in a
# later cell); confirm this matches how the model weights were trained.
X_test_tensor = torch.tensor(X.values, dtype=torch.float32)
test_dataset = TensorDataset(X_test_tensor)
# shuffle=False keeps the predictions in the same row order as `y`
test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)

In [9]:
#load model
model_dir = r'C:\Users\jithi\OneDrive\Desktop\ML project\model_weights.pth'
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import pandas as pd
from tqdm import tqdm
from torch.utils.data import DataLoader, TensorDataset

class MLP(nn.Module):
    """Single-hidden-layer binary classifier: Linear -> ReLU -> Linear -> Sigmoid.

    Parameters
    ----------
    input_dim : int, optional
        Number of input features. When omitted, the width of the notebook-level
        ``X_test_tensor`` is used, which is what the original zero-argument
        constructor did.
    hidden_dim : int, optional
        Width of the hidden layer (default 64, the original hard-coded value).
    """

    def __init__(self, input_dim=None, hidden_dim=64):
        super().__init__()
        if input_dim is None:
            # Backward-compatible default: read the width from the notebook global
            input_dim = X_test_tensor.shape[1]
        self.fc1 = nn.Linear(input_dim, hidden_dim)  # Input layer to first hidden layer
        self.fc2 = nn.Linear(hidden_dim, 1)  # First hidden layer to output layer
        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Map a ``(batch, input_dim)`` tensor to ``(batch, 1)`` sigmoid outputs."""
        hidden = self.relu(self.fc1(x))
        return self.sigmoid(self.fc2(hidden))
 
model = MLP()
# weights_only=True limits unpickling to tensors and plain containers, which is all a
# state_dict needs and avoids executing arbitrary pickled code from the checkpoint.
# map_location='cpu' lets a checkpoint saved on a GPU load on a CPU-only machine.
model.load_state_dict(torch.load(model_dir, map_location='cpu', weights_only=True))

# Binary Cross-Entropy loss
# NOTE(review): `criterion` is not used by the inference cells visible in this notebook.
criterion = nn.BCELoss()


  model.load_state_dict(torch.load(model_dir))


In [10]:
# Predict on the test set, one batch at a time
model.eval()  # put the module in evaluation mode
predictions = []
with torch.no_grad():  # no gradient bookkeeping is needed for inference
    # each item yielded by the loader is a one-element sequence holding the feature tensor
    for (inputs,) in tqdm(test_loader):
        predictions.append(model(inputs).numpy())

# Stack the per-batch (batch, 1) arrays and flatten them into one 1-D vector
predictions = np.concatenate(predictions)
probs = predictions.flatten()

  0%|          | 0/3835 [00:00<?, ?it/s]

100%|██████████| 3835/3835 [01:05<00:00, 58.93it/s]


In [11]:
# Pair each machine identifier with its predicted value and write the submission file.
# NOTE(review): several cells write to 'submission.csv'; the last one executed wins.
submission_columns = {
    'MachineIdentifier': y,
    'HasDetections': probs,
}
submission = pd.DataFrame(submission_columns)
submission.to_csv('submission.csv', index=False)

In [16]:
# Scale the encoded features with the loaded scaler and save the result as CSV
scaled_values = scaler.transform(X)
# Rewrap the scaler output with X's column labels and index
X_scaled = pd.DataFrame(scaled_values, columns=X.columns, index=X.index)

X_scaled.to_csv(
    r'C:\Users\jithi\OneDrive\Desktop\ML project\microsoft-malware-prediction\test_data_scaled_encoded_shrunk.csv',
    index=False,
)

In [15]:
# Column subset (and column order) used to build X_pca.
# Written as a whitespace-separated block; .split() yields the same list of names.
pca_features = """
    Census_OSBuildNumber OsBuild Census_InternalBatteryType
    Census_InternalBatteryNumberOfCharges Census_IsWIMBootEnabled Census_ThresholdOptIn
    OsVer Platform Census_ChassisTypeName
    Census_MDC2FormFactor Census_InternalPrimaryDisplayResolutionVertical Census_OSSkuName
    AppVersion Census_OSEdition OsSuite
    Census_InternalPrimaryDisplayResolutionHorizontal Census_InternalPrimaryDiagonalDisplaySizeInInches SkuEdition
    OsBuildLab Census_OSVersion SmartScreen
    Census_TotalPhysicalRAM Census_OSBranch Census_PowerPlatformRoleName
    ProductName Census_IsFlightingInternal HasTpm
    Census_ProcessorCoreCount Census_OSBuildRevision OsPlatformSubRelease
    Census_IsFlightsDisabled Census_FirmwareManufacturerIdentifier Census_IsSecureBootEnabled
    Firewall Census_FlightRing Census_PrimaryDiskTypeName
    EngineVersion IeVerIdentifier Census_ProcessorModelIdentifier
    Census_FirmwareVersionIdentifier Census_ProcessorClass AVProductsInstalled
    Wdft_IsGamer Census_OEMNameIdentifier IsProtected
    Processor Census_OSArchitecture AVProductStatesIdentifier
    SMode Census_OEMModelIdentifier Wdft_RegionIdentifier
    Census_OSInstallLanguageIdentifier DefaultBrowsersIdentifier AVProductsEnabled
    Census_GenuineStateName AvSigVersion Census_OSWUAutoUpdateOptionsName
""".split()
# Restrict X to the columns named in pca_features, in that order
X_pca = X[pca_features]
# Build the test loader on X_pca
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
# Number of rows sent through the network per forward pass
batch_size = 2048
X_pca_test_tensor = torch.tensor(X_pca.values, dtype=torch.float32)
test_dataset = TensorDataset(X_pca_test_tensor)
# shuffle=False keeps the predictions in the same row order as `y`
test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)

In [10]:
# Column order used to build X_lda.
# Written as a whitespace-separated block; .split() yields the same list of names.
lda_features = """
    Census_OSEdition Census_OSSkuName AVProductsInstalled
    EngineVersion SmartScreen AVProductStatesIdentifier
    Census_ThresholdOptIn IsProtected Census_OSInstallTypeName
    Census_IsVirtualDevice Census_InternalPrimaryDisplayResolutionVertical OsSuite
    Census_InternalPrimaryDisplayResolutionHorizontal Processor Census_IsAlwaysOnAlwaysConnectedCapable
    Wdft_IsGamer Census_OSVersion Census_OSUILocaleIdentifier
    Census_InternalBatteryNumberOfCharges Census_IsTouchEnabled Census_OSBuildNumber
    HasTpm SMode Census_MDC2FormFactor
    Census_OSInstallLanguageIdentifier Census_DeviceFamily Census_HasOpticalDiskDrive
    Census_TotalPhysicalRAM Census_InternalBatteryType Census_PrimaryDiskTypeName
    LocaleEnglishNameIdentifier ProductName Census_ProcessorCoreCount
    Wdft_RegionIdentifier DefaultBrowsersIdentifier Census_IsSecureBootEnabled
    Census_GenuineStateName RtpStateBitfield OsBuild
    SkuEdition OsPlatformSubRelease Census_IsFlightingInternal
    Census_FlightRing OsVer Census_ProcessorModelIdentifier
    Census_ActivationChannel IeVerIdentifier Census_OSBuildRevision
    AvSigVersion AppVersion Census_OSWUAutoUpdateOptionsName
    Census_OEMNameIdentifier Census_OSArchitecture Census_ChassisTypeName
    Census_PowerPlatformRoleName PuaMode Census_OEMModelIdentifier
    Census_ProcessorClass Census_ProcessorManufacturerIdentifier Census_IsFlightsDisabled
    Census_OSBranch AVProductsEnabled Census_SystemVolumeTotalCapacity
    CityIdentifier Census_IsPortableOperatingSystem IsSxsPassiveMode
    Census_FirmwareManufacturerIdentifier OrganizationIdentifier Census_InternalPrimaryDiagonalDisplaySizeInInches
    AutoSampleOptIn OsBuildLab Census_FirmwareVersionIdentifier
    CountryIdentifier UacLuaenable IsBeta
    Firewall Census_IsPenCapable Census_PrimaryDiskTotalCapacity
    Platform Census_IsWIMBootEnabled GeoNameIdentifier
""".split()
# Reorder X's columns to the order listed in lda_features
X_lda = X[lda_features]
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm
# Checkpoint holding the weights for the network defined below in this cell
model_dir = r'C:\Users\jithi\OneDrive\Desktop\ML project\lda_filtered_simple_nn.pth'
# Number of rows sent through the network per forward pass
batch_size = 2048
X_lda_test_tensor = torch.tensor(X_lda.values, dtype=torch.float32)
test_dataset = TensorDataset(X_lda_test_tensor)
# shuffle=False keeps the predictions in the same row order as `y`
test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)

class MLP(nn.Module):
    """Single-hidden-layer binary classifier: Linear -> ReLU -> Linear -> Sigmoid.

    Parameters
    ----------
    input_dim : int, optional
        Number of input features. When omitted, the width of the notebook-level
        ``X_lda_test_tensor`` is used, which is what the original zero-argument
        constructor did.
    hidden_dim : int, optional
        Width of the hidden layer (default 64, the original hard-coded value).
    """

    def __init__(self, input_dim=None, hidden_dim=64):
        super().__init__()
        if input_dim is None:
            # Backward-compatible default: read the width from the notebook global
            input_dim = X_lda_test_tensor.shape[1]
        self.fc1 = nn.Linear(input_dim, hidden_dim)  # Input layer to first hidden layer
        self.fc2 = nn.Linear(hidden_dim, 1)  # First hidden layer to output layer
        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Map a ``(batch, input_dim)`` tensor to ``(batch, 1)`` sigmoid outputs."""
        hidden = self.relu(self.fc1(x))
        return self.sigmoid(self.fc2(hidden))
 
model = MLP()
# weights_only=True limits unpickling to tensors and plain containers, which is all a
# state_dict needs and avoids executing arbitrary pickled code from the checkpoint.
# map_location='cpu' lets a checkpoint saved on a GPU load on a CPU-only machine.
model.load_state_dict(torch.load(model_dir, map_location='cpu', weights_only=True))

# Binary Cross-Entropy loss
# NOTE(review): `criterion` is not used by the inference code below.
criterion = nn.BCELoss()
model.eval()  # put the module in evaluation mode
predictions = []
with torch.no_grad():  # no gradient bookkeeping is needed for inference
    for data in tqdm(test_loader):
        # each item yielded by the loader is a one-element sequence holding the feature tensor
        inputs = data[0]
        outputs = model(inputs)
        predictions.append(outputs.numpy())

# Stack the per-batch (batch, 1) arrays and flatten them into one 1-D vector
predictions = np.concatenate(predictions)
probs = predictions.flatten()
# NOTE(review): several cells write to 'submission.csv'; the last one executed wins.
submission = pd.DataFrame(
    {'MachineIdentifier': y, 'HasDetections': probs})
submission.to_csv('submission.csv', index=False)

  model.load_state_dict(torch.load(model_dir))
100%|██████████| 3835/3835 [00:50<00:00, 75.35it/s]


In [21]:
model_dir = r'C:\Users\jithi\OneDrive\Desktop\ML project\best_model_100_weights.pth'
#load model
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import pandas as pd
from tqdm import tqdm
from torch.utils.data import DataLoader, TensorDataset

class MLP(nn.Module):
    """Single-hidden-layer binary classifier: Linear -> ReLU -> Linear -> Sigmoid.

    Parameters
    ----------
    input_dim : int, optional
        Number of input features. When omitted, the width of the notebook-level
        ``X_pca_test_tensor`` is used, which is what the original zero-argument
        constructor did.
    hidden_dim : int, optional
        Width of the hidden layer (default 64, the original hard-coded value).
    """

    def __init__(self, input_dim=None, hidden_dim=64):
        super().__init__()
        if input_dim is None:
            # Backward-compatible default: read the width from the notebook global
            input_dim = X_pca_test_tensor.shape[1]
        self.fc1 = nn.Linear(input_dim, hidden_dim)  # Input layer to first hidden layer
        self.fc2 = nn.Linear(hidden_dim, 1)  # First hidden layer to output layer
        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Map a ``(batch, input_dim)`` tensor to ``(batch, 1)`` sigmoid outputs."""
        hidden = self.relu(self.fc1(x))
        return self.sigmoid(self.fc2(hidden))
 
model = MLP()
# weights_only=True limits unpickling to tensors and plain containers, which is all a
# state_dict needs and avoids executing arbitrary pickled code from the checkpoint.
# map_location='cpu' lets a checkpoint saved on a GPU load on a CPU-only machine.
model.load_state_dict(torch.load(model_dir, map_location='cpu', weights_only=True))

# Binary Cross-Entropy loss
# NOTE(review): `criterion` is not used by the inference cells visible in this notebook.
criterion = nn.BCELoss()


  model.load_state_dict(torch.load(model_dir))


In [22]:
# Predict on the test set with the model whose input width comes from X_pca_test_tensor.
# `test_loader` is rebound by several cells (full, PCA and LDA feature sets), so it is
# rebuilt here from X_pca_test_tensor. Otherwise a top-to-bottom run would feed this
# model the LDA loader defined in the cell above, whose columns do not match.
test_dataset = TensorDataset(X_pca_test_tensor)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

model.eval()  # put the module in evaluation mode
predictions = []
with torch.no_grad():  # no gradient bookkeeping is needed for inference
    for data in tqdm(test_loader):
        # each item yielded by the loader is a one-element sequence holding the feature tensor
        inputs = data[0]
        outputs = model(inputs)
        predictions.append(outputs.numpy())

# Stack the per-batch (batch, 1) arrays and flatten them into one 1-D vector
predictions = np.concatenate(predictions)
probs = predictions.flatten()

100%|██████████| 3835/3835 [01:04<00:00, 59.51it/s]


In [23]:
# Pair each machine identifier with its predicted value and write the submission file.
# NOTE(review): several cells write to 'submission.csv'; the last one executed wins.
submission_columns = {
    'MachineIdentifier': y,
    'HasDetections': probs,
}
submission = pd.DataFrame(submission_columns)
submission.to_csv('submission.csv', index=False)

In [1]:
from xgboost import XGBClassifier
# Absolute path to the saved XGBoost model (JSON) on the author's machine.
xgboost_dir = r'C:\Users\jithi\OneDrive\Desktop\ML project\Saved Models\xgboost_intial_encodings.json'
# Create an unfitted classifier with categorical support switched on, then populate it
# from the saved model file.
xgb_loaded = XGBClassifier(enable_categorical=True)
xgb_loaded.load_model(xgboost_dir)


In [3]:
# Per-column dtypes handed to pd.read_csv when the test file is loaded.
# Keys appear in the same order as in the original literal.
dtypes = dict(
    MachineIdentifier='category',
    ProductName='category',
    EngineVersion='category',
    AppVersion='category',
    AvSigVersion='category',
    IsBeta='int8',
    RtpStateBitfield='float32',
    IsSxsPassiveMode='int8',
    DefaultBrowsersIdentifier='float32',
    AVProductStatesIdentifier='float32',
    AVProductsInstalled='float32',
    AVProductsEnabled='float32',
    HasTpm='int8',
    CountryIdentifier='int16',
    CityIdentifier='float32',
    OrganizationIdentifier='float32',
    GeoNameIdentifier='float32',
    LocaleEnglishNameIdentifier='int8',
    Platform='category',
    Processor='category',
    OsVer='category',
    OsBuild='int16',
    OsSuite='int16',
    OsPlatformSubRelease='category',
    OsBuildLab='category',
    SkuEdition='category',
    IsProtected='float32',
    AutoSampleOptIn='int8',
    PuaMode='category',
    SMode='float32',
    IeVerIdentifier='float32',
    SmartScreen='category',
    Firewall='float32',
    UacLuaenable='float32',
    # Census_* columns
    Census_MDC2FormFactor='category',
    Census_DeviceFamily='category',
    Census_OEMNameIdentifier='float32',
    Census_OEMModelIdentifier='float32',
    Census_ProcessorCoreCount='float32',
    Census_ProcessorManufacturerIdentifier='float32',
    Census_ProcessorModelIdentifier='float32',
    Census_ProcessorClass='category',
    Census_PrimaryDiskTotalCapacity='float32',
    Census_PrimaryDiskTypeName='category',
    Census_SystemVolumeTotalCapacity='float32',
    Census_HasOpticalDiskDrive='int8',
    Census_TotalPhysicalRAM='float32',
    Census_ChassisTypeName='category',
    Census_InternalPrimaryDiagonalDisplaySizeInInches='float32',
    Census_InternalPrimaryDisplayResolutionHorizontal='float32',
    Census_InternalPrimaryDisplayResolutionVertical='float32',
    Census_PowerPlatformRoleName='category',
    Census_InternalBatteryType='category',
    Census_InternalBatteryNumberOfCharges='float32',
    Census_OSVersion='category',
    Census_OSArchitecture='category',
    Census_OSBranch='category',
    Census_OSBuildNumber='int16',
    Census_OSBuildRevision='int32',
    Census_OSEdition='category',
    Census_OSSkuName='category',
    Census_OSInstallTypeName='category',
    Census_OSInstallLanguageIdentifier='float32',
    Census_OSUILocaleIdentifier='int16',
    Census_OSWUAutoUpdateOptionsName='category',
    Census_IsPortableOperatingSystem='int8',
    Census_GenuineStateName='category',
    Census_ActivationChannel='category',
    Census_IsFlightingInternal='float32',
    Census_IsFlightsDisabled='float32',
    Census_FlightRing='category',
    Census_ThresholdOptIn='float32',
    Census_FirmwareManufacturerIdentifier='float32',
    Census_FirmwareVersionIdentifier='float32',
    Census_IsSecureBootEnabled='int8',
    Census_IsWIMBootEnabled='float32',
    Census_IsVirtualDevice='float32',
    Census_IsTouchEnabled='int8',
    Census_IsPenCapable='int8',
    Census_IsAlwaysOnAlwaysConnectedCapable='float32',
    # Wdft_* columns
    Wdft_IsGamer='float32',
    Wdft_RegionIdentifier='float32',
)
import pandas as pd
# Absolute path to the test CSV on the author's machine; adjust before running elsewhere.
test_dir = r'C:\Users\jithi\OneDrive\Desktop\ML project\microsoft-malware-prediction\test.csv'

# Parse the file with the explicit per-column dtypes declared in `dtypes`.
test = pd.read_csv(test_dir, dtype=dtypes)

In [4]:
# Keep the identifier column for the submission file (the name `identiers` is what the
# later submission cell references), then continue with a frame that no longer has it.
identiers = test['MachineIdentifier']
test = test.drop(columns=['MachineIdentifier'])

In [5]:
# predict_proba returns one column per class; keep column index 1 for every row
probs = xgb_loaded.predict_proba(test)[:, 1]

In [6]:
# Pair each machine identifier with its XGBoost probability and write the file
submission_columns = {
    'MachineIdentifier': identiers,
    'HasDetections': probs,
}
submission = pd.DataFrame(submission_columns)
submission.to_csv('xgboost initial encodings.csv', index=False)

In [None]:
from collections import Counter
import numpy as np
# Count how many predicted probabilities round to each value (displayed as the cell output)
rounded_probs = np.round(probs)
Counter(rounded_probs)

Counter({0.0: 7201839, 1.0: 651414})

: 