In [1]:
import pandas as pd
from xgboost import XGBClassifier
import numpy as np

In [2]:
dtypes = {
        'MachineIdentifier':                                    'category',
        'ProductName':                                          'category',
        'EngineVersion':                                        'category',
        'AppVersion':                                           'category',
        'AvSigVersion':                                         'category',
        'IsBeta':                                               'int8',
        'RtpStateBitfield':                                     'float32',
        'IsSxsPassiveMode':                                     'int8',
        'DefaultBrowsersIdentifier':                            'float32',
        'AVProductStatesIdentifier':                            'float32',
        'AVProductsInstalled':                                  'float32',
        'AVProductsEnabled':                                    'float32',
        'HasTpm':                                               'int8',
        'CountryIdentifier':                                    'int16',
        'CityIdentifier':                                       'float32',
        'OrganizationIdentifier':                               'float32',
        'GeoNameIdentifier':                                    'float32',
        'LocaleEnglishNameIdentifier':                          'int8',
        'Platform':                                             'category',
        'Processor':                                            'category',
        'OsVer':                                                'category',
        'OsBuild':                                              'int16',
        'OsSuite':                                              'int16',
        'OsPlatformSubRelease':                                 'category',
        'OsBuildLab':                                           'category',
        'SkuEdition':                                           'category',
        'IsProtected':                                          'float32',
        'AutoSampleOptIn':                                      'int8',
        'PuaMode':                                              'category',
        'SMode':                                                'float32',
        'IeVerIdentifier':                                      'float32',
        'SmartScreen':                                          'category',
        'Firewall':                                             'float32',
        'UacLuaenable':                                         'float32',
        'Census_MDC2FormFactor':                                'category',
        'Census_DeviceFamily':                                  'category',
        'Census_OEMNameIdentifier':                             'float32',
        'Census_OEMModelIdentifier':                            'float32',
        'Census_ProcessorCoreCount':                            'float32',
        'Census_ProcessorManufacturerIdentifier':               'float32',
        'Census_ProcessorModelIdentifier':                      'float32',
        'Census_ProcessorClass':                                'category',
        'Census_PrimaryDiskTotalCapacity':                      'float32',
        'Census_PrimaryDiskTypeName':                           'category',
        'Census_SystemVolumeTotalCapacity':                     'float32',
        'Census_HasOpticalDiskDrive':                           'int8',
        'Census_TotalPhysicalRAM':                              'float32',
        'Census_ChassisTypeName':                               'category',
        'Census_InternalPrimaryDiagonalDisplaySizeInInches':    'float32',
        'Census_InternalPrimaryDisplayResolutionHorizontal':    'float32',
        'Census_InternalPrimaryDisplayResolutionVertical':      'float32',
        'Census_PowerPlatformRoleName':                         'category',
        'Census_InternalBatteryType':                           'category',
        'Census_InternalBatteryNumberOfCharges':                'float32',
        'Census_OSVersion':                                     'category',
        'Census_OSArchitecture':                                'category',
        'Census_OSBranch':                                      'category',
        'Census_OSBuildNumber':                                 'int16',
        'Census_OSBuildRevision':                               'int32',
        'Census_OSEdition':                                     'category',
        'Census_OSSkuName':                                     'category',
        'Census_OSInstallTypeName':                             'category',
        'Census_OSInstallLanguageIdentifier':                   'float32',
        'Census_OSUILocaleIdentifier':                          'int16',
        'Census_OSWUAutoUpdateOptionsName':                     'category',
        'Census_IsPortableOperatingSystem':                     'int8',
        'Census_GenuineStateName':                              'category',
        'Census_ActivationChannel':                             'category',
        'Census_IsFlightingInternal':                           'float32',
        'Census_IsFlightsDisabled':                             'float32',
        'Census_FlightRing':                                    'category',
        'Census_ThresholdOptIn':                                'float32',
        'Census_FirmwareManufacturerIdentifier':                'float32',
        'Census_FirmwareVersionIdentifier':                     'float32',
        'Census_IsSecureBootEnabled':                           'int8',
        'Census_IsWIMBootEnabled':                              'float32',
        'Census_IsVirtualDevice':                               'float32',
        'Census_IsTouchEnabled':                                'int8',
        'Census_IsPenCapable':                                  'int8',
        'Census_IsAlwaysOnAlwaysConnectedCapable':              'float32',
        'Wdft_IsGamer':                                         'float32',
        'Wdft_RegionIdentifier':                                'float32',
        }

In [4]:
test_dir=r'C:\Users\jithi\OneDrive\Desktop\ML project\microsoft-malware-prediction\test.csv'
test = pd.read_csv(test_dir,dtype=dtypes)

In [5]:
X=test.drop(columns=['MachineIdentifier'])
y = test['MachineIdentifier']
del test

In [6]:
columns_to_split = ['EngineVersion','AppVersion','AvSigVersion','Census_OSVersion']
columns_for_OE = ['ProductName','RtpStateBitfield','Platform','Processor','OsVer','OsBuild','OsSuite','OsPlatformSubRelease','OsBuildLab','SkuEdition','PuaMode','SmartScreen','Census_MDC2FormFactor',
                  'Census_DeviceFamily','Census_ProcessorClass','Census_PrimaryDiskTypeName','Census_ChassisTypeName','Census_InternalPrimaryDiagonalDisplaySizeInInches','Census_InternalPrimaryDisplayResolutionHorizontal','Census_InternalPrimaryDisplayResolutionVertical',
                  'Census_PowerPlatformRoleName','Census_InternalBatteryType','Census_OSArchitecture','Census_OSBranch','Census_OSEdition','Census_OSSkuName','Census_OSInstallTypeName','Census_OSWUAutoUpdateOptionsName','Census_GenuineStateName','Census_ActivationChannel',
                   'Census_FlightRing',
                  ]
cols_to_bin=['Census_PrimaryDiskTotalCapacity','Census_SystemVolumeTotalCapacity', 'Census_TotalPhysicalRAM']
for col in columns_to_split:
    split_cols = X[col].str.split('.', expand=True)  
    split_cols.columns = [f"{col}_{i+1}" for i in range(split_cols.shape[1])]  
    X = X.drop(columns=[col]).join(split_cols)  

In [7]:
#load scaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
import pickle
scaler_dir = r'C:\Users\jithi\OneDrive\Desktop\ML project\scaler.pkl'
ordinal_dir = r'C:\Users\jithi\OneDrive\Desktop\ML project\ordinal_encoder.pkl'
scaler = pickle.load(open(scaler_dir, 'rb'))
ordinal = pickle.load(open(ordinal_dir, 'rb'))
X[columns_for_OE] = ordinal.transform(X[columns_for_OE])
for col in X.columns:
    if X[col].dtype == 'object':
        X[col] = X[col].astype('float32')
def optimize_dataframe(df):
    for col in df.columns:
        col_data = df[col]
        
        # Skip non-numeric columns
        if not np.issubdtype(col_data.dtype, np.number):
            continue

        # Check if all values are integers (i.e., no decimal part)
        if np.all(col_data.dropna() == col_data.dropna().astype(int)):
            df[col] = pd.to_numeric(col_data, downcast="integer")
        else:
            df[col] = pd.to_numeric(col_data, downcast="float")
    
    return df
X=optimize_dataframe(X)
X_scaled = scaler.transform(X)

In [10]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7853253 entries, 0 to 7853252
Data columns (total 93 columns):
 #   Column                                             Dtype  
---  ------                                             -----  
 0   ProductName                                        int8   
 1   IsBeta                                             int8   
 2   RtpStateBitfield                                   float64
 3   IsSxsPassiveMode                                   int8   
 4   DefaultBrowsersIdentifier                          float32
 5   AVProductStatesIdentifier                          float32
 6   AVProductsInstalled                                float32
 7   AVProductsEnabled                                  float32
 8   HasTpm                                             int8   
 9   CountryIdentifier                                  int16  
 10  CityIdentifier                                     float32
 11  OrganizationIdentifier                            

In [None]:
xgboost_dir=r'C:\Users\jithi\OneDrive\Desktop\ML project\xgb_model.json'
xgb_loaded = XGBClassifier(enable_categorical=True)
xgb_loaded.load_model(xgboost_dir)
predictions = xgb_loaded.predict(X)
probs = xgb_loaded.predict_proba(X_scaled)
probs = probs[:, 1]

In [20]:
probs = np.ravel(probs)

submission = pd.DataFrame(
    {'MachineIdentifier': y, 'HasDetections': probs})
submission.to_csv('submission.csv', index=False)