In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pickle

In [2]:
dtypes = {
        'MachineIdentifier':                                    'category',
        'ProductName':                                          'category',
        'EngineVersion':                                        'category',
        'AppVersion':                                           'category',
        'AvSigVersion':                                         'category',
        'IsBeta':                                               'int8',
        'RtpStateBitfield':                                     'float32',
        'IsSxsPassiveMode':                                     'int8',
        'DefaultBrowsersIdentifier':                            'float32',
        'AVProductStatesIdentifier':                            'float32',
        'AVProductsInstalled':                                  'float32',
        'AVProductsEnabled':                                    'float32',
        'HasTpm':                                               'int8',
        'CountryIdentifier':                                    'int16',
        'CityIdentifier':                                       'float32',
        'OrganizationIdentifier':                               'float32',
        'GeoNameIdentifier':                                    'float32',
        'LocaleEnglishNameIdentifier':                          'int8',
        'Platform':                                             'category',
        'Processor':                                            'category',
        'OsVer':                                                'category',
        'OsBuild':                                              'int16',
        'OsSuite':                                              'int16',
        'OsPlatformSubRelease':                                 'category',
        'OsBuildLab':                                           'category',
        'SkuEdition':                                           'category',
        'IsProtected':                                          'float32',
        'AutoSampleOptIn':                                      'int8',
        'PuaMode':                                              'category',
        'SMode':                                                'float32',
        'IeVerIdentifier':                                      'float32',
        'SmartScreen':                                          'category',
        'Firewall':                                             'float32',
        'UacLuaenable':                                         'float32',
        'Census_MDC2FormFactor':                                'category',
        'Census_DeviceFamily':                                  'category',
        'Census_OEMNameIdentifier':                             'float32',
        'Census_OEMModelIdentifier':                            'float32',
        'Census_ProcessorCoreCount':                            'float32',
        'Census_ProcessorManufacturerIdentifier':               'float32',
        'Census_ProcessorModelIdentifier':                      'float32',
        'Census_ProcessorClass':                                'category',
        'Census_PrimaryDiskTotalCapacity':                      'float32',
        'Census_PrimaryDiskTypeName':                           'category',
        'Census_SystemVolumeTotalCapacity':                     'float32',
        'Census_HasOpticalDiskDrive':                           'int8',
        'Census_TotalPhysicalRAM':                              'float32',
        'Census_ChassisTypeName':                               'category',
        'Census_InternalPrimaryDiagonalDisplaySizeInInches':    'float32',
        'Census_InternalPrimaryDisplayResolutionHorizontal':    'float32',
        'Census_InternalPrimaryDisplayResolutionVertical':      'float32',
        'Census_PowerPlatformRoleName':                         'category',
        'Census_InternalBatteryType':                           'category',
        'Census_InternalBatteryNumberOfCharges':                'float32',
        'Census_OSVersion':                                     'category',
        'Census_OSArchitecture':                                'category',
        'Census_OSBranch':                                      'category',
        'Census_OSBuildNumber':                                 'int16',
        'Census_OSBuildRevision':                               'int32',
        'Census_OSEdition':                                     'category',
        'Census_OSSkuName':                                     'category',
        'Census_OSInstallTypeName':                             'category',
        'Census_OSInstallLanguageIdentifier':                   'float32',
        'Census_OSUILocaleIdentifier':                          'int16',
        'Census_OSWUAutoUpdateOptionsName':                     'category',
        'Census_IsPortableOperatingSystem':                     'int8',
        'Census_GenuineStateName':                              'category',
        'Census_ActivationChannel':                             'category',
        'Census_IsFlightingInternal':                           'float32',
        'Census_IsFlightsDisabled':                             'float32',
        'Census_FlightRing':                                    'category',
        'Census_ThresholdOptIn':                                'float32',
        'Census_FirmwareManufacturerIdentifier':                'float32',
        'Census_FirmwareVersionIdentifier':                     'float32',
        'Census_IsSecureBootEnabled':                           'int8',
        'Census_IsWIMBootEnabled':                              'float32',
        'Census_IsVirtualDevice':                               'float32',
        'Census_IsTouchEnabled':                                'int8',
        'Census_IsPenCapable':                                  'int8',
        'Census_IsAlwaysOnAlwaysConnectedCapable':              'float32',
        'Wdft_IsGamer':                                         'float32',
        'Wdft_RegionIdentifier':                                'float32',
        }

In [3]:
test_dir = "/Users/dragonfruit/cs6140-ml-workspace/ML-project/data/test.csv"
test = pd.read_csv(test_dir,dtype=dtypes)

In [4]:
#for each column print data type and 2 sample values
# for col in test.columns:
#     print(f"{col}: {test[col].dtype}, {test[col].sample(2).values}")

In [5]:
import numpy as np 
import pandas as pd 
from datetime import datetime as dt
import os
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('ggplot')
import time
import datetime
from tqdm import tqdm_notebook as tqdm
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold, KFold, TimeSeriesSplit
from sklearn.metrics import mean_squared_error, roc_auc_score
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn import metrics
import gc
from tqdm import tqdm_notebook

In [6]:
def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage(deep=True).sum() / 1024**2    
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)    
    end_mem = df.memory_usage(deep=True).sum() / 1024**2
    if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

In [7]:
numerics = ['int8', 'int16', 'int32', 'int64', 'float16', 'float32', 'float64']
numerical_columns = [c for c,v in dtypes.items() if v in numerics]
categorical_columns = [c for c,v in dtypes.items() if v not in numerics]

In [8]:
test = reduce_mem_usage(test)

Mem. usage decreased to 2049.02 Mb (16.5% reduction)


In [9]:
toDrop=['IsBeta','AutoSampleOptIn','Census_DeviceFamily',
        'Census_ProcessorManufacturerIdentifier','ProductName',
        'Census_IsPortableOperatingSystem','OsVer','Census_IsWIMBootEnabled',
        'Platform', 'PuaMode', 'Census_IsPenCapable', 'Census_IsFlightsDisabled',
        'Census_ProcessorClass', 'Census_IsFlightingInternal', 'Census_InternalBatteryType',
        'Census_IsAlwaysOnAlwaysConnectedCapable', 'UacLuaenable', 'Census_OSBuildNumber',
        'Census_OSArchitecture', 'Census_IsTouchEnabled', 'Census_ThresholdOptIn', 'Census_ThresholdOptIn', 'SMode',
        'HasTpm'
       ]
test.drop(toDrop, axis=1, inplace=True)

In [10]:
def rename_edition(x):
    x = x.lower()
    if 'core' in x:
        return 'Core'
    elif 'pro' in x:
        return 'pro'
    elif 'enterprise' in x:
        return 'Enterprise'
    elif 'server' in x:
        return 'Server'
    elif 'home' in x:
        return 'Home'
    elif 'education' in x:
        return 'Education'
    elif 'cloud' in x:
        return 'Cloud'
    else:
        return x
    
def encode_categorical_columns(x_train, columns, to_drop=None, sort=True):
    from tqdm import tqdm
    import pandas as pd

    to_drop = to_drop if to_drop is not None else []

    print("Training length:", x_train.shape[0])

    for col in tqdm(columns):
        if col in to_drop or col in ['MachineIdentifier', 'HasDetections']:
            continue

        # Factorize only on train data
        x_train[col], uniques = pd.factorize(x_train[col], sort=sort)
        x_train[col] = x_train[col].astype('int32')

        # Create a mapping dictionary from unique values
        category_map = {cat: idx for idx, cat in enumerate(uniques)}

        # Map test data using the train-derived mapping

        # Optional: replace missing (unseen categories in test) with -1 or 0
        print(f"{col} encoded. Train shape: {x_train.shape}")

    return x_train,category_map
    
def encode_categorical_columns_test(x_test, columns, category_map):
    from tqdm import tqdm
    import pandas as pd

    for col in tqdm(columns):
        if col in ['MachineIdentifier']:
            continue

        # Map test data using the train-derived mapping
        x_test[col] = x_test[col].map(category_map)
    return x_test


one_hot = {}
def add_features(df):
    global one_hot
    df['primary_drive_c_ratio'] = df['Census_SystemVolumeTotalCapacity']/ df['Census_PrimaryDiskTotalCapacity'];
    df['primary_drive_c_ratio'] = df['primary_drive_c_ratio'].astype('float32')
    
    df['non_primary_drive_MB'] = df['Census_PrimaryDiskTotalCapacity'] - df['Census_SystemVolumeTotalCapacity'];
    df['non_primary_drive_MB'] = df['non_primary_drive_MB'].astype('float32')
    
    df['aspect_ratio'] = df['Census_InternalPrimaryDisplayResolutionHorizontal']/ df['Census_InternalPrimaryDisplayResolutionVertical'];
    df['aspect_ratio'] = df['aspect_ratio'].astype('float16')

    df['dpi'] = ((df['Census_InternalPrimaryDisplayResolutionHorizontal'].astype('float32')**2 + df['Census_InternalPrimaryDisplayResolutionVertical'].astype('float32')**2)**.5)/(df['Census_InternalPrimaryDiagonalDisplaySizeInInches'].astype('float32'));
    df['dpi']=df['dpi'].astype('float32')

    df['Screen_Area'] = (df['aspect_ratio']* (df['Census_InternalPrimaryDiagonalDisplaySizeInInches']**2))/(df['aspect_ratio']**2 + 1);
    df['Screen_Area'] = df['Screen_Area'].astype('float32')

    df['AVProductsInstalled_slim']= df['AVProductsInstalled']
    df.loc[df['AVProductsInstalled'].isin([1, 2]) == False, 'AVProductsInstalled_slim'] = 3


    top_15 = df['AVProductStatesIdentifier'].value_counts(dropna=False, normalize=True).cumsum().index[:15]
    df['magic_4'] = 0
    df.loc[df['AVProductStatesIdentifier'].isin(top_15) == True, 'magic_4'] = 1
    
    df['Census_OSEdition'] = df['Census_OSEdition'].astype(str)
    df['Census_OSEdition'] = df['Census_OSEdition'].apply(rename_edition)
    df['Census_OSEdition'] = df['Census_OSEdition'].astype('category')

    return df

In [11]:
test['SmartScreen'].replace({ '0':'off', '00000000':'off','BLOCK':'Block', 'off':'Off', 
                              'on':'On', 'requireadmin' : 'RequireAdmin', 'Enabled' : 'on', 
                              'OFF' : 'Off', 'Promt' : 'Prompt', 'prompt' : 'Prompt', 'on' : 'On', 
                              'off' : 'Off', 'warn' : 'RequireAdmin', 'requireAdmin' : 'RequireAdmin', '&#x03;' : '&#x01;'}, inplace=True)


test.head()
gc.collect()
test=add_features(test)
test,category_map= encode_categorical_columns(test,  categorical_columns,to_drop=toDrop)
test = reduce_mem_usage(test)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test['SmartScreen'].replace({ '0':'off', '00000000':'off','BLOCK':'Block', 'off':'Off',
  test['SmartScreen'].replace({ '0':'off', '00000000':'off','BLOCK':'Block', 'off':'Off',


Training length: 7853253


 10%|████████▋                                                                              | 3/30 [00:00<00:01, 23.38it/s]

EngineVersion encoded. Train shape: (7853253, 66)


 20%|█████████████████▍                                                                     | 6/30 [00:00<00:01, 21.52it/s]

AppVersion encoded. Train shape: (7853253, 66)
AvSigVersion encoded. Train shape: (7853253, 66)
Processor encoded. Train shape: (7853253, 66)


 30%|██████████████████████████                                                             | 9/30 [00:00<00:01, 20.16it/s]

OsPlatformSubRelease encoded. Train shape: (7853253, 66)
OsBuildLab encoded. Train shape: (7853253, 66)


 40%|██████████████████████████████████▍                                                   | 12/30 [00:00<00:00, 20.67it/s]

SkuEdition encoded. Train shape: (7853253, 66)


 50%|███████████████████████████████████████████                                           | 15/30 [00:00<00:00, 18.31it/s]

SmartScreen encoded. Train shape: (7853253, 66)
Census_MDC2FormFactor encoded. Train shape: (7853253, 66)
Census_PrimaryDiskTypeName encoded. Train shape: (7853253, 66)


 60%|███████████████████████████████████████████████████▌                                  | 18/30 [00:00<00:00, 19.14it/s]

Census_ChassisTypeName encoded. Train shape: (7853253, 66)
Census_PowerPlatformRoleName encoded. Train shape: (7853253, 66)


 70%|████████████████████████████████████████████████████████████▏                         | 21/30 [00:01<00:00, 19.70it/s]

Census_OSVersion encoded. Train shape: (7853253, 66)


 80%|████████████████████████████████████████████████████████████████████▊                 | 24/30 [00:01<00:00, 19.85it/s]

Census_OSBranch encoded. Train shape: (7853253, 66)
Census_OSEdition encoded. Train shape: (7853253, 66)
Census_OSSkuName encoded. Train shape: (7853253, 66)


 90%|█████████████████████████████████████████████████████████████████████████████▍        | 27/30 [00:01<00:00, 17.50it/s]

Census_OSInstallTypeName encoded. Train shape: (7853253, 66)
Census_OSWUAutoUpdateOptionsName encoded. Train shape: (7853253, 66)
Census_GenuineStateName encoded. Train shape: (7853253, 66)


100%|██████████████████████████████████████████████████████████████████████████████████████| 30/30 [00:01<00:00, 18.33it/s]

Census_ActivationChannel encoded. Train shape: (7853253, 66)
Census_FlightRing encoded. Train shape: (7853253, 66)





Mem. usage decreased to 1950.61 Mb (20.5% reduction)


In [12]:
# Replace NaNs in other columns
for col in test.columns:
    if test[col].dtype == 'object':
        test[col] = test[col].fillna('NA')
    elif test[col].dtype == 'category':
        test[col] = test[col].cat.add_categories('NA').fillna('NA')
    else:
        test[col] = test[col].fillna(-1)

In [13]:
columns_to_freq_encode = ['AppVersion','AvSigVersion','Census_OSVersion','EngineVersion','OsBuildLab']

with open('/Users/dragonfruit/cs6140-ml-workspace/ML-project/Encoder pickle Files/Kaggle Encodings/freq_encoder_kaggle.pkl', 'rb') as f:
    freq_encoder = pickle.load(f)

# Then apply to test data
test[columns_to_freq_encode] = freq_encoder.transform(test[columns_to_freq_encode])

In [14]:
from tqdm import tqdm
import pickle
import numpy as np
import pandas as pd

columns_for_LE = ['ProductName','RtpStateBitfield','Platform','Processor','OsVer','OsBuild','OsSuite',
                   'OsPlatformSubRelease','OsBuildLab','SkuEdition','PuaMode','SmartScreen','Census_MDC2FormFactor',
                  'Census_DeviceFamily','Census_ProcessorClass','Census_PrimaryDiskTypeName','Census_ChassisTypeName',
                  'Census_PowerPlatformRoleName','Census_InternalBatteryType','Census_OSArchitecture',
                  'Census_OSBranch','Census_OSEdition','Census_OSSkuName','Census_OSInstallTypeName',
                  'Census_OSWUAutoUpdateOptionsName','Census_GenuineStateName','Census_ActivationChannel',
                  'Census_FlightRing']

with open('/Users/dragonfruit/cs6140-ml-workspace/ML-project/Encoder pickle Files/Kaggle Encodings/label_encoders_kaggle.pkl', 'rb') as f:
    label_encoders = pickle.load(f)

# Process each column
for col in tqdm(columns_for_LE, desc="Encoding columns"):
    if col not in test.columns:
        continue
    
    le = label_encoders[col]
    
    # Process in batches for memory efficiency
    # First, get unique values (dramatically reduces processing time)
    unique_vals = test[col].unique()
    print(f"Column {col}: Processing {len(unique_vals)} unique values instead of {len(test[col])} total values")
    
    # Create mapping dictionary for unique values
    mapping = {}
    for val in tqdm(unique_vals, desc=f"Mapping {col} values"):
        try:
            mapping[val] = le.transform([val])[0]
        except ValueError:
            mapping[val] = -1
    
    # Create a new array filled with default values
    result = np.full(len(test), -1, dtype=np.int32)
    
    # Apply mapping using numpy for speed
    for key, encoded_val in mapping.items():
        # Find all instances of this value and set them at once
        result[test[col] == key] = encoded_val
    
    # Assign processed values back to dataframe
    test[col] = result

Encoding columns:   0%|                                                                             | 0/28 [00:00<?, ?it/s]

Column RtpStateBitfield: Processing 9 unique values instead of 7853253 total values



Mapping RtpStateBitfield values: 100%|█████████████████████████████████████████████████████| 9/9 [00:00<00:00, 4147.30it/s][A
Encoding columns:   7%|████▉                                                                | 2/28 [00:00<00:04,  5.59it/s]

Column Processor: Processing 3 unique values instead of 7853253 total values



Mapping Processor values: 100%|████████████████████████████████████████████████████████████| 3/3 [00:00<00:00, 3605.42it/s][A


Column OsBuild: Processing 78 unique values instead of 7853253 total values



Mapping OsBuild values: 100%|████████████████████████████████████████████████████████████| 78/78 [00:00<00:00, 7917.99it/s][A
Encoding columns:  21%|██████████████▊                                                      | 6/28 [00:00<00:02,  8.70it/s]

Column OsSuite: Processing 13 unique values instead of 7853253 total values



Mapping OsSuite values: 100%|████████████████████████████████████████████████████████████| 13/13 [00:00<00:00, 4973.18it/s][A
Encoding columns:  25%|█████████████████▎                                                   | 7/28 [00:00<00:02,  8.19it/s]

Column OsPlatformSubRelease: Processing 9 unique values instead of 7853253 total values



Mapping OsPlatformSubRelease values: 100%|█████████████████████████████████████████████████| 9/9 [00:00<00:00, 4235.24it/s][A
Encoding columns:  29%|███████████████████▋                                                 | 8/28 [00:01<00:02,  7.39it/s]

Column OsBuildLab: Processing 371 unique values instead of 7853253 total values



Mapping OsBuildLab values: 100%|███████████████████████████████████████████████████████| 371/371 [00:00<00:00, 5961.29it/s][A
Encoding columns:  32%|██████████████████████▏                                              | 9/28 [00:02<00:08,  2.25it/s]

Column SkuEdition: Processing 8 unique values instead of 7853253 total values



Mapping SkuEdition values: 100%|███████████████████████████████████████████████████████████| 8/8 [00:00<00:00, 3809.54it/s][A
Encoding columns:  36%|████████████████████████▎                                           | 10/28 [00:02<00:06,  2.74it/s]

Column SmartScreen: Processing 16 unique values instead of 7853253 total values



Mapping SmartScreen values: 100%|████████████████████████████████████████████████████████| 16/16 [00:00<00:00, 4479.30it/s][A
Encoding columns:  43%|█████████████████████████████▏                                      | 12/28 [00:02<00:03,  4.07it/s]

Column Census_MDC2FormFactor: Processing 14 unique values instead of 7853253 total values



Mapping Census_MDC2FormFactor values: 100%|██████████████████████████████████████████████| 14/14 [00:00<00:00, 5884.97it/s][A
Encoding columns:  46%|███████████████████████████████▌                                    | 13/28 [00:02<00:03,  4.41it/s]

Column Census_PrimaryDiskTypeName: Processing 5 unique values instead of 7853253 total values



Mapping Census_PrimaryDiskTypeName values: 100%|███████████████████████████████████████████| 5/5 [00:00<00:00, 3066.01it/s][A
Encoding columns:  57%|██████████████████████████████████████▊                             | 16/28 [00:03<00:01,  7.27it/s]

Column Census_ChassisTypeName: Processing 49 unique values instead of 7853253 total values



Mapping Census_ChassisTypeName values: 100%|█████████████████████████████████████████████| 49/49 [00:00<00:00, 5418.57it/s][A


Column Census_PowerPlatformRoleName: Processing 11 unique values instead of 7853253 total values



Mapping Census_PowerPlatformRoleName values: 100%|███████████████████████████████████████| 11/11 [00:00<00:00, 3442.06it/s][A
Encoding columns:  64%|███████████████████████████████████████████▋                        | 18/28 [00:03<00:01,  6.20it/s]

Column Census_OSBranch: Processing 29 unique values instead of 7853253 total values



Mapping Census_OSBranch values: 100%|████████████████████████████████████████████████████| 29/29 [00:00<00:00, 4231.07it/s][A
Encoding columns:  75%|███████████████████████████████████████████████████                 | 21/28 [00:03<00:00,  7.97it/s]

Column Census_OSEdition: Processing 11 unique values instead of 7853253 total values



Mapping Census_OSEdition values: 100%|███████████████████████████████████████████████████| 11/11 [00:00<00:00, 4144.57it/s][A


Column Census_OSSkuName: Processing 31 unique values instead of 7853253 total values



Mapping Census_OSSkuName values: 100%|███████████████████████████████████████████████████| 31/31 [00:00<00:00, 5181.45it/s][A
Encoding columns:  82%|███████████████████████████████████████████████████████▊            | 23/28 [00:04<00:00,  7.03it/s]

Column Census_OSInstallTypeName: Processing 9 unique values instead of 7853253 total values



Mapping Census_OSInstallTypeName values: 100%|█████████████████████████████████████████████| 9/9 [00:00<00:00, 3413.40it/s][A
Encoding columns:  86%|██████████████████████████████████████████████████████████▎         | 24/28 [00:04<00:00,  6.59it/s]

Column Census_OSWUAutoUpdateOptionsName: Processing 6 unique values instead of 7853253 total values



Mapping Census_OSWUAutoUpdateOptionsName values: 100%|█████████████████████████████████████| 6/6 [00:00<00:00, 5058.46it/s][A
Encoding columns:  89%|████████████████████████████████████████████████████████████▋       | 25/28 [00:04<00:00,  6.50it/s]

Column Census_GenuineStateName: Processing 6 unique values instead of 7853253 total values



Mapping Census_GenuineStateName values: 100%|██████████████████████████████████████████████| 6/6 [00:00<00:00, 4958.78it/s][A
Encoding columns:  93%|███████████████████████████████████████████████████████████████▏    | 26/28 [00:04<00:00,  6.98it/s]

Column Census_ActivationChannel: Processing 6 unique values instead of 7853253 total values



Mapping Census_ActivationChannel values: 100%|█████████████████████████████████████████████| 6/6 [00:00<00:00, 5263.72it/s][A
Encoding columns:  96%|█████████████████████████████████████████████████████████████████▌  | 27/28 [00:04<00:00,  6.81it/s]

Column Census_FlightRing: Processing 11 unique values instead of 7853253 total values



Mapping Census_FlightRing values: 100%|██████████████████████████████████████████████████| 11/11 [00:00<00:00, 5854.99it/s][A
Encoding columns: 100%|████████████████████████████████████████████████████████████████████| 28/28 [00:04<00:00,  5.84it/s]


In [15]:
# Convert float64 and object columns to float32
for col in test.columns:
    if test[col].dtype == 'float64':
        test[col] = test[col].astype('float32')
    elif test[col].dtype == 'object':
        test[col] = test[col].astype('float32')

In [16]:
# Load the scaler from the pickle file
with open('/Users/dragonfruit/cs6140-ml-workspace/ML-project/Encoder pickle Files/Kaggle Encodings/scaler_kaggle.pkl', 'rb') as f:
    scaler = pickle.load(f)

y = test['MachineIdentifier']
test = test.drop(columns=['MachineIdentifier'])

test.replace([np.inf, -np.inf], -1, inplace=True)
scaled_test = scaler.transform(test)

In [17]:
test_data = pd.DataFrame(scaled_test, columns=test.columns)


def optimize_dataframe(df):
    for col in df.columns:
        col_data = df[col]
        
        # Skip non-numeric columns
        if not np.issubdtype(col_data.dtype, np.number):
            continue

        # Check if all values are integers (i.e., no decimal part)
        if np.all(col_data.dropna() == col_data.dropna().astype(int)):
            df[col] = pd.to_numeric(col_data, downcast="integer")
        else:
            df[col] = pd.to_numeric(col_data, downcast="float")
    
    return df

# Apply optimization
test_data = optimize_dataframe(test_data)

# Add Machine Identifier back
test_data['MachineIdentifier'] = y

# Check memory usage before and after
print(f"Memory usage after optimization: {test_data.memory_usage(deep=True).sum() / 1024**2:.2f} MB")

Memory usage after optimization: 2901.77 MB


In [18]:
#test loader on X
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

X=test_data.drop(columns=['MachineIdentifier'])
y = test_data['MachineIdentifier']

X_test_tensor = torch.tensor(X.values, dtype=torch.float32)
test_dataset = TensorDataset(X_test_tensor)
batch_size = 2048
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [19]:
class MLP(nn.Module):
    def __init__(self, input_size):
        super(MLP, self).__init__()
        
        self.model = nn.Sequential(
            nn.Linear(input_size, 256), 
            nn.BatchNorm1d(256),
            nn.ReLU(),
            nn.Dropout(0.3),                            
            
            nn.Linear(256, 128),
            nn.BatchNorm1d(128),
            nn.ReLU(),
            nn.Dropout(0.3),

            nn.Linear(128, 64),
            nn.BatchNorm1d(64),
            nn.ReLU(),
            nn.Dropout(0.3),

            nn.Linear(64, 1) 
        )

    def forward(self, x):
        return self.model(x)

In [20]:
model_dir = "/Users/dragonfruit/cs6140-ml-workspace/ML-project/Saved Models/simplenn_kaggle_encodings.pth"
model = MLP(X_test_tensor.shape[1])

# Add map_location parameter to handle CUDA-saved model
model.load_state_dict(torch.load(model_dir, map_location=torch.device('cpu')))

# Check if MPS is available
if torch.backends.mps.is_available():
    device = torch.device("mps")
    model = model.to(device)
    # Make sure your input data is also on the same device
    X_test_tensor = X_test_tensor.to(device)
else:
    device = torch.device("cpu")
    model = model.to(device)

In [21]:
model.eval()

predictions = []
with torch.no_grad():
    for data in tqdm(test_loader):
        # Move inputs to MPS device
        inputs = data[0].to(device)
        
        # Run inference on MPS
        outputs = model(inputs)
        
        # Apply sigmoid to convert logits to probabilities
        probs = torch.sigmoid(outputs)
        
        # Move probabilities back to CPU and convert to numpy
        predictions.append(probs.cpu().numpy())

predictions = np.concatenate(predictions)
probs = predictions.flatten()

100%|██████████████████████████████████████████████████████████████████████████████████| 3835/3835 [00:49<00:00, 77.89it/s]


In [23]:
submission = pd.DataFrame(
    {'MachineIdentifier': y, 'HasDetections': probs})
submission.to_csv('./submissions/hl_2_advanced_encodings_submission.csv', index=False)

In [24]:
pca_features=['Census_OSInstallLanguageIdentifier', 'OsPlatformSubRelease', 'Census_InternalPrimaryDisplayResolutionHorizontal', 'dpi', 'Census_InternalPrimaryDisplayResolutionVertical', 'CountryIdentifier', 'IsSxsPassiveMode', 'RtpStateBitfield', 'OrganizationIdentifier', 'CityIdentifier', 'GeoNameIdentifier', 'Census_InternalBatteryNumberOfCharges', 'AvSigVersion', 'Census_HasOpticalDiskDrive', 'Census_ChassisTypeName', 'Census_OSBranch', 'Screen_Area', 'Census_GenuineStateName', 'Census_OSBuildRevision', 'Census_IsVirtualDevice', 'magic_4', 'LocaleEnglishNameIdentifier', 'Census_FirmwareVersionIdentifier', 'Census_OEMModelIdentifier', 'Census_MDC2FormFactor', 'Census_OSWUAutoUpdateOptionsName', 'Wdft_RegionIdentifier', 'AppVersion', 'Wdft_IsGamer', 'Firewall', 'Census_ProcessorModelIdentifier', 'Census_TotalPhysicalRAM', 'Census_FlightRing', 'IeVerIdentifier', 'Census_OEMNameIdentifier', 'DefaultBrowsersIdentifier', 'primary_drive_c_ratio', 'Census_PowerPlatformRoleName', 'Census_ProcessorCoreCount', 'Census_FirmwareManufacturerIdentifier', 'AVProductStatesIdentifier', 'SmartScreen', 'Census_OSInstallTypeName', 'Census_OSVersion', 'EngineVersion', 'aspect_ratio', 'IsProtected', 'AVProductsEnabled', 'Census_IsSecureBootEnabled', 'OsBuildLab', 'Processor', 'Census_ActivationChannel', 'OsBuild', 'Census_SystemVolumeTotalCapacity', 'Census_PrimaryDiskTypeName']

X_test_tensor = torch.tensor(X[pca_features].values, dtype=torch.float32)
test_dataset = TensorDataset(X_test_tensor)
batch_size = 2048
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

model_dir = "/Users/dragonfruit/cs6140-ml-workspace/ML-project/Saved Models/simplenn_kaggle_encodings_pca.pth"
model = MLP(X_test_tensor.shape[1])

# Add map_location parameter to handle CUDA-saved model
model.load_state_dict(torch.load(model_dir, map_location=torch.device('cpu')))

# Check if MPS is available
if torch.backends.mps.is_available():
    device = torch.device("mps")
    model = model.to(device)
    # Make sure your input data is also on the same device
    X_test_tensor = X_test_tensor.to(device)
else:
    device = torch.device("cpu")
    model = model.to(device)

In [25]:
model.eval()

predictions = []
with torch.no_grad():
    for data in tqdm(test_loader):
        # Move inputs to MPS device
        inputs = data[0].to(device)
        
        # Run inference on MPS
        outputs = model(inputs)
        
        # Apply sigmoid to convert logits to probabilities
        probs = torch.sigmoid(outputs)
        
        # Move probabilities back to CPU and convert to numpy
        predictions.append(probs.cpu().numpy())

predictions = np.concatenate(predictions)
probs = predictions.flatten()

100%|██████████████████████████████████████████████████████████████████████████████████| 3835/3835 [00:47<00:00, 79.98it/s]


In [26]:
submission = pd.DataFrame(
    {'MachineIdentifier': y, 'HasDetections': probs})
submission.to_csv('./submissions/hl_2_advanced_encodings_pca_submission.csv', index=False)

In [27]:
lda_features=[
    'Firewall', 'Census_OEMModelIdentifier', 'Census_FirmwareManufacturerIdentifier', 'Census_OEMNameIdentifier',
    'IeVerIdentifier', 'Census_PrimaryDiskTypeName', 'IsSxsPassiveMode', 'OsBuildLab', 'AVProductsEnabled',
    'Census_InternalPrimaryDisplayResolutionVertical', 'Screen_Area', 'Census_MDC2FormFactor',
    'Wdft_RegionIdentifier', 'AppVersion', 'AvSigVersion', 'Census_ProcessorModelIdentifier',
    'Census_ProcessorCoreCount', 'Census_FlightRing', 'Census_ActivationChannel', 'Census_SystemVolumeTotalCapacity',
    'Census_OSInstallLanguageIdentifier', 'aspect_ratio', 'RtpStateBitfield', 'Census_IsSecureBootEnabled',
    'Census_GenuineStateName', 'OsSuite', 'LocaleEnglishNameIdentifier', 'OsPlatformSubRelease',
    'Census_TotalPhysicalRAM', 'Census_PowerPlatformRoleName', 'magic_4', 'Census_OSUILocaleIdentifier',
    'Census_HasOpticalDiskDrive', 'DefaultBrowsersIdentifier', 'Census_InternalPrimaryDiagonalDisplaySizeInInches',
    'Census_OSBranch', 'Census_OSVersion', 'AVProductsInstalled_slim', 'OsBuild',
    'Census_InternalBatteryNumberOfCharges', 'Wdft_IsGamer', 'Census_OSInstallTypeName', 'SkuEdition',
    'IsProtected', 'Census_IsVirtualDevice', 'primary_drive_c_ratio', 'Processor',
    'Census_InternalPrimaryDisplayResolutionHorizontal', 'SmartScreen', 'AVProductStatesIdentifier',
    'EngineVersion', 'dpi', 'AVProductsInstalled', 'Census_OSSkuName', 'Census_OSEdition'
]

X_test_tensor = torch.tensor(X[lda_features].values, dtype=torch.float32)
test_dataset = TensorDataset(X_test_tensor)
batch_size = 2048
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

model_dir = "/Users/dragonfruit/cs6140-ml-workspace/ML-project/Saved Models/lda_fs_model.pth"
model = MLP(X_test_tensor.shape[1])

# Add map_location parameter to handle CUDA-saved model
model.load_state_dict(torch.load(model_dir, map_location=torch.device('cpu')))

# Check if MPS is available
if torch.backends.mps.is_available():
    device = torch.device("mps")
    model = model.to(device)
    # Make sure your input data is also on the same device
    X_test_tensor = X_test_tensor.to(device)
else:
    device = torch.device("cpu")
    model = model.to(device)

In [28]:
model.eval()

predictions = []
with torch.no_grad():
    for data in tqdm(test_loader):
        # Move inputs to MPS device
        inputs = data[0].to(device)
        
        # Run inference on MPS
        outputs = model(inputs)
        
        # Apply sigmoid to convert logits to probabilities
        probs = torch.sigmoid(outputs)
        
        # Move probabilities back to CPU and convert to numpy
        predictions.append(probs.cpu().numpy())

predictions = np.concatenate(predictions)
probs = predictions.flatten()

100%|██████████████████████████████████████████████████████████████████████████████████| 3835/3835 [00:46<00:00, 82.72it/s]


In [29]:
submission = pd.DataFrame(
    {'MachineIdentifier': y, 'HasDetections': probs})
submission.to_csv('./submissions/hl_2_advanced_encodings_lda_submission.csv', index=False)

In [30]:
# Define model architecture (same as yours)
class BinaryClassifier(nn.Module):
    def __init__(self, input_dim):
        super(BinaryClassifier, self).__init__()
        self.layers = nn.Sequential(
            nn.Linear(input_dim, 512),
            nn.ReLU(),
            nn.BatchNorm1d(512),
            nn.Dropout(0.3),

            nn.Linear(512, 256),
            nn.ReLU(),
            nn.BatchNorm1d(256),
            nn.Dropout(0.3),

            nn.Linear(256, 128),
            nn.ReLU(),
            nn.BatchNorm1d(128),
            nn.Dropout(0.4),

            nn.Linear(128, 64),
            nn.ReLU(),
            nn.BatchNorm1d(64),
            nn.Dropout(0.4),

            nn.Linear(64, 1)
        )

    def forward(self, x):
        return self.layers(x)

In [31]:
X_test_tensor = torch.tensor(X.values, dtype=torch.float32)
test_dataset = TensorDataset(X_test_tensor)
batch_size = 2048
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

model_dir = "/Users/dragonfruit/cs6140-ml-workspace/ML-project/pytorch-models/deepernn_kaggle_encodings.pth"
model = BinaryClassifier(X_test_tensor.shape[1])

# Add map_location parameter to handle CUDA-saved model
model.load_state_dict(torch.load(model_dir, map_location=torch.device('cpu')))

# Check if MPS is available
if torch.backends.mps.is_available():
    device = torch.device("mps")
    model = model.to(device)
    # Make sure your input data is also on the same device
    X_test_tensor = X_test_tensor.to(device)
else:
    device = torch.device("cpu")
    model = model.to(device)

In [32]:
model.eval()

predictions = []
with torch.no_grad():
    for data in tqdm(test_loader):
        # Move inputs to MPS device
        inputs = data[0].to(device)
        
        # Run inference on MPS
        outputs = model(inputs)
        
        # Apply sigmoid to convert logits to probabilities
        probs = torch.sigmoid(outputs)
        
        # Move probabilities back to CPU and convert to numpy
        predictions.append(probs.cpu().numpy())

predictions = np.concatenate(predictions)
probs = predictions.flatten()

100%|██████████████████████████████████████████████████████████████████████████████████| 3835/3835 [00:50<00:00, 76.02it/s]


In [33]:
submission = pd.DataFrame(
    {'MachineIdentifier': y, 'HasDetections': probs})
submission.to_csv('./submissions/hl_3_advanced_encodings_submission.csv', index=False)

In [35]:
pca_features = [
    'Census_OSInstallLanguageIdentifier', 
    'OsPlatformSubRelease', 
    'Census_InternalPrimaryDisplayResolutionHorizontal', 
    'dpi', 
    'Census_InternalPrimaryDisplayResolutionVertical', 
    'CountryIdentifier', 
    'IsSxsPassiveMode', 
    'RtpStateBitfield', 
    'OrganizationIdentifier', 
    'CityIdentifier', 
    'GeoNameIdentifier', 
    'Census_InternalBatteryNumberOfCharges', 
    'AvSigVersion', 
    'Census_HasOpticalDiskDrive', 
    'Census_ChassisTypeName', 
    'Census_OSBranch', 
    'Screen_Area', 
    'Census_GenuineStateName', 
    'Census_OSBuildRevision', 
    'Census_IsVirtualDevice', 
    'magic_4', 
    'LocaleEnglishNameIdentifier', 
    'Census_FirmwareVersionIdentifier', 
    'Census_OEMModelIdentifier', 
    'Census_MDC2FormFactor', 
    'Census_OSWUAutoUpdateOptionsName', 
    'Wdft_RegionIdentifier', 
    'AppVersion', 
    'Wdft_IsGamer', 
    'Firewall', 
    'Census_ProcessorModelIdentifier', 
    'Census_TotalPhysicalRAM', 
    'Census_FlightRing', 
    'IeVerIdentifier', 
    'Census_OEMNameIdentifier', 
    'DefaultBrowsersIdentifier', 
    'primary_drive_c_ratio', 
    'Census_PowerPlatformRoleName', 
    'Census_ProcessorCoreCount', 
    'Census_FirmwareManufacturerIdentifier', 
    'AVProductStatesIdentifier', 
    'SmartScreen', 
    'Census_OSInstallTypeName',
    'Census_OSVersion', 
    'EngineVersion', 
    'aspect_ratio', 
    'IsProtected', 
    'AVProductsEnabled', 
    'Census_IsSecureBootEnabled', 
    'OsBuildLab', 
    'Processor', 
    'Census_ActivationChannel', 
    'OsBuild', 
    'Census_SystemVolumeTotalCapacity', 
    'Census_PrimaryDiskTypeName'
]

X_test_tensor = torch.tensor(X[pca_features].values, dtype=torch.float32)
test_dataset = TensorDataset(X_test_tensor)
batch_size = 2048
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

model_dir = "/Users/dragonfruit/cs6140-ml-workspace/ML-project/pytorch-models/deepernn_kaggle_encodings_pca.pth"
model = BinaryClassifier(X_test_tensor.shape[1])

# Add map_location parameter to handle CUDA-saved model
model.load_state_dict(torch.load(model_dir, map_location=torch.device('cpu')))

# Check if MPS is available
if torch.backends.mps.is_available():
    device = torch.device("mps")
    model = model.to(device)
    # Make sure your input data is also on the same device
    X_test_tensor = X_test_tensor.to(device)
else:
    device = torch.device("cpu")
    model = model.to(device)

In [36]:
model.eval()

predictions = []
with torch.no_grad():
    for data in tqdm(test_loader):
        # Move inputs to MPS device
        inputs = data[0].to(device)
        
        # Run inference on MPS
        outputs = model(inputs)
        
        # Apply sigmoid to convert logits to probabilities
        probs = torch.sigmoid(outputs)
        
        # Move probabilities back to CPU and convert to numpy
        predictions.append(probs.cpu().numpy())

predictions = np.concatenate(predictions)
probs = predictions.flatten()

100%|██████████████████████████████████████████████████████████████████████████████████| 3835/3835 [00:48<00:00, 79.30it/s]


In [37]:
submission = pd.DataFrame(
    {'MachineIdentifier': y, 'HasDetections': probs})
submission.to_csv('./submissions/hl_3_advanced_encodings_pca_submission.csv', index=False)

In [38]:
lda_features = [
    'Firewall', 
    'Census_OEMModelIdentifier', 
    'Census_FirmwareManufacturerIdentifier', 
    'Census_OEMNameIdentifier', 
    'IeVerIdentifier', 
    'Census_PrimaryDiskTypeName', 
    'IsSxsPassiveMode', 
    'OsBuildLab', 
    'AVProductsEnabled', 
    'Census_InternalPrimaryDisplayResolutionVertical', 
    'Screen_Area', 
    'Census_MDC2FormFactor', 
    'Wdft_RegionIdentifier', 
    'AppVersion', 
    'AvSigVersion', 
    'Census_ProcessorModelIdentifier', 
    'Census_ProcessorCoreCount', 
    'Census_FlightRing', 
    'Census_ActivationChannel', 
    'Census_SystemVolumeTotalCapacity', 
    'Census_OSInstallLanguageIdentifier', 
    'aspect_ratio', 
    'RtpStateBitfield', 
    'Census_IsSecureBootEnabled', 
    'Census_GenuineStateName', 
    'OsSuite', 
    'LocaleEnglishNameIdentifier', 
    'OsPlatformSubRelease', 
    'Census_TotalPhysicalRAM', 
    'Census_PowerPlatformRoleName', 
    'magic_4', 
    'Census_OSUILocaleIdentifier', 
    'Census_HasOpticalDiskDrive', 
    'DefaultBrowsersIdentifier', 
    'Census_InternalPrimaryDiagonalDisplaySizeInInches', 
    'Census_OSBranch', 
    'Census_OSVersion', 
    'AVProductsInstalled_slim', 
    'OsBuild', 
    'Census_InternalBatteryNumberOfCharges', 
    'Wdft_IsGamer', 
    'Census_OSInstallTypeName', 
    'SkuEdition', 
    'IsProtected', 
    'Census_IsVirtualDevice', 
    'primary_drive_c_ratio', 
    'Processor', 
    'Census_InternalPrimaryDisplayResolutionHorizontal', 
    'SmartScreen', 
    'AVProductStatesIdentifier', 
    'EngineVersion', 
    'dpi', 
    'AVProductsInstalled', 
    'Census_OSSkuName', 
    'Census_OSEdition'
]

X_test_tensor = torch.tensor(X[lda_features].values, dtype=torch.float32)
test_dataset = TensorDataset(X_test_tensor)
batch_size = 2048
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

model_dir = "/Users/dragonfruit/cs6140-ml-workspace/ML-project/pytorch-models/deepernn_kaggle_encodings_lda.pth"
model = BinaryClassifier(X_test_tensor.shape[1])

# Add map_location parameter to handle CUDA-saved model
model.load_state_dict(torch.load(model_dir, map_location=torch.device('cpu')))

# Check if MPS is available
if torch.backends.mps.is_available():
    device = torch.device("mps")
    model = model.to(device)
    # Make sure your input data is also on the same device
    X_test_tensor = X_test_tensor.to(device)
else:
    device = torch.device("cpu")
    model = model.to(device)

In [39]:
model.eval()

predictions = []
with torch.no_grad():
    for data in tqdm(test_loader):
        # Move inputs to MPS device
        inputs = data[0].to(device)
        
        # Run inference on MPS
        outputs = model(inputs)
        
        # Apply sigmoid to convert logits to probabilities
        probs = torch.sigmoid(outputs)
        
        # Move probabilities back to CPU and convert to numpy
        predictions.append(probs.cpu().numpy())

predictions = np.concatenate(predictions)
probs = predictions.flatten()

100%|██████████████████████████████████████████████████████████████████████████████████| 3835/3835 [00:52<00:00, 72.85it/s]


In [40]:
submission = pd.DataFrame(
    {'MachineIdentifier': y, 'HasDetections': probs})
submission.to_csv('./submissions/hl_3_advanced_encodings_lda_submission.csv', index=False)

In [43]:
# Define complex model
class EnhancedBinaryClassifier(nn.Module):
    def __init__(self, input_size):
        super(EnhancedBinaryClassifier, self).__init__()
        self.layers = nn.Sequential(
            nn.Linear(input_size, 512),
            nn.ReLU(),
            nn.BatchNorm1d(512),
            nn.Dropout(0.3),

            nn.Linear(512, 512),
            nn.ReLU(),
            nn.BatchNorm1d(512),
            nn.Dropout(0.3),

            nn.Linear(512, 256),
            nn.ReLU(),
            nn.BatchNorm1d(256),
            nn.Dropout(0.3),
            
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.BatchNorm1d(128),
            nn.Dropout(0.4),

            nn.Linear(128, 64),
            nn.ReLU(),
            nn.BatchNorm1d(64),
            nn.Dropout(0.4),

            nn.Linear(64, 1)
        )

    def forward(self, x):
        return self.layers(x)

In [44]:
X_test_tensor = torch.tensor(X.values, dtype=torch.float32)
test_dataset = TensorDataset(X_test_tensor)
batch_size = 2048
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

model_dir = "/Users/dragonfruit/cs6140-ml-workspace/ML-project/pytorch-models/enhanced_deepernn_kaggle_encodings.pth"
model = EnhancedBinaryClassifier(X_test_tensor.shape[1])

# Add map_location parameter to handle CUDA-saved model
model.load_state_dict(torch.load(model_dir, map_location=torch.device('cpu')))

# Check if MPS is available
if torch.backends.mps.is_available():
    device = torch.device("mps")
    model = model.to(device)
    # Make sure your input data is also on the same device
    X_test_tensor = X_test_tensor.to(device)
else:
    device = torch.device("cpu")
    model = model.to(device)

In [45]:
model.eval()

predictions = []
with torch.no_grad():
    for data in tqdm(test_loader):
        # Move inputs to MPS device
        inputs = data[0].to(device)
        
        # Run inference on MPS
        outputs = model(inputs)
        
        # Apply sigmoid to convert logits to probabilities
        probs = torch.sigmoid(outputs)
        
        # Move probabilities back to CPU and convert to numpy
        predictions.append(probs.cpu().numpy())

predictions = np.concatenate(predictions)
probs = predictions.flatten()

100%|██████████████████████████████████████████████████████████████████████████████████| 3835/3835 [01:01<00:00, 61.89it/s]


In [46]:
submission = pd.DataFrame(
    {'MachineIdentifier': y, 'HasDetections': probs})
submission.to_csv('./submissions/hl_4_advanced_encodings_submission.csv', index=False)