In [15]:
import os
import pickle

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler


In [16]:
# Column -> dtype mapping handed to pd.read_csv so every column is parsed
# straight into a compact dtype. Keys are listed in the same order as before;
# all column names are valid identifiers, so the keyword form of dict() works.
dtypes = dict(
    MachineIdentifier='category',
    ProductName='category',
    EngineVersion='category',
    AppVersion='category',
    AvSigVersion='category',
    IsBeta='int8',
    RtpStateBitfield='float32',
    IsSxsPassiveMode='int8',
    DefaultBrowsersIdentifier='float32',
    AVProductStatesIdentifier='float32',
    AVProductsInstalled='float32',
    AVProductsEnabled='float32',
    HasTpm='int8',
    CountryIdentifier='int16',
    CityIdentifier='float32',
    OrganizationIdentifier='float32',
    GeoNameIdentifier='float32',
    LocaleEnglishNameIdentifier='int8',
    Platform='category',
    Processor='category',
    OsVer='category',
    OsBuild='int16',
    OsSuite='int16',
    OsPlatformSubRelease='category',
    OsBuildLab='category',
    SkuEdition='category',
    IsProtected='float32',
    AutoSampleOptIn='int8',
    PuaMode='category',
    SMode='float32',
    IeVerIdentifier='float32',
    SmartScreen='category',
    Firewall='float32',
    UacLuaenable='float32',
    Census_MDC2FormFactor='category',
    Census_DeviceFamily='category',
    Census_OEMNameIdentifier='float32',
    Census_OEMModelIdentifier='float32',
    Census_ProcessorCoreCount='float32',
    Census_ProcessorManufacturerIdentifier='float32',
    Census_ProcessorModelIdentifier='float32',
    Census_ProcessorClass='category',
    Census_PrimaryDiskTotalCapacity='float32',
    Census_PrimaryDiskTypeName='category',
    Census_SystemVolumeTotalCapacity='float32',
    Census_HasOpticalDiskDrive='int8',
    Census_TotalPhysicalRAM='float32',
    Census_ChassisTypeName='category',
    Census_InternalPrimaryDiagonalDisplaySizeInInches='float32',
    Census_InternalPrimaryDisplayResolutionHorizontal='float32',
    Census_InternalPrimaryDisplayResolutionVertical='float32',
    Census_PowerPlatformRoleName='category',
    Census_InternalBatteryType='category',
    Census_InternalBatteryNumberOfCharges='float32',
    Census_OSVersion='category',
    Census_OSArchitecture='category',
    Census_OSBranch='category',
    Census_OSBuildNumber='int16',
    Census_OSBuildRevision='int32',
    Census_OSEdition='category',
    Census_OSSkuName='category',
    Census_OSInstallTypeName='category',
    Census_OSInstallLanguageIdentifier='float32',
    Census_OSUILocaleIdentifier='int16',
    Census_OSWUAutoUpdateOptionsName='category',
    Census_IsPortableOperatingSystem='int8',
    Census_GenuineStateName='category',
    Census_ActivationChannel='category',
    Census_IsFlightingInternal='float32',
    Census_IsFlightsDisabled='float32',
    Census_FlightRing='category',
    Census_ThresholdOptIn='float32',
    Census_FirmwareManufacturerIdentifier='float32',
    Census_FirmwareVersionIdentifier='float32',
    Census_IsSecureBootEnabled='int8',
    Census_IsWIMBootEnabled='float32',
    Census_IsVirtualDevice='float32',
    Census_IsTouchEnabled='int8',
    Census_IsPenCapable='int8',
    Census_IsAlwaysOnAlwaysConnectedCapable='float32',
    Wdft_IsGamer='float32',
    Wdft_RegionIdentifier='float32',
    HasDetections='int8',
)

In [17]:
# Location of the raw training CSV. The path used so far is kept as the
# default so existing runs are unaffected; set the MALWARE_TRAIN_CSV
# environment variable to point the notebook at the file on another machine
# instead of editing this cell.
TRAIN_CSV_PATH = os.environ.get(
    'MALWARE_TRAIN_CSV',
    r'C:\Users\jithi\OneDrive\Desktop\ML project\microsoft-malware-prediction\train.csv',
)

# Parse every column directly into the compact dtype declared in `dtypes`.
train = pd.read_csv(TRAIN_CSV_PATH, dtype=dtypes)

In [18]:
def frequency_encoding(df, feature):
    """Build a ``{value: frequency rank}`` mapping for one column.

    Values are ranked in the order returned by ``Series.value_counts`` (rank 0
    is the most frequent value). Every value that occurs exactly once shares a
    single "rare" label: the largest rank among the remaining values plus one.

    The mapping is derived from the value counts directly rather than from the
    column names produced by ``reset_index()``, because those names differ
    between pandas releases. When every value is a singleton the shared label
    is 0 instead of NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column to encode.
    feature : str
        Name of the column in ``df``.

    Returns
    -------
    dict
        Maps each counted value to its label. Labels are floats when at least
        one singleton was collapsed into the rare label and ints otherwise.
        Values skipped by ``value_counts`` (missing values, by default) are
        not keys of the mapping.
    """
    counts = df[feature].value_counts()
    is_singleton = counts.to_numpy() == 1
    ranks = np.arange(len(counts))

    if is_singleton.any():
        # Switch to floats only when a rare label is needed, mirroring the
        # int/float behaviour callers have seen so far.
        ranks = ranks.astype('float64')
        kept = ranks[~is_singleton]
        # With no non-singleton value left there is no "largest rank"; start at 0.
        rare_label = kept.max() + 1 if kept.size else 0.0
        ranks[is_singleton] = rare_label

    return dict(zip(counts.index, ranks.tolist()))


In [19]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pickle
import category_encoders as ce
from sklearn.preprocessing import LabelEncoder

# Assuming train is already defined

# Columns removed from the frame before any encoding.
columns_to_drop = ['MachineIdentifier']

# Columns replaced by the number of times each value occurs (CountEncoder).
columns_to_freq_encode = ['AppVersion','AvSigVersion','Census_OSVersion','EngineVersion','OsBuildLab']

# Columns replaced by integer labels (LabelEncoder).
# 'OsBuildLab' is intentionally NOT listed here: it is already count-encoded
# above, and label-encoding it as well would encode the counts a second time
# and store a LabelEncoder fitted on counts rather than on the original values.
columns_for_LE = ['ProductName','RtpStateBitfield','Platform','Processor','OsVer','OsBuild','OsSuite',
                   'OsPlatformSubRelease','SkuEdition','PuaMode','SmartScreen','Census_MDC2FormFactor',
                  'Census_DeviceFamily','Census_ProcessorClass','Census_PrimaryDiskTypeName','Census_ChassisTypeName',
                  'Census_PowerPlatformRoleName','Census_InternalBatteryType','Census_OSArchitecture',
                  'Census_OSBranch','Census_OSEdition','Census_OSSkuName','Census_OSInstallTypeName',
                  'Census_OSWUAutoUpdateOptionsName','Census_GenuineStateName','Census_ActivationChannel',
                  'Census_FlightRing']

# Text columns that are lower-cased before encoding.
columns_with_strings = ['ProductName','Platform','Processor','OsPlatformSubRelease','OsBuildLab','SkuEdition',
                        'PuaMode','SmartScreen','Census_MDC2FormFactor','Census_DeviceFamily','Census_PrimaryDiskTypeName',
                        'Census_ChassisTypeName','Census_PowerPlatformRoleName','Census_InternalBatteryType',
                        'Census_OSArchitecture','Census_OSBranch','Census_OSEdition','Census_OSSkuName','Census_OSInstallTypeName',
                        'Census_OSWUAutoUpdateOptionsName','Census_GenuineStateName','Census_ActivationChannel',
                        'Census_FlightRing']

# Remove every column named in columns_to_drop from `train` (in place).
train.drop(columns=list(columns_to_drop), inplace=True)

# Lower-case the text columns so values that differ only by letter case
# end up as the same value before they are encoded.
for text_col in columns_with_strings:
    lowered = train[text_col].str.lower()
    train[text_col] = lowered

# Frequency-encode the columns in columns_to_freq_encode with
# category_encoders' CountEncoder and write the result back over them.
# NOTE(review): the encoder is fitted on the whole frame, i.e. before the
# train/test split performed further down in this cell, so the rows that
# later form the hold-out set also contribute to the counts. Confirm that
# this is intended.
encoder = ce.CountEncoder(cols=columns_to_freq_encode)
freq_encoded = encoder.fit_transform(train[columns_to_freq_encode])
train[columns_to_freq_encode] = freq_encoded

# Persist the fitted encoder so the same mapping can be applied to the test set.
with open('freq_encoder.pkl', 'wb') as encoder_file:
    pickle.dump(encoder, encoder_file)

# Fill the remaining missing values, choosing the placeholder by dtype:
#   object   -> the string 'NA'
#   category -> 'NA', registered as a new category first so fillna accepts it
#   others   -> the sentinel -1
for name in train.columns:
    column = train[name]
    kind = column.dtype
    if kind == 'object':
        filled = column.fillna('NA')
    elif kind == 'category':
        filled = column.cat.add_categories('NA').fillna('NA')
    else:
        filled = column.fillna(-1)
    train[name] = filled

# Fit one LabelEncoder per column in columns_for_LE, overwrite the column with
# its integer labels and keep the fitted encoder under the column's name.
label_encoders = {}
for name in columns_for_LE:
    column_encoder = LabelEncoder()
    encoded = column_encoder.fit_transform(train[name])
    train[name] = encoded
    label_encoders[name] = column_encoder

# Persist the fitted encoders (a dict keyed by column name) for later use.
with open('label_encoders.pkl', 'wb') as encoders_file:
    pickle.dump(label_encoders, encoders_file)

# Remove the row with index label 5244810.
# NOTE(review): nothing in this notebook records why this particular row is
# excluded; presumably it holds a bad record. Confirm and document the reason.
train.drop(index=[5244810], inplace=True)

# Cast every float64 or object column to float32. The dtypes are read once up
# front; casting one column does not affect the dtype of any other column.
for name, kind in train.dtypes.items():
    if kind == 'float64' or kind == 'object':
        train[name] = train[name].astype('float32')

# Separate the predictors from the label column.
X = train.drop(columns=['HasDetections'])
y = train['HasDetections']

# Hold out 20% of the rows, stratified on the label, with a fixed seed so the
# split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Standardise the features. The scaler is fitted on the training split only
# and then applied to both splits.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Persist the fitted scaler for later use.
with open('scaler.pkl', 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)


In [20]:
# Rebuild labelled DataFrames from the scaled arrays: restore the feature
# column names taken from X, then append the matching label values as the
# last column.
train_data = pd.DataFrame(X_train, columns=X.columns).assign(HasDetections=y_train.values)

test_data = pd.DataFrame(X_test, columns=X.columns).assign(HasDetections=y_test.values)



In [21]:
import pandas as pd
import numpy as np

def optimize_dataframe(df):
    """Downcast the numeric columns of ``df`` to the smallest dtype that fits.

    Only plain NumPy integer and float columns are touched. Every other column
    (object, bool, category and other pandas extension dtypes) is left as is.

    * Columns whose values are all whole numbers, with nothing missing and
      nothing infinite, are downcast with ``downcast="integer"``.
    * All remaining numeric columns, including whole-valued float columns that
      contain NaN and therefore cannot become integers, are downcast with
      ``downcast="float"``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to optimise. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, returned for convenience.
    """
    for col in df.columns:
        col_data = df[col]
        dtype = col_data.dtype

        # Skip anything that is not a NumPy int/uint/float column. The
        # isinstance check matters: extension dtypes such as category are not
        # np.dtype instances and make np.issubdtype raise a TypeError.
        if not isinstance(dtype, np.dtype) or dtype.kind not in 'iuf':
            continue

        present = col_data.dropna().to_numpy()
        has_missing = len(present) != len(col_data)

        if dtype.kind == 'f':
            # Check finiteness first: casting +/-inf to int raises, and the
            # short-circuit keeps np.mod away from infinities.
            is_whole = bool(np.isfinite(present).all() and (np.mod(present, 1) == 0).all())
        else:
            is_whole = True

        if is_whole and not has_missing:
            df[col] = pd.to_numeric(col_data, downcast="integer")
        else:
            df[col] = pd.to_numeric(col_data, downcast="float")

    return df

# Downcast both splits.
train_data = optimize_dataframe(train_data)
test_data = optimize_dataframe(test_data)

# Report the memory footprint after optimization, train split first and test
# split second. Both lines use the same label, so the order is what tells
# them apart.
for optimized in (train_data, test_data):
    footprint_mb = optimized.memory_usage(deep=True).sum() / 1024**2
    print(f"Memory usage after optimization: {footprint_mb:.2f} MB")

Memory usage after optimization: 2212.13 MB
Memory usage after optimization: 553.03 MB


In [22]:
# Show the distinct AppVersion values after encoding and scaling.
pd.unique(train_data['AppVersion'])

array([-0.9473356 , -1.1537133 ,  0.85491246, -1.2061796 , -1.219421  ,
       -1.2185994 , -1.2991827 , -1.2826723 , -1.1904517 , -1.2724934 ,
       -1.2789816 , -1.209765  , -1.2579718 , -1.2231303 , -1.2553535 ,
       -1.3005191 , -1.2992727 , -1.1968336 , -1.2857517 , -1.3045269 ,
       -1.286419  , -1.2955309 , -1.2664205 , -1.2995912 , -1.2945656 ,
       -1.275853  , -1.2863648 , -1.2991453 , -1.2963673 , -1.304387  ,
       -1.2950518 , -1.3019426 , -1.3025709 , -1.304774  , -1.3030912 ,
       -1.30188   , -1.2993807 , -1.302995  , -1.3047479 , -1.3041112 ,
       -1.3045394 , -1.3024944 , -1.3027567 , -1.3029454 , -1.3046706 ,
       -1.3042752 , -1.3009213 , -1.3029442 , -1.3049312 , -1.3043386 ,
       -1.3032115 , -1.304613  , -1.3047992 , -1.3043782 , -1.3042907 ,
       -1.3046113 , -1.3046887 , -1.304587  , -1.304921  , -1.3041633 ,
       -1.3047395 , -1.3048967 , -1.304916  , -1.3042125 , -1.304835  ,
       -1.3045189 , -1.3048564 , -1.304489  , -1.304953  , -1.30

In [23]:
# Write both processed splits to CSV, without the index column.
for frame, filename in (
    (train_data, 'train_data_scaled_encoded_shrunk.csv'),
    (test_data, 'test_data_scaled_encoded_shrunk.csv'),
):
    frame.to_csv(filename, index=False)

In [29]:
# Print the final schema as a plain {column_name: dtype_name} dictionary.
print({column: dtype.name for column, dtype in train_data.dtypes.items()})

{'ProductName': 'float32', 'EngineVersion': 'float32', 'AppVersion': 'float32', 'AvSigVersion': 'float32', 'IsBeta': 'float32', 'RtpStateBitfield': 'float32', 'IsSxsPassiveMode': 'float32', 'DefaultBrowsersIdentifier': 'float32', 'AVProductStatesIdentifier': 'float32', 'AVProductsInstalled': 'float32', 'AVProductsEnabled': 'float32', 'HasTpm': 'float32', 'CountryIdentifier': 'float32', 'CityIdentifier': 'float32', 'OrganizationIdentifier': 'float32', 'GeoNameIdentifier': 'float32', 'LocaleEnglishNameIdentifier': 'float32', 'Platform': 'float32', 'Processor': 'float32', 'OsVer': 'float32', 'OsBuild': 'float32', 'OsSuite': 'float32', 'OsPlatformSubRelease': 'float32', 'OsBuildLab': 'float32', 'SkuEdition': 'float32', 'IsProtected': 'float32', 'AutoSampleOptIn': 'float32', 'PuaMode': 'float32', 'SMode': 'float32', 'IeVerIdentifier': 'float32', 'SmartScreen': 'float32', 'Firewall': 'float32', 'UacLuaenable': 'float32', 'Census_MDC2FormFactor': 'float32', 'Census_DeviceFamily': 'float32', '