In [13]:
#Generic Imports for the moment

%matplotlib inline
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import patsy
import scipy as sp
import scipy.stats as stats
import seaborn as sns; sns.set()
import sklearn
import sklearn.neighbors
import statsmodels.api as sm
import statsmodels.formula.api as smf
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

#Note: initially, loading the train data alone took approximately 3-4 GB of RAM

In [2]:
# Explicit per-column dtypes handed to read_csv so the frame takes less memory
# (the author measured about 3.6 GB in total with these settings).
# The idea comes from one of the suggestions in the public notebooks.
dtypes = dict(
    MachineIdentifier='category',
    ProductName='category',
    EngineVersion='category',
    AppVersion='category',
    AvSigVersion='category',
    IsBeta='int8',
    RtpStateBitfield='float16',
    IsSxsPassiveMode='int8',
    DefaultBrowsersIdentifier='float32',
    AVProductStatesIdentifier='float32',
    AVProductsInstalled='float16',
    AVProductsEnabled='float16',
    HasTpm='int8',
    CountryIdentifier='int16',
    CityIdentifier='float32',
    OrganizationIdentifier='float16',
    GeoNameIdentifier='float16',
    LocaleEnglishNameIdentifier='int16',
    Platform='category',
    Processor='category',
    OsVer='category',
    OsBuild='int16',
    OsSuite='int16',
    OsPlatformSubRelease='category',
    OsBuildLab='category',
    SkuEdition='category',
    IsProtected='float16',
    AutoSampleOptIn='int8',
    PuaMode='category',
    SMode='float16',
    IeVerIdentifier='float16',
    SmartScreen='category',
    Firewall='float16',
    UacLuaenable='float64',  # widened from float32
    Census_MDC2FormFactor='category',
    Census_DeviceFamily='category',
    Census_OEMNameIdentifier='float32',  # widened from float16
    Census_OEMModelIdentifier='float32',
    Census_ProcessorCoreCount='float16',
    Census_ProcessorManufacturerIdentifier='float16',
    Census_ProcessorModelIdentifier='float32',  # widened from float16
    Census_ProcessorClass='category',
    Census_PrimaryDiskTotalCapacity='float64',  # widened from float32
    Census_PrimaryDiskTypeName='category',
    Census_SystemVolumeTotalCapacity='float64',  # widened from float32
    Census_HasOpticalDiskDrive='int8',
    Census_TotalPhysicalRAM='float32',
    Census_ChassisTypeName='category',
    Census_InternalPrimaryDiagonalDisplaySizeInInches='float32',  # widened from float16
    Census_InternalPrimaryDisplayResolutionHorizontal='float32',  # widened from float16
    Census_InternalPrimaryDisplayResolutionVertical='float32',  # widened from float16
    Census_PowerPlatformRoleName='category',
    Census_InternalBatteryType='category',
    Census_InternalBatteryNumberOfCharges='float64',  # widened from float32
    Census_OSVersion='category',
    Census_OSArchitecture='category',
    Census_OSBranch='category',
    Census_OSBuildNumber='int16',
    Census_OSBuildRevision='int32',
    Census_OSEdition='category',
    Census_OSSkuName='category',
    Census_OSInstallTypeName='category',
    Census_OSInstallLanguageIdentifier='float16',
    Census_OSUILocaleIdentifier='int16',
    Census_OSWUAutoUpdateOptionsName='category',
    Census_IsPortableOperatingSystem='int8',
    Census_GenuineStateName='category',
    Census_ActivationChannel='category',
    Census_IsFlightingInternal='float16',
    Census_IsFlightsDisabled='float16',
    Census_FlightRing='category',
    Census_ThresholdOptIn='float16',
    Census_FirmwareManufacturerIdentifier='float16',
    Census_FirmwareVersionIdentifier='float32',
    Census_IsSecureBootEnabled='int8',
    Census_IsWIMBootEnabled='float16',
    Census_IsVirtualDevice='float16',
    Census_IsTouchEnabled='int8',
    Census_IsPenCapable='int8',
    Census_IsAlwaysOnAlwaysConnectedCapable='float16',
    Wdft_IsGamer='float16',
    Wdft_RegionIdentifier='float16',
    HasDetections='int8',
)

In [3]:
# Read the training CSV, applying the compact per-column dtypes defined above.
train_df = pd.read_csv(filepath_or_buffer='train.csv', dtype=dtypes)
# train_df.info()   (disabled: prints the per-column dtype summary)

# test_df = pd.read_csv("test.csv", dtype=dtypes)   (disabled: test set is not loaded here)

In [None]:
# Preview only the first rows; the full frame is far too large to display usefully.
train_df.head()

# Missing Data

In [None]:
# Number of missing values in each column.
total = train_df.isna().sum()

In [None]:
# Examine which columns are missing significant chunks of data.
# `holder` is the FRACTION (0..1) of rows missing in each column. The divisor
# is taken from the frame itself rather than the previously hard-coded
# 8921483 (presumably the row count of train.csv), so the result stays
# correct if the frame is ever subsampled or reloaded.
# `rows` is kept only as an alias because earlier versions of this cell defined it.
rows = holder = total / len(train_df)

# Show the 25 columns with the largest missing fraction (ascending order).
print(holder.sort_values().tail(25))

At this point we see various columns that are missing a large percentage of their data. The last 9 columns are each missing at least 30% of their data, with 2 columns missing 99% of their data. I could fill in these values, but given the lack of data this seems like a futile effort. The columns that are missing around 30-35% could potentially be manageable, but a significant portion of their data is still missing, so I have decided to remove those as well.

In [None]:
# Copy of the training frame without the nine columns judged too sparse to keep.
train_df_2 = train_df.drop(
    columns=[
        'PuaMode',
        'Census_ProcessorClass',
        'DefaultBrowsersIdentifier',
        'Census_IsFlightingInternal',
        'Census_InternalBatteryType',
        'Census_ThresholdOptIn',
        'Census_IsWIMBootEnabled',
        'SmartScreen',
        'OrganizationIdentifier',
    ]
)

In [None]:
# Preview only the first rows of the reduced frame instead of displaying all of it.
train_df_2.head()

In [None]:
# Baseline logistic regression of HasDetections on a handful of product and
# version columns. The explicit [[0],[1]] contrast asks patsy for a one-column
# coding of the response; `0 +` removes the intercept term from the formula.
y_md, X_md = patsy.dmatrices(
    "C(HasDetections,[[0],[1]]) ~  0 + ProductName + EngineVersion + AppVersion + IsBeta",
    data=train_df,
)

# LogisticRegression and accuracy_score are imported explicitly in the import
# cell, so this cell no longer depends on sklearn submodules happening to be
# loaded as a side effect of other imports.
gen_lr = LogisticRegression()
dm = gen_lr.fit(X_md, y_md.ravel())

# NOTE: predictions are made on the same rows the model was fitted on, so the
# figure printed below is a training accuracy, not an out-of-sample estimate.
y_md_hat = dm.predict(X_md)

# Flatten the response so both arguments have the same 1-D shape.
print('Accuracy is', accuracy_score(y_md.ravel(), y_md_hat))


In [None]:
# Sanity check: the predictions and the response should have the same length.
print(len(y_md_hat), len(y_md), sep='\n')

In [None]:
# Show the fitted estimator's textual representation.
print(f"{dm}")

In [10]:
# Candidate column names for model selection, one per line, listed in the same
# order as the keys of the `dtypes` mapping. Note that the response column
# 'HasDetections' is part of this list too.
allowed_factors = """
    MachineIdentifier
    ProductName
    EngineVersion
    AppVersion
    AvSigVersion
    IsBeta
    RtpStateBitfield
    IsSxsPassiveMode
    DefaultBrowsersIdentifier
    AVProductStatesIdentifier
    AVProductsInstalled
    AVProductsEnabled
    HasTpm
    CountryIdentifier
    CityIdentifier
    OrganizationIdentifier
    GeoNameIdentifier
    LocaleEnglishNameIdentifier
    Platform
    Processor
    OsVer
    OsBuild
    OsSuite
    OsPlatformSubRelease
    OsBuildLab
    SkuEdition
    IsProtected
    AutoSampleOptIn
    PuaMode
    SMode
    IeVerIdentifier
    SmartScreen
    Firewall
    UacLuaenable
    Census_MDC2FormFactor
    Census_DeviceFamily
    Census_OEMNameIdentifier
    Census_OEMModelIdentifier
    Census_ProcessorCoreCount
    Census_ProcessorManufacturerIdentifier
    Census_ProcessorModelIdentifier
    Census_ProcessorClass
    Census_PrimaryDiskTotalCapacity
    Census_PrimaryDiskTypeName
    Census_SystemVolumeTotalCapacity
    Census_HasOpticalDiskDrive
    Census_TotalPhysicalRAM
    Census_ChassisTypeName
    Census_InternalPrimaryDiagonalDisplaySizeInInches
    Census_InternalPrimaryDisplayResolutionHorizontal
    Census_InternalPrimaryDisplayResolutionVertical
    Census_PowerPlatformRoleName
    Census_InternalBatteryType
    Census_InternalBatteryNumberOfCharges
    Census_OSVersion
    Census_OSArchitecture
    Census_OSBranch
    Census_OSBuildNumber
    Census_OSBuildRevision
    Census_OSEdition
    Census_OSSkuName
    Census_OSInstallTypeName
    Census_OSInstallLanguageIdentifier
    Census_OSUILocaleIdentifier
    Census_OSWUAutoUpdateOptionsName
    Census_IsPortableOperatingSystem
    Census_GenuineStateName
    Census_ActivationChannel
    Census_IsFlightingInternal
    Census_IsFlightsDisabled
    Census_FlightRing
    Census_ThresholdOptIn
    Census_FirmwareManufacturerIdentifier
    Census_FirmwareVersionIdentifier
    Census_IsSecureBootEnabled
    Census_IsWIMBootEnabled
    Census_IsVirtualDevice
    Census_IsTouchEnabled
    Census_IsPenCapable
    Census_IsAlwaysOnAlwaysConnectedCapable
    Wdft_IsGamer
    Wdft_RegionIdentifier
    HasDetections
""".split()

In [14]:
def calc_srr(data, k, Y, factors=None):
    """Greedy forward selection of up to five OLS predictors by adjusted R^2.

    Starting from an empty model, each step fits ``Y ~ selected + c`` with
    ``smf.ols`` for every remaining candidate ``c`` and keeps the candidate
    whose adjusted R^2 is strictly greater than the best value seen so far.
    Ties are resolved in favour of the candidate that appears first.

    Parameters
    ----------
    data : object accepted by ``smf.ols(..., data=...)``
        Table holding ``Y`` and the candidate columns.
    k : int
        Number of selection steps requested. Values above 5 behave like 5
        because the return value has exactly five name slots; values below 1
        select nothing.
    Y : str
        Name of the response column.
    factors : sequence of str, optional
        Candidate predictor names. When omitted, the notebook-level
        ``allowed_factors`` list is used.

    Returns
    -------
    tuple
        ``(best_adj_r2, name, name2, name3, name4, name5)``. Slots that were
        not filled (fewer steps requested, or no candidate improved the
        score) hold the empty string; ``best_adj_r2`` is 0 when nothing was
        selected.
    """
    max_steps = 5  # the return tuple has five name slots
    if factors is None:
        factors = allowed_factors

    # Never offer the response as its own predictor: ``Y ~ Y`` fits perfectly
    # and would always be chosen first, making every later step pointless.
    candidates = [factor for factor in factors if factor != Y]

    best_score = 0
    selected = []
    for _ in range(int(min(k, max_steps))):
        step_winner = None
        for candidate in candidates:
            # A predictor already in the model cannot raise the score again.
            if candidate in selected:
                continue
            formula = Y + " ~ " + " + ".join(selected + [candidate])
            score = smf.ols(formula=formula, data=data).fit().rsquared_adj
            if score > best_score:
                best_score = score
                step_winner = candidate
        if step_winner is None:
            # No candidate improved the fit; later steps would refit exactly
            # the same models, so stop here and leave the remaining slots empty.
            break
        selected.append(step_winner)

    padded = selected + [''] * (max_steps - len(selected))
    return (best_score, *padded)

In [None]:
# Run five selection steps with HasDetections as the response.
calc_srr(data=train_df, k=5, Y='HasDetections')