In [1]:
import numpy as np
import pandas as pd
import os
import warnings
import gc
import time
import urllib
import matplotlib.pyplot as plt
import seaborn as sns
import sys
import datetime
from tqdm import tqdm
warnings.simplefilter(action='ignore', category=FutureWarning)

# Plotly library
from plotly.offline import init_notebook_mode, iplot
import plotly.graph_objs as go
from plotly import tools
init_notebook_mode(connected=True)
pd.set_option('display.max_columns', 500)

In [2]:
from sklearn.model_selection import KFold
from sklearn import metrics
from sklearn.metrics import mean_squared_error


import lightgbm as lgb
from scipy.stats import norm, skew

In [3]:
dtypes = {
        'MachineIdentifier':                                    'category',
        'ProductName':                                          'category',
        'EngineVersion':                                        'category',
        'AppVersion':                                           'category',
        'AvSigVersion':                                         'category',
        'IsBeta':                                               'int8',
        'RtpStateBitfield':                                     'float16',
        'IsSxsPassiveMode':                                     'int8',
        'DefaultBrowsersIdentifier':                            'float16',
        'AVProductStatesIdentifier':                            'float32',
        'AVProductsInstalled':                                  'float16',
        'AVProductsEnabled':                                    'float16',
        'HasTpm':                                               'int8',
        'CountryIdentifier':                                    'int16',
        'CityIdentifier':                                       'float32',
        'OrganizationIdentifier':                               'float16',
        'GeoNameIdentifier':                                    'float16',
        'LocaleEnglishNameIdentifier':                          'int8',
        'Platform':                                             'category',
        'Processor':                                            'category',
        'OsVer':                                                'category',
        'OsBuild':                                              'int16',
        'OsSuite':                                              'int16',
        'OsPlatformSubRelease':                                 'category',
        'OsBuildLab':                                           'category',
        'SkuEdition':                                           'category',
        'IsProtected':                                          'float16',
        'AutoSampleOptIn':                                      'int8',
        'PuaMode':                                              'category',
        'SMode':                                                'float16',
        'IeVerIdentifier':                                      'float16',
        'SmartScreen':                                          'category',
        'Firewall':                                             'float16',
        'UacLuaenable':                                         'float32',
        'Census_MDC2FormFactor':                                'category',
        'Census_DeviceFamily':                                  'category',
        'Census_OEMNameIdentifier':                             'float16',
        'Census_OEMModelIdentifier':                            'float32',
        'Census_ProcessorCoreCount':                            'float16',
        'Census_ProcessorManufacturerIdentifier':               'float16',
        'Census_ProcessorModelIdentifier':                      'float16',
        'Census_ProcessorClass':                                'category',
        'Census_PrimaryDiskTotalCapacity':                      'float32',
        'Census_PrimaryDiskTypeName':                           'category',
        'Census_SystemVolumeTotalCapacity':                     'float32',
        'Census_HasOpticalDiskDrive':                           'int8',
        'Census_TotalPhysicalRAM':                              'float32',
        'Census_ChassisTypeName':                               'category',
        'Census_InternalPrimaryDiagonalDisplaySizeInInches':    'float16',
        'Census_InternalPrimaryDisplayResolutionHorizontal':    'float16',
        'Census_InternalPrimaryDisplayResolutionVertical':      'float16',
        'Census_PowerPlatformRoleName':                         'category',
        'Census_InternalBatteryType':                           'category',
        'Census_InternalBatteryNumberOfCharges':                'float32',
        'Census_OSVersion':                                     'category',
        'Census_OSArchitecture':                                'category',
        'Census_OSBranch':                                      'category',
        'Census_OSBuildNumber':                                 'int16',
        'Census_OSBuildRevision':                               'int32',
        'Census_OSEdition':                                     'category',
        'Census_OSSkuName':                                     'category',
        'Census_OSInstallTypeName':                             'category',
        'Census_OSInstallLanguageIdentifier':                   'float16',
        'Census_OSUILocaleIdentifier':                          'int16',
        'Census_OSWUAutoUpdateOptionsName':                     'category',
        'Census_IsPortableOperatingSystem':                     'int8',
        'Census_GenuineStateName':                              'category',
        'Census_ActivationChannel':                             'category',
        'Census_IsFlightingInternal':                           'float16',
        'Census_IsFlightsDisabled':                             'float16',
        'Census_FlightRing':                                    'category',
        'Census_ThresholdOptIn':                                'float16',
        'Census_FirmwareManufacturerIdentifier':                'float16',
        'Census_FirmwareVersionIdentifier':                     'float32',
        'Census_IsSecureBootEnabled':                           'int8',
        'Census_IsWIMBootEnabled':                              'float16',
        'Census_IsVirtualDevice':                               'float16',
        'Census_IsTouchEnabled':                                'int8',
        'Census_IsPenCapable':                                  'int8',
        'Census_IsAlwaysOnAlwaysConnectedCapable':              'float16',
        'Wdft_IsGamer':                                         'float16',
        'Wdft_RegionIdentifier':                                'float16',
        'HasDetections':                                        'int8'
        }

In [4]:
numerics = ['int8', 'int16', 'int32', 'int64', 'float16', 'float32', 'float64']
numerical_columns = [c for c,v in dtypes.items() if v in numerics]
categorical_columns = [c for c,v in dtypes.items() if v not in numerics]

In [5]:
# nrows=100000
retained_columns = numerical_columns + categorical_columns
train = pd.read_csv(r'D:\kaggle_microsoft\train.csv',
                    usecols = retained_columns,
#                     nrows=nrows,
                    dtype = dtypes)

retained_columns += ['MachineIdentifier']
retained_columns.remove('HasDetections')

test = pd.read_csv(r'D:\kaggle_microsoft\test.csv',
                   usecols = retained_columns,
                   dtype = dtypes)

In [6]:
def appversion(x):
    x = x.split(".")
    ret = ""
    l = [1, 2, 5, 5]
    for i, pad in enumerate(l):
        if i < 2:
            ret += x[i].rjust(l[i], '0')
        else:
            ret += x[i].ljust(l[i], '0')

    return ret

def avsigversion(x):
    x = x.split(".")
    ret = ""
    l = [1, 3, 4, 1]
    for i, pad in enumerate(l):
        ret += x[i].ljust(l[i], '0')
    return ret


train['EngineVersion'] = train['EngineVersion'].map(lambda x: "".join(x.split(".")[:])).astype('int64')
train['AppVersion'] = train['AppVersion'].map(appversion).astype('int64')
# try:
#     train["AvSigVersion"] = train['AvSigVersion'].map(avsigversion).astype('int64')
# except Exception as e:
#     print(e)

test['EngineVersion'] = test['EngineVersion'].map(lambda x: "".join(x.split(".")[:])).astype('int64')
test['AppVersion'] = test['AppVersion'].map(appversion).astype('int64')
# test["AvSigVersion"] = test['AvSigVersion'].map(avsigversion).astype('int64')

In [7]:
train.tail()

Unnamed: 0,MachineIdentifier,ProductName,EngineVersion,AppVersion,AvSigVersion,IsBeta,RtpStateBitfield,IsSxsPassiveMode,DefaultBrowsersIdentifier,AVProductStatesIdentifier,AVProductsInstalled,AVProductsEnabled,HasTpm,CountryIdentifier,CityIdentifier,OrganizationIdentifier,GeoNameIdentifier,LocaleEnglishNameIdentifier,Platform,Processor,OsVer,OsBuild,OsSuite,OsPlatformSubRelease,OsBuildLab,SkuEdition,IsProtected,AutoSampleOptIn,PuaMode,SMode,IeVerIdentifier,SmartScreen,Firewall,UacLuaenable,Census_MDC2FormFactor,Census_DeviceFamily,Census_OEMNameIdentifier,Census_OEMModelIdentifier,Census_ProcessorCoreCount,Census_ProcessorManufacturerIdentifier,Census_ProcessorModelIdentifier,Census_ProcessorClass,Census_PrimaryDiskTotalCapacity,Census_PrimaryDiskTypeName,Census_SystemVolumeTotalCapacity,Census_HasOpticalDiskDrive,Census_TotalPhysicalRAM,Census_ChassisTypeName,Census_InternalPrimaryDiagonalDisplaySizeInInches,Census_InternalPrimaryDisplayResolutionHorizontal,Census_InternalPrimaryDisplayResolutionVertical,Census_PowerPlatformRoleName,Census_InternalBatteryType,Census_InternalBatteryNumberOfCharges,Census_OSVersion,Census_OSArchitecture,Census_OSBranch,Census_OSBuildNumber,Census_OSBuildRevision,Census_OSEdition,Census_OSSkuName,Census_OSInstallTypeName,Census_OSInstallLanguageIdentifier,Census_OSUILocaleIdentifier,Census_OSWUAutoUpdateOptionsName,Census_IsPortableOperatingSystem,Census_GenuineStateName,Census_ActivationChannel,Census_IsFlightingInternal,Census_IsFlightsDisabled,Census_FlightRing,Census_ThresholdOptIn,Census_FirmwareManufacturerIdentifier,Census_FirmwareVersionIdentifier,Census_IsSecureBootEnabled,Census_IsWIMBootEnabled,Census_IsVirtualDevice,Census_IsTouchEnabled,Census_IsPenCapable,Census_IsAlwaysOnAlwaysConnectedCapable,Wdft_IsGamer,Wdft_RegionIdentifier,HasDetections
8921478,ffffedfe8fcc46e6d36ab39953589fee,win8defender,11151001,4181807018075,1.273.1555.0,0,7.0,0,,53447.0,1.0,1.0,1,66,84963.0,,89.0,88,windows10,x64,10.0.0.0,16299,768,rs3,16299.431.amd64fre.rs3_release_svc_escrow.1805...,Home,1.0,0,,0.0,117.0,RequireAdmin,1.0,1.0,Notebook,Windows.Desktop,585.0,189419.0,4.0,1.0,142.0,,953869.0,HDD,936175.0,0,4096.0,Notebook,15.5,1366.0,768.0,Mobile,,0.0,10.0.16299.431,amd64,rs3_release_svc_escrow,16299,431,Core,CORE,UUPUpgrade,14.0,49,UNKNOWN,0,IS_GENUINE,Retail,,0.0,Retail,,556.0,63069.0,1,,0.0,0,0,0.0,0.0,5.0,1
8921479,ffffef606490b2970873ec0a27ebd24b,win8defender,11133030,4091058667200,1.233.4218.0,0,7.0,0,1078.0,59914.0,2.0,1.0,1,66,82414.0,27.0,89.0,88,windows10,x86,10.0.0.0,10586,256,th2,10586.672.x86fre.th2_release_sec.161024-1825,Pro,0.0,0,,,86.0,RequireAdmin,1.0,1.0,Desktop,Windows.Desktop,1443.0,275958.0,2.0,5.0,3366.0,,76293.0,Unspecified,75741.0,0,2048.0,SpaceSaving,19.0,1280.0,1024.0,SOHOServer,,4294967000.0,10.0.10586.679,x86,th2_release_sec,10586,679,Professional,PROFESSIONAL,Update,14.0,49,FullAuto,0,IS_GENUINE,Retail,0.0,0.0,Retail,0.0,,,0,0.0,0.0,0,0,0.0,0.0,5.0,0
8921480,fffff8a40070d2d8379bb7fa2ed2fa66,win8defender,11152001,4181807018075,1.275.1242.0,0,7.0,0,,53447.0,1.0,1.0,1,43,134580.0,27.0,53.0,42,windows10,x64,10.0.0.0,16299,768,rs3,16299.15.amd64fre.rs3_release.170928-1534,Home,1.0,0,,0.0,117.0,,1.0,1.0,Notebook,Windows.Desktop,4904.0,315307.0,8.0,5.0,3104.0,,244198.0,SSD,242989.0,0,8192.0,Notebook,15.5,1920.0,1080.0,Mobile,,0.0,10.0.16299.371,amd64,rs3_release,16299,371,CoreCountrySpecific,CORE_COUNTRYSPECIFIC,UUPUpgrade,37.0,158,UNKNOWN,0,IS_GENUINE,OEM:DM,,0.0,Retail,,513.0,71061.0,1,,0.0,0,0,0.0,0.0,7.0,1
8921481,fffffbbaaf5969ae4b93e7f3f6d7132f,win8defender,11149014,4161765618052,1.269.1834.0,0,7.0,0,,6630.0,3.0,1.0,1,207,159430.0,33.0,277.0,75,windows10,x64,10.0.0.0,15063,768,rs2,15063.0.amd64fre.rs2_release.170317-1834,Home,1.0,0,,0.0,108.0,,1.0,1.0,Notebook,Windows.Desktop,2102.0,228975.0,2.0,5.0,1998.0,,476940.0,HDD,463486.0,0,4096.0,Notebook,12.703125,1024.0,768.0,Mobile,,0.0,10.0.15063.483,amd64,rs2_release,15063,483,Core,CORE,Other,8.0,31,Notify,0,IS_GENUINE,OEM:DM,,0.0,Retail,,554.0,33142.0,1,,0.0,0,0,0.0,0.0,13.0,1
8921482,ffffff75ba4f33d938ccfdb148b8ea16,win8defender,11152001,4181807018075,1.275.1209.0,0,7.0,0,,7945.0,2.0,1.0,1,159,119079.0,18.0,194.0,74,windows10,x64,10.0.0.0,17134,768,rs4,17134.1.amd64fre.rs4_release.180410-1804,Home,1.0,0,,0.0,137.0,ExistsNotSet,1.0,1.0,Notebook,Windows.Desktop,2102.0,242491.0,4.0,5.0,2696.0,,953869.0,HDD,637127.0,0,6144.0,Notebook,15.5,1920.0,1080.0,Mobile,,0.0,10.0.17134.228,amd64,rs4_release,17134,228,Core,CORE,UUPUpgrade,7.0,30,FullAuto,0,IS_GENUINE,OEM:DM,,0.0,Retail,,554.0,33084.0,1,,0.0,0,0,0.0,0.0,11.0,0


In [8]:
true_numerical_columns = [
    'Census_ProcessorCoreCount',
    'Census_PrimaryDiskTotalCapacity',
    'Census_SystemVolumeTotalCapacity',
    'Census_TotalPhysicalRAM',
    'Census_InternalPrimaryDiagonalDisplaySizeInInches',
    'Census_InternalPrimaryDisplayResolutionHorizontal',
    'Census_InternalPrimaryDisplayResolutionVertical',
    'Census_InternalBatteryNumberOfCharges'
    'EngineVersion', 
    'AppVersion'
#     'AvSigVersion'
]

In [9]:
binary_variables = [c for c in train.columns if train[c].nunique() == 2]

In [10]:
categorical_columns = [c for c in train.columns 
                       if (c not in true_numerical_columns) & (c not in binary_variables)]

In [11]:
categorical_columns

['MachineIdentifier',
 'ProductName',
 'EngineVersion',
 'AvSigVersion',
 'RtpStateBitfield',
 'DefaultBrowsersIdentifier',
 'AVProductStatesIdentifier',
 'AVProductsInstalled',
 'AVProductsEnabled',
 'CountryIdentifier',
 'CityIdentifier',
 'OrganizationIdentifier',
 'GeoNameIdentifier',
 'LocaleEnglishNameIdentifier',
 'Platform',
 'Processor',
 'OsVer',
 'OsBuild',
 'OsSuite',
 'OsPlatformSubRelease',
 'OsBuildLab',
 'SkuEdition',
 'IeVerIdentifier',
 'SmartScreen',
 'UacLuaenable',
 'Census_MDC2FormFactor',
 'Census_DeviceFamily',
 'Census_OEMNameIdentifier',
 'Census_OEMModelIdentifier',
 'Census_ProcessorManufacturerIdentifier',
 'Census_ProcessorModelIdentifier',
 'Census_ProcessorClass',
 'Census_PrimaryDiskTypeName',
 'Census_ChassisTypeName',
 'Census_PowerPlatformRoleName',
 'Census_InternalBatteryType',
 'Census_InternalBatteryNumberOfCharges',
 'Census_OSVersion',
 'Census_OSArchitecture',
 'Census_OSBranch',
 'Census_OSBuildNumber',
 'Census_OSBuildRevision',
 'Census_OSE

In [12]:
# variables = {
#     'categorical_columns': len(categorical_columns),
#     'binary_variables': len(binary_variables),
#     'true_numerical_columns': len(true_numerical_columns)
# }
# pie_trace = go.Pie(labels=list(variables.keys()), values=list(variables.values()))
# layout = dict(title= "Variable types", height=400, width=800)
# fig = dict(data=[pie_trace], layout=layout)
# iplot(fig)

In [13]:
cardinality = []
for c in categorical_columns:
    if c == 'MachineIdentifier': continue
    cardinality.append([c, train[c].nunique()])
cardinality.sort(key = lambda x:x[1], reverse=False)

trace = go.Bar(y=[x[0] for x in cardinality],
               x=[x[1] for x in cardinality],
               orientation='h', marker=dict(color='rgb(49,130,189)'), name='train')

layout = go.Layout(
    title='Categorical cardinality', height=1600, width=800,
    xaxis=dict(
        title='Number of categories',
        titlefont=dict(size=16, color='rgb(107, 107, 107)'),
        domain=[0.25, 1]
    ),
    barmode='group',
    bargap=0.1,
    bargroupgap=0.1
)

fig = go.Figure(data=[trace], layout=layout)
# iplot(fig)

In [14]:
cardinality

[['Processor', 3],
 ['Census_DeviceFamily', 3],
 ['Census_ProcessorClass', 3],
 ['Census_OSArchitecture', 3],
 ['Platform', 4],
 ['Census_PrimaryDiskTypeName', 4],
 ['Census_GenuineStateName', 5],
 ['ProductName', 6],
 ['AVProductsEnabled', 6],
 ['Census_OSWUAutoUpdateOptionsName', 6],
 ['Census_ActivationChannel', 6],
 ['RtpStateBitfield', 7],
 ['Census_ProcessorManufacturerIdentifier', 7],
 ['AVProductsInstalled', 8],
 ['SkuEdition', 8],
 ['OsPlatformSubRelease', 9],
 ['Census_OSInstallTypeName', 9],
 ['Census_PowerPlatformRoleName', 10],
 ['Census_FlightRing', 10],
 ['UacLuaenable', 11],
 ['Census_MDC2FormFactor', 13],
 ['OsSuite', 14],
 ['Wdft_RegionIdentifier', 15],
 ['SmartScreen', 21],
 ['Census_OSSkuName', 30],
 ['Census_OSBranch', 32],
 ['Census_OSEdition', 33],
 ['Census_OSInstallLanguageIdentifier', 39],
 ['OrganizationIdentifier', 49],
 ['Census_ChassisTypeName', 52],
 ['OsVer', 58],
 ['EngineVersion', 70],
 ['OsBuild', 76],
 ['Census_InternalBatteryType', 78],
 ['Census_OS

In [15]:
def frequency_encoding(variable):
    t = pd.concat([train[variable], test[variable]]).value_counts().reset_index()
    t = t.reset_index()
    t.loc[t[variable] == 1, 'level_0'] = np.nan
    t.set_index('index', inplace=True)
    max_label = t['level_0'].max() + 1
    t.fillna(max_label, inplace=True)
    return t.to_dict()['level_0']

frequency_encoded_variables = [ x for x, y in cardinality if y > 100 ]

for variable in tqdm(frequency_encoded_variables):
    freq_enc_dict = frequency_encoding(variable)
    train[variable] = train[variable].map(lambda x: freq_enc_dict.get(x, np.nan))
    test[variable] = test[variable].map(lambda x: freq_enc_dict.get(x, np.nan))
    categorical_columns.remove(variable)

100%|██████████████████████████████████████████████████████████████████████████████████| 19/19 [01:41<00:00,  6.40s/it]


In [16]:
indexer = {}
for col in tqdm(categorical_columns):
    if col == 'MachineIdentifier': continue
    _, indexer[col] = pd.factorize(train[col])
    
for col in tqdm(categorical_columns):
    if col == 'MachineIdentifier': continue
    train[col] = indexer[col].get_indexer(train[col])
    test[col] = indexer[col].get_indexer(test[col])

100%|██████████████████████████████████████████████████████████████████████████████████| 35/35 [00:04<00:00,  7.12it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 35/35 [22:16<00:00, 37.55s/it]


In [17]:
def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2    
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)    
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

In [18]:
train = reduce_mem_usage(train)
test = reduce_mem_usage(test)

Mem. usage decreased to 1570.70 Mb (65.3% reduction)
Mem. usage decreased to 1413.46 Mb (64.8% reduction)


In [None]:
target = train['HasDetections']
del train['HasDetections']

In [None]:
param = {'num_leaves': 40,
         'min_data_in_leaf': 40, 
         'objective':'binary',
         'nthread': 6,
         'max_depth': -1,
         'learning_rate': 0.05,
         "boosting": "gbdt",
         "feature_fraction": 0.5,
         "bagging_freq": 1,
         "bagging_fraction": 0.5 ,
         "bagging_seed": 11,
         "metric": 'auc',
         "lambda_l1": 0.1,
         "random_state": 133,
         "verbosity": -1}

In [None]:
max_iter = 10

In [None]:
gc.collect()

201656

In [None]:
folds = KFold(n_splits=10, shuffle=True, random_state=15)
oof = np.zeros(len(train))
categorical_columns = [c for c in categorical_columns if c not in ['MachineIdentifier']]
features = [c for c in train.columns if c not in ['MachineIdentifier']]
predictions = np.zeros(len(test))
start = time.time()
feature_importance_df = pd.DataFrame()
start_time= time.time()
score = [0 for _ in range(folds.n_splits)]

for fold_, (trn_idx, val_idx) in enumerate(folds.split(train.values, target.values)):
    print("fold n°{}".format(fold_))
    trn_data = lgb.Dataset(train.iloc[trn_idx][features],
                           label=target.iloc[trn_idx],
                           categorical_feature = categorical_columns
                          )
    val_data = lgb.Dataset(train.iloc[val_idx][features],
                           label=target.iloc[val_idx],
                           categorical_feature = categorical_columns
                          )

    num_round = 10000
    clf = lgb.train(param,
                    trn_data,
                    num_round,
                    valid_sets = [trn_data, val_data],
                    verbose_eval=100,
                    early_stopping_rounds = 200)
    
    oof[val_idx] = clf.predict(train.iloc[val_idx][features], num_iteration=clf.best_iteration)
    
    fold_importance_df = pd.DataFrame()
    fold_importance_df["feature"] = features
    fold_importance_df["importance"] = clf.feature_importance(importance_type='gain')
    fold_importance_df["fold"] = fold_ + 1
    feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)

    # we perform predictions by chunks
    initial_idx = 0
    chunk_size = 1000000
    current_pred = np.zeros(len(test))
    while initial_idx < test.shape[0]:
        final_idx = min(initial_idx + chunk_size, test.shape[0])
        idx = range(initial_idx, final_idx)
        current_pred[idx] = clf.predict(test.iloc[idx][features], num_iteration=clf.best_iteration)
        initial_idx = final_idx
    predictions += current_pred / min(folds.n_splits, max_iter)
   
    print("time elapsed: {:<5.2}h".format((time.time() - start_time) / 3600))
    score[fold_] = metrics.roc_auc_score(target.iloc[val_idx], oof[val_idx])
    if fold_ == max_iter - 1: break
        
if (folds.n_splits == max_iter):
    print("CV score: {:<8.5f}".format(metrics.roc_auc_score(target, oof)))
else:
     print("CV score: {:<8.5f}".format(sum(score) / max_iter))

fold n°0



Using categorical_feature in Dataset.


categorical_feature in param dict is overridden.



Training until validation scores don't improve for 200 rounds.
[100]	training's auc: 0.710732	valid_1's auc: 0.710806
[200]	training's auc: 0.718768	valid_1's auc: 0.718468
[300]	training's auc: 0.723541	valid_1's auc: 0.722916
[400]	training's auc: 0.726539	valid_1's auc: 0.725689
[500]	training's auc: 0.728643	valid_1's auc: 0.727557
[600]	training's auc: 0.730265	valid_1's auc: 0.728966
[700]	training's auc: 0.731444	valid_1's auc: 0.729912
[800]	training's auc: 0.732604	valid_1's auc: 0.730855
[900]	training's auc: 0.733599	valid_1's auc: 0.731626
[1000]	training's auc: 0.734465	valid_1's auc: 0.73228
[1100]	training's auc: 0.735172	valid_1's auc: 0.732745
[1200]	training's auc: 0.735867	valid_1's auc: 0.733217
[1300]	training's auc: 0.736561	valid_1's auc: 0.73368
[1400]	training's auc: 0.737198	valid_1's auc: 0.734082
[1500]	training's auc: 0.737762	valid_1's auc: 0.734399
[1600]	training's auc: 0.738327	valid_1's auc: 0.734735
[1700]	training's auc: 0.738824	valid_1's auc: 0.735

In [None]:
cols = (feature_importance_df[["feature", "importance"]]
        .groupby("feature")
        .mean()
        .sort_values(by="importance", ascending=False)[:1000].index)

best_features = feature_importance_df.loc[feature_importance_df.feature.isin(cols)]

plt.figure(figsize=(14,25))
sns.barplot(x="importance",
            y="feature",
            data=best_features.sort_values(by="importance",
                                           ascending=False))
plt.title('LightGBM Features (avg over folds)')
plt.tight_layout()
plt.savefig('lgbm_importances.png')

In [None]:
sub_df = pd.DataFrame({"MachineIdentifier": test["MachineIdentifier"].values})
sub_df["HasDetections"] = predictions
sub_df[:10]

In [None]:
sub_df.to_csv(r"C:\Users\user\Desktop\submit5.csv", index=False)

In [None]:
sample.head()