In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# 1. Load Data

In [None]:
# Column -> dtype map passed to pd.read_csv to keep the frames small in memory.
# Each column is listed exactly once; a repeated key in a dict literal silently
# overrides the earlier entry.
dtypes = {
        'MachineIdentifier':                                    'category',
        'ProductName':                                          'category',
        'EngineVersion':                                        'category',
        'AppVersion':                                           'category',
        'AvSigVersion':                                         'category',
        'IsBeta':                                               'int8',
        'RtpStateBitfield':                                     'float16',
        'IsSxsPassiveMode':                                     'int8',
        'DefaultBrowsersIdentifier':                            'float32',
        'AVProductStatesIdentifier':                            'float32',
        'AVProductsInstalled':                                  'float16',
        'AVProductsEnabled':                                    'float16',
        'HasTpm':                                               'int8',
        'CountryIdentifier':                                    'int16',
        'CityIdentifier':                                       'float32',
        'OrganizationIdentifier':                               'float16',
        'GeoNameIdentifier':                                    'float16',
        'LocaleEnglishNameIdentifier':                          'int16',
        'Platform':                                             'category',
        'Processor':                                            'category',
        'OsVer':                                                'category',
        'OsBuild':                                              'int16',
        'OsSuite':                                              'int16',
        'OsPlatformSubRelease':                                 'category',
        'OsBuildLab':                                           'category',
        'SkuEdition':                                           'category',
        'IsProtected':                                          'float16',
        'AutoSampleOptIn':                                      'int8',
        'PuaMode':                                              'category',
        'SMode':                                                'float16',
        'IeVerIdentifier':                                      'float16',
        'SmartScreen':                                          'category',
        'Firewall':                                             'float16',
        'UacLuaenable':                                         'float64', # was 'float32'
        'Census_MDC2FormFactor':                                'category',
        'Census_DeviceFamily':                                  'category',
        'Census_OEMNameIdentifier':                             'float32', # was 'float16'
        'Census_OEMModelIdentifier':                            'float32',
        'Census_ProcessorCoreCount':                            'float16',
        'Census_ProcessorManufacturerIdentifier':               'float16',
        'Census_ProcessorModelIdentifier':                      'float32', # was 'float16'
        'Census_ProcessorClass':                                'category',
        'Census_PrimaryDiskTotalCapacity':                      'float64', # was 'float32'
        'Census_PrimaryDiskTypeName':                           'category',
        'Census_SystemVolumeTotalCapacity':                     'float64', # was 'float32'
        'Census_HasOpticalDiskDrive':                           'int8',
        'Census_TotalPhysicalRAM':                              'float32',
        'Census_ChassisTypeName':                               'category',
        'Census_InternalPrimaryDiagonalDisplaySizeInInches':    'float32', # was 'float16'
        'Census_InternalPrimaryDisplayResolutionHorizontal':    'float32', # was 'float16'
        'Census_InternalPrimaryDisplayResolutionVertical':      'float32', # was 'float16'
        'Census_PowerPlatformRoleName':                         'category',
        'Census_InternalBatteryType':                           'category',
        'Census_InternalBatteryNumberOfCharges':                'float64', # was 'float32'
        'Census_OSVersion':                                     'category',
        'Census_OSArchitecture':                                'category',
        'Census_OSBranch':                                      'category',
        'Census_OSBuildNumber':                                 'int16',
        'Census_OSBuildRevision':                               'int32',
        'Census_OSEdition':                                     'category',
        'Census_OSSkuName':                                     'category',
        'Census_OSInstallTypeName':                             'category',
        'Census_OSInstallLanguageIdentifier':                   'float16',
        'Census_OSUILocaleIdentifier':                          'int16',
        'Census_OSWUAutoUpdateOptionsName':                     'category',
        'Census_IsPortableOperatingSystem':                     'int8',
        'Census_GenuineStateName':                              'category',
        'Census_ActivationChannel':                             'category',
        'Census_IsFlightingInternal':                           'float16',
        'Census_IsFlightsDisabled':                             'float16',
        'Census_FlightRing':                                    'category',
        'Census_ThresholdOptIn':                                'float16',
        'Census_FirmwareManufacturerIdentifier':                'float16',
        'Census_FirmwareVersionIdentifier':                     'float32',
        'Census_IsSecureBootEnabled':                           'int8',
        'Census_IsWIMBootEnabled':                              'float16',
        'Census_IsVirtualDevice':                               'float16',
        'Census_IsTouchEnabled':                                'int8',
        'Census_IsPenCapable':                                  'int8',
        'Census_IsAlwaysOnAlwaysConnectedCapable':              'float16',
        'Wdft_IsGamer':                                         'float16',
        'Wdft_RegionIdentifier':                                'float16',
        'HasDetections':                                        'int8'
        }
train = pd.read_csv('train.csv', dtype=dtypes)
train.shape

In [None]:
test = pd.read_csv('test.csv',dtype=dtypes)
test.shape

In [None]:
# Keep independent snapshots of the raw frames. `df[:]` only builds a new
# DataFrame object over the same underlying data, so it is not a safe backup
# once the originals are edited in place. Note: this doubles memory use.
train_copy = train.copy()
test_copy = test.copy()

In [None]:
droppable_features = []

# 2. Feature Engineering

## 2.1 mostly-missing Columns

In [None]:
(train.isnull().sum()/train.shape[0]).sort_values(ascending=False)

In [None]:
(test.isnull().sum()/test.shape[0]).sort_values(ascending=False)

* There are 2 columns which have more than 99% of missing values and they are useless.

In [None]:
test.shape

In [None]:
droppable_features.append('PuaMode')
droppable_features.append('Census_ProcessorClass')

## 2.2 Too skewed columns

In [None]:
pd.options.display.float_format = '{:,.4f}'.format

# For every column: its number of distinct values and the share (in %) taken
# by its single most frequent value among the non-missing entries.
skew_rows = []
for column_name in train.columns:
    top_share = train[column_name].value_counts(normalize=True).values[0]
    skew_rows.append({
        'column': column_name,
        'uniq': train[column_name].nunique(),
        'skewness': top_share * 100,
    })

sk_df = pd.DataFrame(skew_rows).sort_values('skewness', ascending=False)
sk_df

* There are 12 categorical columns whose majority category covers more than 99% of occurrences, and they are useless, too.

In [None]:
droppable_features.extend(sk_df[sk_df.skewness > 99].column.tolist())
droppable_features

In [None]:
# 'PuaMode' can end up in the list twice (added by hand and by the skewness
# filter). De-duplicate while keeping first-seen order; unlike remove(), this
# gives the same list no matter how often the cell is run.
droppable_features = list(dict.fromkeys(droppable_features))

# Drop these columns.
train = train.drop(columns=droppable_features)


In [None]:
test.drop(droppable_features,axis = 1, inplace=True)

In [None]:
train.shape

In [None]:
test.shape

### Fill missing values for columns that have more than 10% of missing values

In [None]:
# Nan Values
null_counts = train.isnull().sum()
null_counts = null_counts / train.shape[0]
null_counts[null_counts > 0.1]

### The 4 columns above need their missing values filled.

In [None]:
train.DefaultBrowsersIdentifier.value_counts().head(5) 

Replace missing values with 0.

In [None]:
# Assign the filled column back: an in-place fillna on a column accessed from
# the frame is not guaranteed to update the frame (pandas Copy-on-Write).
train['DefaultBrowsersIdentifier'] = train['DefaultBrowsersIdentifier'].fillna(0)

In [None]:
# Assign the filled column back: an in-place fillna on a column accessed from
# the frame is not guaranteed to update the frame (pandas Copy-on-Write).
test['DefaultBrowsersIdentifier'] = test['DefaultBrowsersIdentifier'].fillna(0)

In [None]:
train.SmartScreen.value_counts()

In [None]:
# Map the inconsistent SmartScreen spellings onto one canonical label each,
# and give missing entries their own 'NoExist' label.
trans_dict = {
    'off': 'Off',
    '&#x02;': '2',
    '&#x01;': '1',
    'on': 'On',
    'requireadmin': 'RequireAdmin',
    'OFF': 'Off',
    'Promt': 'Prompt',
    'requireAdmin': 'RequireAdmin',
    'prompt': 'Prompt',
    'warn': 'Warn',
    '00000000': '0',
    '&#x03;': '3',
    np.nan: 'NoExist',
}
train.replace(to_replace={'SmartScreen': trans_dict}, inplace=True)

In [None]:
test.replace({'SmartScreen': trans_dict}, inplace=True)

In [None]:
test.SmartScreen.isnull().sum()

In [None]:
train.shape

In [None]:
train.OrganizationIdentifier.value_counts()

### This column has ID numbers and I think 0 can represent unknown/NA values.

In [None]:
train.replace({'OrganizationIdentifier': {np.nan: 0}}, inplace=True)

In [None]:
test.replace({'OrganizationIdentifier': {np.nan: 0}}, inplace=True)

In [None]:
pd.options.display.max_rows = 99
train.Census_InternalBatteryType.value_counts()

### Census_InternalBatteryType has 75+% of missing values as well as "˙˙˙" and "unkn" values which seem to mean "unknown". So replace these values with "unknown".

In [None]:
# Collapse the placeholder spellings and missing entries into one 'unknown' label.
trans_dict = dict.fromkeys(['˙˙˙', 'unkn', np.nan], 'unknown')
train.replace(to_replace={'Census_InternalBatteryType': trans_dict}, inplace=True)

In [None]:
# Same battery-type clean-up as for train, applied to the test frame.
trans_dict = dict.fromkeys(['˙˙˙', 'unkn', np.nan], 'unknown')
test.replace(to_replace={'Census_InternalBatteryType': trans_dict}, inplace=True)

In [None]:
test.shape

### Fill the remaining missing values in train and test.

In [None]:
# Checkpoint the cleaned frames before the remaining NaNs are filled.
# `.copy()` gives an independent snapshot; `df[:]` does not.
train_c = train.copy()
test_c = test.copy()

In [None]:
# Instead of dropping incomplete rows (train.dropna), fill every
# non-categorical column with the sentinel 9999. Categorical columns are
# left untouched here.
train_fill = {name: 9999 for name in train.select_dtypes(exclude='category').columns}
test_fill = {name: 9999 for name in test.select_dtypes(exclude='category').columns}

train = train.fillna(value=train_fill)
test = test.fillna(value=test_fill)

In [None]:
lst3 = test.columns[test.isna().any()].tolist()
lst2 = train.columns[train.isna().any()].tolist()
print(len(lst3),len(lst2))

In [None]:
# Give every categorical train column that still has NaNs an explicit "D"
# level and fill the gaps with it, then list any columns that still have NaNs.
train_category_cols = list(train.select_dtypes(include='category').columns)
for j in train_category_cols:
    if j not in lst2:
        continue
    with_placeholder = train[j].cat.add_categories("D")
    train[j] = with_placeholder.fillna("D")

lst4 = train.columns[train.isna().any()].tolist()
len(lst4)

In [None]:
# Give every categorical test column that still has NaNs an explicit "D"
# level and fill the gaps with it, then list any columns that still have NaNs.
test_category_cols = list(test.select_dtypes(include='category').columns)
for j in test_category_cols:
    if j not in lst3:
        continue
    with_placeholder = test[j].cat.add_categories("D")
    test[j] = with_placeholder.fillna("D")

lst1 = test.columns[test.isna().any()].tolist()
len(lst1)

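An earlier version dropped every row with a missing value, which removed nearly 14% of the training data and could not be applied to the test dataset. Missing values are now filled instead: 9999 for non-categorical columns and a new category "D" for categorical ones.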

MachineIdentifier is not useful for prediction of malware detection.

In [None]:
train.drop('MachineIdentifier', axis=1, inplace=True)

In [None]:
test.drop('MachineIdentifier', axis=1, inplace=True)

### Label Encoding for category columns

In [None]:
from sklearn.preprocessing import LabelEncoder

# Encode train and test with ONE label -> integer mapping per column.
# Fitting a separate encoder on each frame would give the same label different
# codes in train and test, so a model trained on one could not read the other.

# Columns that may have lost their category dtype during the clean-up above.
recast_cols = ['SmartScreen', 'Census_InternalBatteryType', 'Census_FlightRing']
for frame in (train, test):
    for col in recast_cols:
        if col in frame.columns:
            frame[col] = frame[col].astype('category')

train_cat_cols = train.select_dtypes(include='category').columns.tolist()
test_cat_cols = test.select_dtypes(include='category').columns.tolist()
# Union of both lists, first-seen order, no duplicates.
cate_cols = list(dict.fromkeys(train_cat_cols + test_cat_cols))

le = LabelEncoder()
for col in cate_cols:
    frames = [frame for frame in (train, test) if col in frame.columns]
    labels = [frame[col].astype(object) for frame in frames]
    # Fit on the distinct labels of both frames so no test label is unseen.
    le.fit(np.concatenate([series.unique() for series in labels]))
    for frame, series in zip(frames, labels):
        frame[col] = le.transform(series)

In [None]:
# test.fillna(0)

Reduce the memory by codes from https://www.kaggle.com/timon88/load-whole-data-without-any-dtypes

In [None]:
def reduce_mem_usage(df):
    """Downcast the columns of ``df`` to the smallest dtype that holds them exactly.

    The frame is modified in place and also returned.

    * Signed / unsigned integer columns are cast to the narrowest integer type
      of the same signedness whose range (bounds included) covers the column.
    * Float columns are cast to float16 or float32 only when the cast is
      lossless, i.e. every value survives a round trip unchanged. This keeps
      identifier-like floats from being rounded.
    * ``object`` columns are converted to ``category``.
    * Any other dtype (category, bool, datetime, extension dtypes) is left as is.

    Prints the memory usage before and after, and the relative saving.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    signed_types = (np.int8, np.int16, np.int32, np.int64)
    unsigned_types = (np.uint8, np.uint16, np.uint32, np.uint64)
    float_types = (np.float16, np.float32)

    for col in df.columns:
        col_type = df[col].dtype

        # Extension dtypes (category, nullable ints, ...) are not NumPy dtypes
        # and have no meaningful min/max-based downcast; skip them.
        if not isinstance(col_type, np.dtype):
            continue

        if col_type.kind == 'O':
            df[col] = df[col].astype('category')
            continue

        if col_type.kind not in 'iuf' or df[col].empty:
            continue

        if col_type.kind in 'iu':
            c_min = df[col].min()
            c_max = df[col].max()
            candidates = signed_types if col_type.kind == 'i' else unsigned_types
            for candidate in candidates:
                bounds = np.iinfo(candidate)
                if bounds.min <= c_min and c_max <= bounds.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            values = df[col].to_numpy()
            missing = np.isnan(values)
            for candidate in float_types:
                if np.dtype(candidate).itemsize >= col_type.itemsize:
                    break  # nothing narrower left to try
                # Out-of-range values overflow to inf and then fail the
                # round-trip comparison, so no separate range check is needed.
                with np.errstate(over='ignore'):
                    narrowed = values.astype(candidate)
                if ((narrowed.astype(values.dtype) == values) | missing).all():
                    df[col] = df[col].astype(candidate)
                    break

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    saved_pct = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('Decreased by {:.1f}%'.format(saved_pct))

    return df

%time
train = reduce_mem_usage(train)

In [None]:
test = reduce_mem_usage(test)

## 2.3 Highly correlated features.

As there are still too many features, it is bad to calculate and look at all the correlations at once. So, I grouped them by 10 columns and considered their correlations, and finally calculated all the correlation of remaining features.

In [None]:
cols = train.columns.tolist()

In [None]:
import seaborn as sns

# Pairwise correlations of the first ten columns plus the target.
co_cols = cols[:10] + ['HasDetections']
fig, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(train[co_cols].corr(), cmap='RdBu_r', annot=True, center=0.0, ax=ax)
ax.set_title('Correlation between 1 ~ 10th columns')
plt.show()

There are no columns with a correlation of 0.99 or higher.

In [None]:
corr_remove = []

In [None]:
# Pairwise correlations of columns 11-20 plus the target.
co_cols = cols[10:20] + ['HasDetections']
fig, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(train[co_cols].corr(), cmap='RdBu_r', annot=True, center=0.0, ax=ax)
ax.set_title('Correlation between 11 ~ 20th columns')
plt.show()

Compare and choose the feature which has less unique values.

In [None]:
print(train.Platform.nunique())
print(train.OsVer.nunique())

* `Platform` vs `OsVer` : remove **`Platform`**

In [None]:
corr_remove.append('Platform')

In [None]:
# Pairwise correlations of columns 21-30 plus the target.
co_cols = cols[20:30] + ['HasDetections']
fig, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(train[co_cols].corr(), cmap='RdBu_r', annot=True, center=0.0, ax=ax)
ax.set_title('Correlation between 21 ~ 30th columns')
plt.show()

No features whose correlation is 0.99+.

In [None]:
# Pairwise correlations of columns 31-40 plus the target.
co_cols = cols[30:40] + ['HasDetections']
fig, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(train[co_cols].corr(), cmap='RdBu_r', annot=True, center=0.0, ax=ax)
ax.set_title('Correlation between 31 ~ 40th columns')
plt.show()

Nothing.

In [None]:
# Pairwise correlations of columns 41-50 plus the target.
co_cols = cols[40:50] + ['HasDetections']
fig, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(train[co_cols].corr(), cmap='RdBu_r', annot=True, center=0.0, ax=ax)
ax.set_title('Correlation between 41 ~ 50th columns')
plt.show()

Nothing.

In [None]:
# Pairwise correlations of columns 51-60 plus the target.
co_cols = cols[50:60] + ['HasDetections']
fig, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(train[co_cols].corr(), cmap='RdBu_r', annot=True, center=0, ax=ax)
ax.set_title('Correlation between 51 ~ 60th columns')
plt.show()

In [None]:
print(train.Census_OSEdition.nunique())
print(train.Census_OSSkuName.nunique(), '\n')
print(train.Census_OSInstallLanguageIdentifier.nunique())
print(train.Census_OSUILocaleIdentifier.nunique())


* `Census_OSEdition` vs `Census_OSSkuName`:  remove **`Census_OSSkuName`**
* `Census_OSInstallLanguageIdentifier` vs `Census_OSUILocaleIdentifier`: remove **`Census_OSInstallLanguageIdentifier`**

In [None]:
corr_remove.append('Census_OSSkuName')
corr_remove.append('Census_OSInstallLanguageIdentifier')

In [None]:
# Pairwise correlations of everything from column 61 onwards; the target is
# not appended separately for this last slice.
co_cols = cols[60:]
fig, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(train[co_cols].corr(), cmap='RdBu_r', annot=True, center=0, ax=ax)
ax.set_title('Correlation between from 61th to the last columns')
plt.show()

Nothing here.

In [None]:
corr_remove

Now we have got 3 columns to remove from correlations of 10-group features.

In [None]:
train.drop(corr_remove, axis=1, inplace=True)
test.drop(corr_remove, axis=1, inplace=True)

Now, find cross-group correlated features.

In [None]:
corr = train.corr()
# Flag strong linear relationships in either direction: a correlation close to
# -1 makes a feature just as redundant as one close to +1.
high_corr = (corr.abs() >= 0.99).astype('uint8')
plt.figure(figsize=(15,15))
sns.heatmap(high_corr, cmap='RdBu_r', annot=True, center=0.0)
plt.show()


In [None]:
print(train.Census_OSArchitecture.nunique())
print(train.Processor.nunique())

`Census_OSArchitecture` and `Processor` have the same number of unique values. Then which one? Let's compare their correlation with `HasDetections`.

In [None]:
train[['Census_OSArchitecture', 'Processor', 'HasDetections']].corr()

They seem to be exactly the same, so either one can be removed.

* `Census_OSArchitecture` vs `Processor`: remove **`Processor`**

In [None]:
# NOTE(review): this only records the name. The cell that drops the
# `corr_remove` columns from train/test ran before this append, and no later
# cell drops it, so 'Processor' stays in both frames.
corr_remove.append('Processor')

In [None]:
droppable_features.extend(corr_remove)
print(len(droppable_features))
droppable_features

In [None]:
train.shape

In [None]:
train.head()

In [None]:
y_train = train['HasDetections']
# drop() returns a new frame; assign it so the target really leaves the features.
train = train.drop(columns=['HasDetections'])

In [None]:
# Keep only the features present in both frames, in the same column order, so
# that the scaler and models (which work by position) see identical layouts.
shared_cols = [name for name in train.columns if name in test.columns]
train = train[shared_cols]
test = test[shared_cols]
train.shape

In [None]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
# Fit the scaling on train only, then apply the SAME scaling to test; a model
# trained on scaled features cannot score unscaled rows meaningfully.
train = sc.fit_transform(train)
test = sc.transform(test)

In [None]:
print(train.shape,test.shape)

In [None]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(n_estimators=100, max_depth=9,random_state=0)
clf.fit(train,y_train)

In [None]:
y_predict = clf.predict(test)

In [None]:
# print(y_predict)

In [None]:
# from sklearn.metrics import mean_squared_error
# import math
# math.sqrt(mean_squared_error(Y_test, y_predict))

In [None]:
print(len(y_predict))

In [None]:
# `Y_test` is only created further down (train_test_split), so referencing it
# here fails on a fresh run. `y_predict` holds one prediction per row of
# `test`, so compare its length against the number of test rows instead.
print(len(test))

In [None]:
# from sklearn.metrics import accuracy_score
# accuracy_score(np.array(y_predict),np.array(Y_test),normalize=False)

In [None]:
# `train` is a NumPy array once it has been scaled and has no `.columns`, so
# the name-based comparison cannot run here. Check the feature count instead.
if test.shape[1] != train.shape[1]:
    raise ValueError(
        'test has {} features but train has {}'.format(test.shape[1], train.shape[1])
    )
test.shape

In [None]:
y_predict_test = clf.predict(test)

In [None]:
len(y_predict_test)

In [None]:
submission = pd.read_csv('sample_submission.csv')

In [None]:
submission_array = np.array(submission)

In [None]:
print(submission.head())

In [None]:
submission['HasDetections'] = y_predict

In [None]:
submission.head()

In [None]:
submission.to_csv("submission_file.csv",index=False)

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(train, y_train, test_size=0.20, random_state=42)

In [None]:
import lightgbm as lgb
def run_lgb(train_X, train_y, val_X, val_y):
    """Train a LightGBM model and predict on the validation set.

    Parameters
    ----------
    train_X, train_y : features and labels used for fitting.
    val_X, val_y : features and labels used as the validation set that drives
        evaluation logging and early stopping.

    Returns
    -------
    (model, pred_val_y) : the trained booster and its predictions for
        ``val_X`` at the best iteration.
    """
    params = {
        "objective" : "regression",
        "metric" : "rmse",
        "num_leaves" : 30,
        "min_child_samples" : 100,
        "learning_rate" : 0.1,
        "bagging_fraction" : 0.7,
        "feature_fraction" : 0.5,
        # The parameter is spelled `bagging_freq`; under any other name
        # LightGBM ignores it and `bagging_fraction` has no effect.
        "bagging_freq" : 5,
        "bagging_seed" : 2018,
        "verbosity" : -1
    }

    lgtrain = lgb.Dataset(train_X, label=train_y)
    lgval = lgb.Dataset(val_X, label=val_y)
    model = lgb.train(params, lgtrain, 1000, valid_sets=[lgval], early_stopping_rounds=1000, verbose_eval=100)

    pred_val_y = model.predict(val_X, num_iteration=model.best_iteration)
    return  model, pred_val_y

import math
from sklearn.metrics import mean_squared_error

# Training the model #
# Fit on the hold-out split, then score the competition test set.
lgb_model, y_predict_lgb = run_lgb(train_X=X_train, train_y=Y_train, val_X=X_test, val_y=Y_test)
pred_test = lgb_model.predict(test)

# Root-mean-squared error on the validation fold, shown as the cell output.
validation_mse = mean_squared_error(Y_test, y_predict_lgb)
math.sqrt(validation_mse)

In [None]:
submission['HasDetections'] = pred_test
submission.to_csv("submission_file_lgb.csv",index=False)

In [None]:
import keras
from keras.models import Sequential
from keras.layers import Dense

In [None]:
classifier = Sequential()

In [None]:
train.shape

In [None]:
# Adding the input layer and the first hidden layer
classifier.add(Dense(output_dim = 45, init = 'uniform', activation = 'relu', input_dim = 65))# Adding the second hidden layer
classifier.add(Dense(output_dim = 36, init = 'uniform', activation = 'relu'))# Adding the output layer
classifier.add(Dense(output_dim = 24, init = 'uniform', activation = 'relu'))# Adding the output layer
classifier.add(Dense(output_dim = 12, init = 'uniform', activation = 'relu'))# Adding the output layer
classifier.add(Dense(output_dim = 6, init = 'uniform', activation = 'relu'))# Adding the output layer
classifier.add(Dense(output_dim = 1, init = 'uniform', activation = 'sigmoid'))

In [None]:
classifier.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])

In [None]:
# `epochs` is the Keras 2+ name of the former `nb_epoch` argument.
classifier.fit(train, y_train, batch_size = 32, epochs = 10)

In [None]:
y_pred_nn = classifier.predict(test)

In [None]:
# from sklearn.metrics import mean_squared_error
# import math
# math.sqrt(mean_squared_error(Y_test, y_pred_nn))

In [None]:
# from sklearn.metrics import accuracy_score
# accuracy_score(np.array(y_pred_nn),np.array(Y_test),normalize=False)

In [None]:
max(y_pred_nn)