In [83]:
import pandas as pd 
import numpy as np 
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression 
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report


In [2]:
usefulColumns = [
    'NumAntivirusProductsInstalled', # numeric taken
    'ProcessorCoreCount', #numeric taken 
    'Processor', #categorical taken 
    'SKUEditionName', #categorical taken 
    'OSEdition', #  categorical taken 
    'OSBuildNumber', # numeric categorical taken 
    'ChassisType',   #categorical  taken 
    'AppVersion', # numeric categorical taken 
    'IsSystemProtected', #bool 
    'IsPassiveModeEnabled', # bool
    'AntivirusConfigID', # categorical taken 
    'FirewallEnabled',
    'target' #bools
]
usefulColumnsTest = [
    'NumAntivirusProductsInstalled', # numeric taken
    'ProcessorCoreCount', #numeric taken 
    'Processor', #categorical taken 
    'SKUEditionName', #categorical taken 
    'OSEdition', #  categorical taken 
    'OSBuildNumber', # numeric categorical taken 
    'ChassisType',   #categorical  taken 
    'AppVersion', # numeric categorical taken 
    'IsSystemProtected', #bool 
    'IsPassiveModeEnabled', # bool
    'AntivirusConfigID', # categorical taken 
    'FirewallEnabled', #bool
]

In [3]:
usefulColumns

['NumAntivirusProductsInstalled',
 'ProcessorCoreCount',
 'Processor',
 'SKUEditionName',
 'OSEdition',
 'OSBuildNumber',
 'ChassisType',
 'AppVersion',
 'IsSystemProtected',
 'IsPassiveModeEnabled',
 'AntivirusConfigID',
 'FirewallEnabled',
 'target']

In [4]:
df = pd.read_csv('../data/train_data.csv')
df[df['target'] == 1]['target'].shape

(33093,)

In [5]:
df = df[usefulColumns]


df['AntivirusConfigID'] =df['AntivirusConfigID'].astype ('string')
df.columns
df['AntivirusConfigID'].isna().sum()

np.int64(45)

In [6]:
numeric = [
    'NumAntivirusProductsInstalled', # numeric
    'ProcessorCoreCount', #numeric
]



#print(df['NumAntivirusProductsInstalled'].isnull().sum())
#print(df['ProcessorCoreCount'].isnull().sum())



In [7]:
categorical = [
    'OSBuildNumber', #No null values found
    'OSEdition', # No null values (missing values) found
    'ChassisType', # 1 null value Labeled as other 
    'Processor', # No null values found 
    'SKUEditionName', #No null values found
    'AppVersion', ##No null values found
    'AntivirusConfigID'
]
#print(df['OSBuildNumber'].nunique())
# df['OSBuildNumber'].isnull().sum()
# df['OSEdition'].isnull().sum()
#print(df['OSEdition'].nunique())
#print(df['ChassisType'].isnull().sum())
#df['Processor'].isnull().sum()
#df['SKUEditionName'].isnull().sum()
#df['AppVersion'].isnull().sum()
#print(df['AntivirusConfigID'].isnull().sum())


In [8]:

boolean_columns = [
    'IsSystemProtected',
'IsPassiveModeEnabled', # no null values . 
'FirewallEnabled',
]


In [9]:
print(set(boolean_columns) & set(categorical))
print(set(categorical) & set(numeric))
print(set(boolean_columns) & set(numeric))
# Checking for overlapping. 

set()
set()
set()


Grouping the columns with noise manually 

In [10]:
def group_categories_plot(df , column, no_columns):
    """
        Grouping the columns based on top n columns by count
    """
    top_editions = df[column].value_counts().nlargest(no_columns).index
    print("Categories to keep:", top_editions)
    df[column] = np.where(df[column].isin(top_editions) | df[column].isnull() , 
                                         df[column], 
                                         'Other')

In [11]:
print(df['SKUEditionName'].isnull().sum())
group_categories_plot(df, 'SKUEditionName' , 2)
print(df['SKUEditionName'].isnull().sum())

0
Categories to keep: Index(['Home', 'Pro'], dtype='object', name='SKUEditionName')
0


In [12]:
print(df['OSEdition'].isnull().sum())
group_categories_plot(df, 'OSEdition' , 3)
print(df['OSEdition'].isnull().sum())

0
Categories to keep: Index(['Core', 'Professional', 'CoreSingleLanguage'], dtype='object', name='OSEdition')
0


In [13]:

print(df['OSBuildNumber'].isnull().sum())
group_categories_plot(df, 'OSBuildNumber' , 5)
print(df['OSBuildNumber'].isnull().sum())

0
Categories to keep: Index([17134, 16299, 15063, 14393, 10586], dtype='int64', name='OSBuildNumber')
0


In [14]:
print(df['ChassisType'].isnull().sum())
group_categories_plot(df, 'ChassisType' , 4)
print(df['ChassisType'].isnull().sum())

1
Categories to keep: Index(['Notebook', 'Desktop', 'Laptop', 'Portable'], dtype='object', name='ChassisType')
1


In [15]:
print(df['AppVersion'].isnull().sum())
group_categories_plot(df, 'AppVersion' , 5)
print(df['AppVersion'].isnull().sum())

0
Categories to keep: Index(['4.18.1807.18075', '4.18.1806.18062', '4.12.16299.15',
       '4.16.17656.18052', '4.14.17639.18041'],
      dtype='object', name='AppVersion')
0


In [16]:
print(df['AntivirusConfigID'].isnull().sum())
group_categories_plot(df, 'AntivirusConfigID' , 5)
print(df['AntivirusConfigID'].isnull().sum())

45
Categories to keep: Index(['53447.0', '7945.0', '47238.0', '62773.0', '46413.0'], dtype='string', name='AntivirusConfigID')
45


#Pipelining data preprocessing 


In [17]:
numeric_steps= [('imputer', SimpleImputer(strategy='median'))]
boolean_steps = [('imputer', SimpleImputer(strategy = 'most_frequent'))]
categorical_steps = [('imputer', SimpleImputer(strategy = 'constant', fill_value = 'Other')),('encoder', OneHotEncoder(handle_unknown='ignore')) ]

In [18]:
numeric_pipeline = Pipeline(steps = numeric_steps)
boolean_pipeline = Pipeline(steps = boolean_steps) 
categorical_pipeline = Pipeline(steps = categorical_steps)


In [19]:
ColumnTransform = ColumnTransformer(
    transformers = [
        ('numerical' , numeric_pipeline, numeric), 
        ('categorical',categorical_pipeline ,categorical),
        ('bool', boolean_pipeline, boolean_columns) ]
    
)

In [20]:
X = df.drop(columns= ['target'])
y = df['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

X_train = X_train.where(pd.notna(X_train), np.nan)
X_test  = X_test.where(pd.notna(X_test), np.nan)



In [21]:
X_train = ColumnTransform.fit_transform(X_train)
X_test = ColumnTransform.transform(X_test)

In [22]:
X_train.shape

(52428, 38)

In [86]:
Logistic = LogisticRegression()
Logistic.fit(X_train, y_train)
LogisticPredictions = Logistic.predict(X_test)


In [87]:
print(confusion_matrix(y_test, LogisticPredictions))
print(accuracy_score(y_test, LogisticPredictions))
print(classification_report(y_test, LogisticPredictions))


[[3349 3082]
 [2150 4526]]
0.6008239871824216
              precision    recall  f1-score   support

           0       0.61      0.52      0.56      6431
           1       0.59      0.68      0.63      6676

    accuracy                           0.60     13107
   macro avg       0.60      0.60      0.60     13107
weighted avg       0.60      0.60      0.60     13107



In [88]:
DecisionTree = DecisionTreeClassifier(max_depth =5)

In [89]:
DecisionTree.fit(X_train, y_train)

0,1,2
,criterion,'gini'
,splitter,'best'
,max_depth,5
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [90]:
DecisionTreePredictions = DecisionTree.predict(X_test)
print(confusion_matrix(y_test, DecisionTreePredictions))
print(classification_report(y_test, DecisionTreePredictions))
accuracy_score(y_test, DecisionTreePredictions)

[[3391 3040]
 [2190 4486]]
              precision    recall  f1-score   support

           0       0.61      0.53      0.56      6431
           1       0.60      0.67      0.63      6676

    accuracy                           0.60     13107
   macro avg       0.60      0.60      0.60     13107
weighted avg       0.60      0.60      0.60     13107



0.6009765774013885

Conclusion: Single decision tree couldn't uncover non-linear features this shows HIGH Overlapping and might need to ensemble different results. 

Next - Random Forest Classifier. 



In [93]:
RandomForest = RandomForestClassifier()

In [96]:
RandomForest.fit(X_train, y_train)

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [97]:
RadomForestPredictions = RandomForest.predict(X_test)

In [103]:
print(confusion_matrix(y_test, RadomForestPredictions))
print(accuracy_score(y_test, RadomForestPredictions))
print(classification_report(y_test, RadomForestPredictions))

[[3146 3285]
 [2266 4410]]
0.5764858472571908
              precision    recall  f1-score   support

           0       0.58      0.49      0.53      6431
           1       0.57      0.66      0.61      6676

    accuracy                           0.58     13107
   macro avg       0.58      0.57      0.57     13107
weighted avg       0.58      0.58      0.57     13107

