In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


import os
# Print the full path of every file found under the Kaggle input directory.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

import warnings
warnings.filterwarnings('ignore')

/kaggle/input/System-Threat-Forecaster/sample_submission.csv
/kaggle/input/System-Threat-Forecaster/train.csv
/kaggle/input/System-Threat-Forecaster/test.csv


# Loading the datasets

In [None]:
# Read the competition's train and test CSVs, then report each frame's (rows, columns).
train_data = pd.read_csv("/kaggle/input/System-Threat-Forecaster/train.csv")
test_data = pd.read_csv("/kaggle/input/System-Threat-Forecaster/test.csv")
for frame in (train_data, test_data):
    print(frame.shape)

(100000, 76)
(10000, 75)


In [None]:
# Work on a copy so that changes made to `df` do not touch `train_data`.
df = train_data.copy()

In [None]:
# Number of missing values in each column.
df.isna().sum()

MachineID             0
ProductName           0
EngineVersion         0
AppVersion            0
SignatureVersion      0
                   ... 
IsGamer             559
RegionIdentifier    559
DateAS                0
DateOS               24
target                0
Length: 76, dtype: int64

----

# EXPLORATIVE DATA ANALYSIS

### Data Preparation

In [None]:
# print(df.shape)
# df.info()

### Some Observations:
1) There are 100000 rows in the training dataset (before any cleaning) and 76 columns, one of which is the target column `target`.
2) Out of the 76 columns, 31 are of float data type, 17 are of integer data type, and 28 are of object data type.


In [None]:

# Summary statistics for numerical columns.
# `describe()` is the cell's last expression, so the notebook renders it as a table.
print("\nSummary Statistics:")
df.describe()


In [None]:
# Names of all qualitative columns, i.e. those stored with the object dtype.
qualitative_columns = df.select_dtypes(include=['object']).columns
qualitative_columns

In [None]:
# Preview the first rows of the object-dtype columns.
df[qualitative_columns].head()

In [None]:
# # change date into date format
# df['DateAS'] = pd.to_datetime(df['DateAS'])
# df['DateOS'] = pd.to_datetime(df['DateOS'])

In [None]:
# Number of distinct (non-null) values in each qualitative column.
for name, n_unique in df[qualitative_columns].nunique().items():
    print(f"{name}: {n_unique}")

In [None]:
# Names of all numeric columns (counterpart of `qualitative_columns`).
numerical_columns = df.select_dtypes(include=['number']).columns
numerical_columns

In [None]:
# Number of distinct (non-null) values in each numerical column.
for name, n_unique in df[numerical_columns].nunique().items():
    print(f"{name}: {n_unique}")

In [None]:
# Columns excluded from the analysis; any that are already absent are skipped.
redundant_features = ['IsFlightsDisabled', 'IsBetaUser','AutoSampleSubmissionEnabled','DeviceFamily','MachineID']

present_redundant = [name for name in redundant_features if name in df.columns]
df.drop(columns=present_redundant, inplace=True)

In [None]:
# Preview the frame after the redundant columns were removed.
df.head()

In [None]:
# Columns holding exactly two distinct non-null values are treated as boolean features.
bool_features = [name for name in df.columns if df[name].nunique() == 2]

bool_features

### Feature Understanding

In [None]:
# Bar chart of how many rows fall into each target class.
fig, ax = plt.subplots(figsize=(8, 6))
sns.countplot(x='target', data=df, ax=ax)
ax.set_title('Distribution of Target Variable')
ax.set_xlabel('Target')
ax.set_ylabel('Count')

In [None]:
# One histogram per numeric column, drawn on a single large figure.
numerical_columns = df.select_dtypes(include='number').columns
hist_options = dict(figsize=(20, 20), bins=20, color='skyblue', edgecolor='black')
df[numerical_columns].hist(**hist_options)
plt.suptitle("Distributions of Numerical Features\n")
plt.tight_layout()
plt.show()


From the histograms, here are some general observations and inferences:

1) <u> Binary Features </u>:
    Many features (like `IsBetaUser`, `IsSystemProtected`, `IsSecureBootEnabled`, etc.) are binary and probably represent common system settings. Some of these binary features are highly imbalanced (e.g., `RealTimeProtectionState`, `IsFlightsDisabled`, `IsTouchEnabled`), which suggests that most systems share the same setting.

2) <u> Skewed Distributions </u>:
    Several features, like `PrimaryDiskCapacityMB`, `SystemVolumeCapacityMB`, `TotalPhysicalRAMMB`, have a right-skewed distribution, meaning a small number of systems have significantly larger disk or RAM capacities, hence suggesting most systems have standard specs and only few people with high-ends ones.

3) <u> Categorical Variables </u>:
    Features such as `CityID`, `LocaleEnglishNameID`, `OSBuildNumber`, `ProcessorModelID`, and `CEMNameID` have a large number of unique values representing categorical values. Some of these show a dominance which explains there being some configurations or locations that are more common than others.

4) <u> Possible Correlations </u>:
    Features like `ProcessorCoreCount`, `TotalPhysicalRAMMB`, `PrimaryDiskCapacityMB` likely correlate since high-end PCs would have better overall hardware in general, as do software-related features like `OSBuildNumber`, `OSProductSuite` and `OSLocaleID`. This is something we could check in the future.

5) <u> Possible Outliers </u>:
    Features like `ProcessorCoreCount` and `InternalBatteryNumberOfCharges` show some extreme values, meaning there could be outliers. We may need to further look into this in the next steps of EDA.

### Plotting a heatmap to visualize correlation between features

Note: As mentioned before, plotting only points with strong correlation. This also helps avoid clutter, and makes it easier to focus on the important parts. For such, the threshold for "strong correlation" is an absolute value of 0.5  

In [None]:
# Pairwise correlations between all non-object columns.
corr_matrix = df.select_dtypes(exclude='object').corr()

# Only strong correlations are drawn: cells with |r| below the threshold are
# masked out, which is the 0.5 cut-off described in the note preceding this cell.
STRONG_CORR_THRESHOLD = 0.5
weak_corr_mask = corr_matrix.abs() < STRONG_CORR_THRESHOLD

plt.figure(figsize=(10, 8))
sns.heatmap(
    corr_matrix,
    mask=weak_corr_mask,
    annot=False,
    cmap='coolwarm',
    linewidths=0.5,  # numeric line width (was the string "0.5")
    xticklabels=True,
    yticklabels=True,
)
plt.title('Correlation Heatmap of Numerical Features')
plt.show()

In [None]:
# For each two-valued feature, show the row counts per value, split by target class.
for feature in bool_features:
    # `target` is itself two-valued; plotting it against itself carries no information.
    if feature == 'target':
        continue
    fig, ax = plt.subplots(figsize=(7, 3))
    sns.countplot(data=df, x=feature, hue='target', ax=ax)
    ax.set_xlabel(feature, fontsize=12)
    # countplot draws raw counts, not densities.
    ax.set_ylabel('Count', fontsize=12)
    plt.show()

### Plotting boxplot graphs to look for outliers

In [None]:
# Box plot of each selected feature, grouped by target class, to expose outliers.
features = ['ProcessorCoreCount', 'OSBuildNumber']
for feature in features:
    fig, ax = plt.subplots(figsize=(11, 5))
    sns.boxplot(x='target', y=feature, data=df, palette='coolwarm', ax=ax)
    ax.set_title(f"{feature} vs Target", fontsize=14, fontweight='bold')
    ax.set_xlabel("Target", fontsize=12)
    ax.set_ylabel(feature, fontsize=12)
    fig.tight_layout()
    plt.show()

### Inferences

1) ProcessorCoreCount vs Target:
    Most systems have a low core count (2 to 8), and there are outliers with very high core counts (>10). However, the distribution looks similar for both classes (target 1 and 0), which would mean `ProcessorCoreCount` may ***not be a strong differentiator*** between malware-infected (1) and non-infected (0) systems.
   
2) OSBuildNumber vs Target:
    On the other hand, the `OSBuildNumber` distribution is noticeably different between the two target classes. Non-infected (target = 0), i.e. malware-free, systems have a wider range of OS build versions, with extreme lower outliers, whereas infected systems (target = 1) appear to have a more concentrated distribution with lower OS build numbers. This suggests that ***older OS builds may be more prone to malware*** or that ***newer builds are more resistant***.


Naturally, these observations make sense because in malware detection, software-related factors, such as the OS version, installed security patches and system configurations, tend to have a stronger impact than hardware specs like CPU core count. The boxplot *suggests* that older OS builds may be more vulnerable, which aligns with our understanding that outdated systems lack security patches, making them easier targets.

----

# MODEL BUILDING

cleaning data for model building

In [None]:
# Start over from a fresh copy of the raw training data, so the column drops made
# on `df` during the exploratory analysis do not carry over into modelling.
df = train_data.copy()
# Missing-value count per column, shown as the cell output.
df.isna().sum()

MachineID             0
ProductName           0
EngineVersion         0
AppVersion            0
SignatureVersion      0
                   ... 
IsGamer             559
RegionIdentifier    559
DateAS                0
DateOS               24
target                0
Length: 76, dtype: int64

In [None]:
# Keep only the columns that actually contain missing values, with their counts.
null_counts = df.isna().sum()
missing_values = null_counts[null_counts > 0]
missing_values

RealTimeProtectionState                66
AntivirusConfigID                      76
NumAntivirusProductsInstalled          76
NumAntivirusProductsEnabled            76
CityID                                623
IsSystemProtected                      76
SMode                                 981
IEVersionID                           107
FirewallEnabled                       166
EnableLUA                              19
OEMNameID                             212
OEMModelID                            228
ProcessorCoreCount                     85
ProcessorManufacturerID                85
ProcessorModelID                       85
PrimaryDiskCapacityMB                 110
PrimaryDiskType                        23
SystemVolumeCapacityMB                110
TotalPhysicalRAMMB                    151
ChassisType                             2
PrimaryDisplayDiagonalInches           72
PrimaryDisplayResolutionHorizontal     72
PrimaryDisplayResolutionVertical       72
InternalBatteryNumberOfCharges    

In [None]:
# df['DateAS'] = pd.to_datetime(df['DateAS'])
# df['DateOS'] = pd.to_datetime(df['DateOS'])

In [None]:
# Remove the columns judged redundant; names missing from the frame are skipped.
redundant_features = ['IsFlightsDisabled', 'IsBetaUser','AutoSampleSubmissionEnabled','DeviceFamily','MachineID']

present_redundant = [name for name in redundant_features if name in df.columns]
df.drop(columns=present_redundant, inplace=True)

df.shape

(100000, 71)

In [None]:
# Keep a column only if at least half of its rows are non-null.
threshold = 0.5 * len(df)
df = df.dropna(axis='columns', thresh=threshold)

In [None]:
# Fill missing values: column mean for numeric columns, most frequent value for
# every other column.
#
# The results are assigned back to the frame instead of calling
# `df[col].fillna(..., inplace=True)`: that chained in-place form operates on a
# temporary Series, is deprecated, and leaves `df` unchanged under pandas
# copy-on-write.
#
# NOTE(review): these statistics are computed on the full frame before the
# train/validation split, so validation rows influence the fill values.

numerical_col = df.select_dtypes(include=np.number).columns
categorical_col = df.select_dtypes(exclude=np.number).columns

df[numerical_col] = df[numerical_col].fillna(df[numerical_col].mean())

for col in categorical_col:
    df[col] = df[col].fillna(df[col].mode()[0])

train-validation split

In [None]:
from sklearn.model_selection import train_test_split

# Separate predictors from the label, then hold out 20% of the rows for
# validation while preserving the class ratio (stratified on the label).
predictors = df.drop(columns=['target'])
labels = df['target']

X_train, X_val, y_train, y_val = train_test_split(
    predictors,
    labels,
    test_size=0.2,
    random_state=42,
    stratify=labels,
)

X_train.shape, y_train.shape, X_val.shape, y_val.shape

((80000, 70), (80000,), (20000, 70), (20000,))

data processing

In [None]:
# Missing values were already filled earlier; this cell repeats the step with
# SimpleImputer to demonstrate the scikit-learn approach. The imputers are fitted
# on the training split only and then applied to the validation split.

from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer

numerical_col = X_train.select_dtypes(include=['float64', 'int64']).columns
categorical_col = X_train.select_dtypes(include=['object']).columns

# Numeric columns: replace missing entries with the training-split mean.
if not numerical_col.empty:
    num_imputer = SimpleImputer(strategy='mean')
    X_train[numerical_col] = num_imputer.fit_transform(X_train[numerical_col])
    X_val[numerical_col] = num_imputer.transform(X_val[numerical_col])

# Object columns: replace missing entries with the training-split most frequent value.
if not categorical_col.empty:
    cat_imputer = SimpleImputer(strategy='most_frequent')
    X_train[categorical_col] = cat_imputer.fit_transform(X_train[categorical_col])
    X_val[categorical_col] = cat_imputer.transform(X_val[categorical_col])

In [None]:
# Standardise numeric columns using statistics fitted on the training split only.
scaler = StandardScaler()
X_train[numerical_col] = scaler.fit_transform(X_train[numerical_col])
X_val[numerical_col] = scaler.transform(X_val[numerical_col])

# LabelEncoder.transform raises
#   ValueError: y contains previously unseen labels: '...'
# for any validation category that never occurred in the training split.
# Such values are therefore replaced with the training split's most frequent
# category before encoding. (The training split itself needs no replacement:
# a value that is absent from it cannot occur in it.)

encoders = {}
for col in categorical_col:
    most_frequent = X_train[col].mode()[0]
    train_categories = set(X_train[col].unique())

    unseen_in_val = ~X_val[col].isin(train_categories)
    if unseen_in_val.any():
        X_val.loc[unseen_in_val, col] = most_frequent

    le = LabelEncoder()
    X_train[col] = le.fit_transform(X_train[col])
    X_val[col] = le.transform(X_val[col])
    # Keep the fitted encoder so the same mapping can be applied to the test set.
    encoders[col] = le

### Models:
* random forest
* LightGBM
* ada boost classifier
* xgboost
* logistic regression

## RandomForest

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Exhaustive 3-fold grid search over a small random-forest grid, scored by accuracy.
rf = RandomForestClassifier(random_state=42, class_weight='balanced')

rf_params = {
    'n_estimators': [100, 200],
    'max_depth' : [10, None],
    'min_samples_split' : [5, 10 ,20],
    'min_samples_leaf' : [2, 5]
}

rf_grid = GridSearchCV(
    rf,
    rf_params,
    cv=3,
    scoring='accuracy',
    n_jobs=-1,
    verbose=1
)

rf_grid.fit(X_train, y_train)

# A bare expression in the middle of a cell is never displayed, so the winning
# parameter combination is printed explicitly.
print(f"Best parameters: {rf_grid.best_params_}")

rf_best_model = rf_grid.best_estimator_
rf_best_model

Fitting 3 folds for each of 24 candidates, totalling 72 fits


In [None]:
from sklearn.metrics import accuracy_score

# Accuracy of the tuned random forest on the held-out validation split.
y_pred_rf = rf_best_model.predict(X_val)
rf_val_accuracy = accuracy_score(y_val, y_pred_rf)

print(f"Accuracy: {rf_val_accuracy}")

Accuracy: 0.6233


## LightGBM

In [None]:
from lightgbm import LGBMClassifier
from sklearn.model_selection import RandomizedSearchCV

# Randomised 5-fold search (10 sampled candidates) over LightGBM hyperparameters,
# scored by accuracy.
#
# `subsample_freq=1` turns row bagging on at every iteration. With the default
# `subsample_freq=0` bagging is disabled, so the `subsample` values searched
# below would have no effect on the fitted models.
lgb = LGBMClassifier(random_state=42, subsample_freq=1)

param_grid_lgb = {
    'n_estimators': [150, 200],
    'max_depth': [3, 5, 7, 9],
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'min_child_samples': [10, 20, 30],
    'reg_alpha': np.linspace(0, 1, 5),
    'reg_lambda': np.linspace(0, 1, 5)
}

lgb_random_search = RandomizedSearchCV(
    estimator=lgb,
    param_distributions=param_grid_lgb,
    n_iter=10,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
    verbose=1,
    random_state=42
)

lgb_random_search.fit(X_train, y_train)

# Best candidate, refitted on the whole training split by the search.
lgb_best_model = lgb_random_search.best_estimator_

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[LightGBM] [Info] Number of positive: 40420, number of negative: 39580
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.016404 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4789
[LightGBM] [Info] Number of data points in the train set: 80000, number of used features: 70
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.505250 -> initscore=0.021001
[LightGBM] [Info] Start training from score 0.021001


In [None]:
# Accuracy of the tuned LightGBM model on the held-out validation split.
y_pred_lgb = lgb_best_model.predict(X_val)
lgb_val_accuracy = accuracy_score(y_val, y_pred_lgb)

print(f"Accuracy Score: {lgb_val_accuracy}")

Accuracy Score: 0.6313


In [None]:
# Display the selected LightGBM estimator and its hyperparameters.
lgb_best_model

## AdaBoost

In [None]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier


# Exhaustive 5-fold grid search over AdaBoost hyperparameters, scored by accuracy.
abc = AdaBoostClassifier(random_state=42)

# The grid used to list 'learning_rate' twice; Python keeps only the last
# occurrence of a duplicate dict key, so `[5, 10]` was silently discarded.
# Only the values that were actually searched are kept (3 x 4 = 12 candidates).
param_grid = {
    'n_estimators': [100, 200, 300],
    'algorithm': ['SAMME'],
    'learning_rate': [0.01, 0.05, 0.1, 1.0],
}

ada_grid = GridSearchCV(
    estimator=abc,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    verbose=1,
    n_jobs=-1
)

ada_grid.fit(X_train, y_train)

ada_best_model = ada_grid.best_estimator_
ada_best_model

Fitting 5 folds for each of 12 candidates, totalling 60 fits


In [None]:
# Accuracy of the tuned AdaBoost model on the held-out validation split.
y_pred_ada = ada_best_model.predict(X_val)
ada_val_accuracy = accuracy_score(y_val, y_pred_ada)
print(f'accuracy_score: {ada_val_accuracy}')

accuracy_score: 0.61575


## XGBoost

In [None]:
import xgboost as xgb
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# The estimator gets its own name. Binding it to `xgb` would overwrite the
# module alias imported above, so re-running this cell would fail on
# `xgb.XGBClassifier` (the classifier instance has no such attribute).
xgb_clf = xgb.XGBClassifier(random_state=42)

# NOTE(review): this grid has 3**7 = 2187 combinations, i.e. 6561 fits with
# cv=3 - consider a randomised search if the runtime is a concern.
xgb_params = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 4, 5],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.8, 0.9, 1.0],
    'colsample_bytree': [0.8, 0.9, 1.0],
    'reg_alpha': [0, 0.1, 0.5],
    'reg_lambda': [0, 0.1, 0.5]
}

xgb_grid = GridSearchCV(
    xgb_clf,
    xgb_params,
    scoring='accuracy',
    cv=3,
    verbose=1,
    n_jobs=-1
)


xgb_grid.fit(X_train, y_train)

# Best candidate, refitted on the whole training split by the search.
xgb_best_model = xgb_grid.best_estimator_

In [None]:
# Predict on the held-out validation split (not the competition test set).
y_pred_xgb = xgb_best_model.predict(X_val)
xgb_val_accuracy = accuracy_score(y_val, y_pred_xgb)

# Validation accuracy of the tuned XGBoost model.
print(f"Accuracy: {xgb_val_accuracy}")

Accuracy: 0.62775


In [None]:
# Display the selected XGBoost estimator and its hyperparameters.
xgb_best_model

In [None]:
#xgb = XGBClassifier(n_estimators=1000, learning_rate=0.01, max_depth=6, random_state=42)

<table border="1">
  <thead>
    <tr>
      <th>Model</th>
      <th>Accuracy Score</th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <td>RandomForest</td>
      <td>0.6233</td>
    </tr>
    <tr>
      <td>LightGBM</td>
      <td>0.6313</td>
    </tr>
    <tr>
      <td>AdaBoostClassifier</td>
      <td>0.6158</td>
    </tr>
    <tr>
      <td>DecisionTree</td>
      <td>0.5634</td>
    </tr>
    <tr>
      <td>XGBoost</td>
      <td>0.6278</td>
    </tr>
  </tbody>
</table>


----

# SUBMISSION

In [None]:
# Load the competition test set again as a separate frame to preprocess for prediction.
X_test = pd.read_csv("/kaggle/input/System-Threat-Forecaster/test.csv")

In [None]:
# from lightgbm import LGBMClassifier
# from sklearn.model_selection import RandomizedSearchCV
# from sklearn.metrics import accuracy_score

# lgb = LGBMClassifier(random_state=42)

# param_grid_lgb = {
#     'n_estimators': [50, 100, 200],
#     'max_depth': [3, 5, 7, 9],
#     'learning_rate': [0.01, 0.05, 0.1],
#     'subsample': [0.6, 0.8, 1.0]
# }

# lgb_random_search = RandomizedSearchCV(
#     estimator=lgb,
#     param_distributions=param_grid_lgb,
#     n_iter=10,
#     cv=3,
#     scoring='accuracy',
#     n_jobs=-1,
#     verbose=3,
#     random_state=42
# )

# lgb_random_search.fit(X_train, y_train)

In [None]:
# Remove from the test frame the same redundant columns that were removed from
# the training frame; names missing from the frame are skipped.
redundant_features = ['IsFlightsDisabled', 'IsBetaUser','AutoSampleSubmissionEnabled','DeviceFamily','MachineID']

present_redundant = [name for name in redundant_features if name in X_test.columns]
X_test.drop(columns=present_redundant, inplace=True)

X_test.shape

(10000, 70)

In [None]:
# X_test['DateAS'] = pd.to_datetime(X_test['DateAS'])
# X_test['DateOS'] = pd.to_datetime(X_test['DateOS'])

In [None]:
# Apply to the test set exactly the preprocessing that was fitted on the
# training split: same columns in the same order, same imputers, same scaler,
# same label encoders.

# Give the test frame the column set and order the models were trained on.
X_test = X_test[X_train.columns].copy()

# Fill missing values with the already-fitted imputers. Results are assigned
# back rather than using the chained `X_test[col].fillna(..., inplace=True)`,
# which is deprecated and leaves the frame unchanged under pandas copy-on-write.
if len(numerical_col) > 0:
    X_test[numerical_col] = num_imputer.transform(X_test[numerical_col])
if len(categorical_col) > 0:
    X_test[categorical_col] = cat_imputer.transform(X_test[categorical_col])

# Scale numerical features in test set
X_test[numerical_col] = scaler.transform(X_test[numerical_col])

# Encode categorical features in test set (handle unseen values)
for col in categorical_col:
    encoder = encoders[col]
    # X_train[col] is already label-encoded at this point, so its mode is an
    # integer code. Convert it back to the original category before using it as
    # the replacement for categories the encoder has never seen; the encoder's
    # `classes_` is left untouched.
    fallback_category = encoder.inverse_transform([X_train[col].mode()[0]])[0]
    is_known = X_test[col].isin(encoder.classes_)
    X_test[col] = encoder.transform(X_test[col].where(is_known, fallback_category))

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
from sklearn.ensemble import StackingClassifier

# Stack the four tuned models; a logistic regression combines their predictions.
base_learners = [
    ('rf', rf_best_model),
    ('lgb', lgb_best_model),
    ('xgb', xgb_best_model),
    ('ada', ada_best_model),
]

stack = StackingClassifier(estimators=base_learners, final_estimator=LogisticRegression())
stack.fit(X_train, y_train)

# Accuracy of the stacked ensemble on the held-out validation split.
y_pred_stack = stack.predict(X_val)
stack_val_accuracy = accuracy_score(y_val, y_pred_stack)

print(f"Accuracy: {stack_val_accuracy}")

[LightGBM] [Info] Number of positive: 40420, number of negative: 39580
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.018913 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4789
[LightGBM] [Info] Number of data points in the train set: 80000, number of used features: 70
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.505250 -> initscore=0.021001
[LightGBM] [Info] Start training from score 0.021001
[LightGBM] [Info] Number of positive: 32336, number of negative: 31664
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.012689 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4723
[LightGBM] [Info] Number of data points in the train set: 64000, number of used features: 70
[LightGBM] [Info] [b

In [None]:
# Predict labels for the preprocessed test set with the stacked ensemble.
y_pred = stack.predict(X_test)
y_pred

array([1, 0, 1, ..., 0, 0, 0])

In [None]:
# Display the test-set predictions again (same array as produced by `stack.predict`).
y_pred

array([1, 0, 1, ..., 0, 0, 0])

In [None]:
# Build the submission frame (sequential row id plus predicted label) and save it as CSV.
n_test_rows = len(test_data)
submission = pd.DataFrame({
    "id": range(n_test_rows),
    "target": y_pred,
})
submission.to_csv('submission.csv', index=False)