In [2]:
# importing necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_curve, auc
from sklearn.preprocessing import LabelEncoder
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from ydata_profiling import ProfileReport

In [3]:
# Load the dataset
df = pd.read_csv('Microsoft_malware_dataset_min.csv')
df.head()

Unnamed: 0,HasDetections,Wdft_IsGamer,Census_IsVirtualDevice,Census_OSEdition,Census_HasOpticalDiskDrive,Firewall,SMode,IsProtected,OsPlatformSubRelease,CountryIdentifier
0,0,0.0,0.0,Professional,0,1.0,0.0,1.0,rs4,29
1,0,0.0,0.0,Professional,0,1.0,0.0,1.0,rs4,93
2,0,0.0,0.0,Core,0,1.0,0.0,1.0,rs4,86
3,1,0.0,0.0,Professional,0,1.0,0.0,1.0,rs4,88
4,1,0.0,0.0,Core,0,1.0,0.0,1.0,rs4,18


### basic data exploration

In [4]:
df.shape

(100000, 10)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 10 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   HasDetections               100000 non-null  int64  
 1   Wdft_IsGamer                96585 non-null   float64
 2   Census_IsVirtualDevice      99815 non-null   float64
 3   Census_OSEdition            100000 non-null  object 
 4   Census_HasOpticalDiskDrive  100000 non-null  int64  
 5   Firewall                    98924 non-null   float64
 6   SMode                       94068 non-null   float64
 7   IsProtected                 99609 non-null   float64
 8   OsPlatformSubRelease        100000 non-null  object 
 9   CountryIdentifier           100000 non-null  int64  
dtypes: float64(5), int64(3), object(2)
memory usage: 7.6+ MB


In [6]:
df.isnull().sum()

HasDetections                    0
Wdft_IsGamer                  3415
Census_IsVirtualDevice         185
Census_OSEdition                 0
Census_HasOpticalDiskDrive       0
Firewall                      1076
SMode                         5932
IsProtected                    391
OsPlatformSubRelease             0
CountryIdentifier                0
dtype: int64

In [7]:
# Generate Pandas Profiling Report
profile = ProfileReport(df, title="Malware Dataset Profiling Report")
profile.to_file("malware_profile_report.html")

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

In [8]:
df.describe()

Unnamed: 0,HasDetections,Wdft_IsGamer,Census_IsVirtualDevice,Census_HasOpticalDiskDrive,Firewall,SMode,IsProtected,CountryIdentifier
count,100000.0,96585.0,99815.0,100000.0,98924.0,94068.0,99609.0,100000.0
mean,0.49928,0.281607,0.006642,0.07585,0.977326,0.000351,0.944864,108.18805
std,0.500002,0.449785,0.08123,0.264759,0.148863,0.018727,0.228246,62.989406
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
25%,0.0,0.0,0.0,0.0,1.0,0.0,1.0,51.0
50%,0.0,0.0,0.0,0.0,1.0,0.0,1.0,97.0
75%,1.0,1.0,0.0,0.0,1.0,0.0,1.0,162.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,222.0


In [9]:
# Handle Missing Values
df.fillna({'Wdft_IsGamer': 0}, inplace=True)
df.fillna({'SMode': 0}, inplace=True)

In [10]:
# Remove duplicates
df_cleaned = df.drop_duplicates()
print("\nDuplicates removed:", df.shape[0] - df_cleaned.shape[0])


Duplicates removed: 83371


In [11]:
# Encode categorical features
le = LabelEncoder()
categorical_columns = ['Census_OSEdition', 'OsPlatformSubRelease']
for col in categorical_columns:
    df_cleaned[col] = le.fit_transform(df_cleaned[col])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned[col] = le.fit_transform(df_cleaned[col])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned[col] = le.fit_transform(df_cleaned[col])


In [12]:
df_cleaned.head()

Unnamed: 0,HasDetections,Wdft_IsGamer,Census_IsVirtualDevice,Census_OSEdition,Census_HasOpticalDiskDrive,Firewall,SMode,IsProtected,OsPlatformSubRelease,CountryIdentifier
0,0,0.0,0.0,11,0,1.0,0.0,1.0,4,29
1,0,0.0,0.0,11,0,1.0,0.0,1.0,4,93
2,0,0.0,0.0,1,0,1.0,0.0,1.0,4,86
3,1,0.0,0.0,11,0,1.0,0.0,1.0,4,88
4,1,0.0,0.0,1,0,1.0,0.0,1.0,4,18


In [29]:
df_cleaned.isnull().sum()

HasDetections                   0
Wdft_IsGamer                    0
Census_IsVirtualDevice        152
Census_OSEdition                0
Census_HasOpticalDiskDrive      0
Firewall                      452
SMode                           0
IsProtected                   339
OsPlatformSubRelease            0
CountryIdentifier               0
dtype: int64

In [30]:
# Handle Missing Values
df_cleaned.fillna({'Census_IsVirtualDevice': 0}, inplace=True)
df_cleaned.fillna({'Firewall': 0}, inplace=True)
df_cleaned.fillna({'IsProtected': 0}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned.fillna({'Census_IsVirtualDevice': 0}, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned.fillna({'Firewall': 0}, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned.fillna({'IsProtected': 0}, inplace=True)


In [31]:
# Prepare for modeling
x = df_cleaned.drop('HasDetections', axis=1)
y = df_cleaned['HasDetections']

In [32]:
# Splitting the data
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [33]:
# Train Decision Tree and plot ROC curve
plt.figure(figsize=(10, 6))
for max_depth in [3, 5, 7, 10]:
    dt = DecisionTreeClassifier(max_depth=max_depth, random_state=42)
    dt.fit(x_train, y_train)
    y_pred_proba = dt.predict_proba(x_test)[:, 1]
    
    fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
    roc_auc = auc(fpr, tpr)
    
    plt.plot(fpr, tpr, label=f'ROC curve (max_depth={max_depth}) (AUC = {roc_auc:.2f})')

plt.plot([0, 1], [0, 1], 'k--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curves for Different Decision Tree Depths')
plt.legend(loc="lower right")
plt.savefig('roc_curves.png')
plt.close()

## Unsupervised Learning

In [34]:
# Prepare data for clustering
x_cluster = df_cleaned.drop('HasDetections', axis=1)
scaler = StandardScaler()
x_scaled = scaler.fit_transform(x_cluster)

In [35]:
# Find optimal K using elbow method
inertias = []
K = range(1, 11)
for k in K:
    kmeans = KMeans(n_clusters=k, random_state=42)
    kmeans.fit(x_scaled)
    inertias.append(kmeans.inertia_)

In [38]:
# Plot elbow curve
plt.figure(figsize=(10, 6))
plt.plot(K, inertias, 'bx-')
plt.xlabel('k')
plt.ylabel('Inertia')
plt.title('Elbow Method For Optimal k')
plt.savefig('elbow_curve.png')
plt.close()

In [39]:
# Apply K-means with optimal K
optimal_k = 4  # Based on elbow curve
kmeans = KMeans(n_clusters=optimal_k, random_state=42)
clusters = kmeans.fit_predict(x_scaled)

In [42]:
 # Visualizing clusters using first two features
plt.figure(figsize=(10, 6))
scatter = plt.scatter(x_scaled[:, 0], x_scaled[:, 1], c=clusters, cmap='viridis')
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.title('K-means Clustering Results')
plt.colorbar(scatter)
plt.savefig('K-means cluster')
plt.close()