In [1]:
import pandas as pd

In [2]:
X = pd.read_csv('encoded_X.csv')
y = pd.read_csv('encoded_y.csv')

In [4]:
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier

In [4]:
# List of columns that are categorical and currently of type object
categorical_cols = [
    'Processor', 'OsPlatformSubRelease', 'SkuEdition', 'SmartScreen', 
    'Census_MDC2FormFactor', 'Census_PrimaryDiskTypeName', 'Census_ChassisTypeName',
    'Census_PowerPlatformRoleName', 'Census_InternalBatteryType', 'Census_OSArchitecture',
    'Census_OSBranch', 'Census_OSEdition', 'Census_OSSkuName', 'Census_OSInstallTypeName',
    'Census_OSWUAutoUpdateOptionsName', 'Census_GenuineStateName', 'Census_ActivationChannel',
    'Census_FlightRing'
]

# Convert the object columns to categorical type
for col in categorical_cols:
    if col in X.columns:
        X[col] = X[col].astype('category')

In [5]:
# Split the dataset into 80% train and 20% test sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [6]:
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.preprocessing import StandardScaler
import numpy as np

In [8]:
from sklearn.preprocessing import OrdinalEncoder
 
oe = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)
scaler = StandardScaler()
 
oe.fit(X_train)

X_train = oe.transform(X_train)
X_val = oe.transform(X_val)

scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_val = scaler.transform(X_val)

  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count


In [9]:
model_xgb = XGBClassifier(n_estimators=200, random_state=42, use_label_encoder=False, eval_metric='logloss', enable_categorical=True)
model_xgb.fit(X_train, y_train)

Parameters: { "use_label_encoder" } are not used.



In [10]:
X_train = pd.DataFrame(X_train, columns=X.columns)
X_train = X_train.fillna(-1)
X_val = pd.DataFrame(X_val, columns=X.columns)
X_val = X_val.fillna(-1)

In [11]:
# Retrieve feature importances
importances_xgb = model_xgb.feature_importances_
feat_importances_xgb = pd.Series(importances_xgb, index=X_train.columns).sort_values(ascending=False)

# Display top 10 feature importances
print(feat_importances_xgb.sort_values(ascending=False))
print(sum(feat_importances_xgb))

AVProductsInstalled        0.150720
SmartScreen                0.140838
AvSigVersion_1             0.059362
Census_TotalPhysicalRAM    0.036035
Processor                  0.030408
                             ...   
OsSuite_bit4               0.000000
OsSuite_bit7               0.000000
OsSuite_bit5               0.000000
OsSuite_bit6               0.000000
OsBuildLab_part3           0.000000
Length: 83, dtype: float32
1.000000007392373


In [None]:
# Select features based on a threshold.
# For XGBoost, a threshold like 0.01 can work, but you may need to adjust based on your data.
top_feats_xgb = feat_importances_xgb[feat_importances_xgb > 0.001].index

# Reduce the training and validation sets to only the selected features
X_train = X_train[top_feats_xgb]
X_val = X_val[top_feats_xgb]

In [17]:
y_train.shape
y_val.shape

(1784297, 1)

In [12]:
model_xgb.fit(X_train, y_train)
labels = model_xgb.predict(X_train)
print("Training Accuracy: ", accuracy_score(y_train, labels))
test_labels = model_xgb.predict(X_val)
print("Validation Accuracy: ", accuracy_score(y_val, test_labels))

Parameters: { "use_label_encoder" } are not used.



Training Accuracy:  0.6682572094940499
Validation Accuracy:  0.6646836261003634


In [13]:
from sklearn.linear_model import LogisticRegression

In [14]:
logreg = LogisticRegression()
logreg.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)


In [15]:
labels = logreg.predict(X_train)
print(accuracy_score(y_train, labels))
test_labels = logreg.predict(X_val)
print(accuracy_score(y_val, test_labels))

0.608634411377257
0.6085068797403123


In [16]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam

In [23]:
model = Sequential()

model.add(Dense(83, activation='relu', input_shape=(X_train.shape[1],)))

# Hidden layers
model.add(Dense(64, activation='relu'))

model.add(Dense(32, activation='relu'))

# Output layer: 1 neuron with sigmoid activation for binary classification
model.add(Dense(1, activation='sigmoid'))

model.compile(optimizer=Adam(), loss='binary_crossentropy', metrics=['accuracy'])

model.summary()

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [24]:
history = model.fit(X_train, y_train,
                    epochs=10,
                    batch_size=256,
                    validation_split=0.2,
                    verbose=1)

# Predictions
y_train_pred = (model.predict(X_train) > 0.5).astype("int32")
y_test_pred = (model.predict(X_val) > 0.5).astype("int32")

# Accuracy on train and test data
train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_val, y_test_pred)
 
print(f"Train Accuracy: {train_accuracy}")
print(f"Test Accuracy: {test_accuracy}")

Epoch 1/10
[1m22304/22304[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 709us/step - accuracy: 0.6329 - loss: 0.6325 - val_accuracy: 0.6449 - val_loss: 0.6202
Epoch 2/10
[1m22304/22304[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 680us/step - accuracy: 0.6465 - loss: 0.6193 - val_accuracy: 0.6480 - val_loss: 0.6182
Epoch 3/10
[1m22304/22304[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 660us/step - accuracy: 0.6489 - loss: 0.6172 - val_accuracy: 0.6491 - val_loss: 0.6169
Epoch 4/10
[1m22304/22304[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 682us/step - accuracy: 0.6509 - loss: 0.6154 - val_accuracy: 0.6509 - val_loss: 0.6153
Epoch 5/10
[1m22304/22304[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 664us/step - accuracy: 0.6520 - loss: 0.6144 - val_accuracy: 0.6513 - val_loss: 0.6147
Epoch 6/10
[1m22304/22304[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 672us/step - accuracy: 0.6529 - loss: 0.6134 - val_accuracy: 0.6528 - val

In [17]:
import tensorflow as tf
from tensorflow.keras import layers, models # type: ignore
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Sequential
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
 
model = Sequential()
 
model.add(layers.InputLayer(input_shape=(X_train.shape[1],)))
 
model.add(layers.Dense(64, activation='relu'))
 
model.add(layers.Dense(1, activation='sigmoid'))
 
model.compile(optimizer=Adam(), loss='binary_crossentropy', metrics=['accuracy'])
 
history = model.fit(X_train, y_train, epochs=10, batch_size=64, validation_data=(X_val, y_val))
 
# Predictions
y_train_pred = (model.predict(X_train) > 0.5).astype("int32")
y_test_pred = (model.predict(X_val) > 0.5).astype("int32")
 
# Accuracy on train and test data
train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_val, y_test_pred)
 
print(f"Train Accuracy: {train_accuracy}")
print(f"Test Accuracy: {test_accuracy}")



Epoch 1/10
[1m111519/111519[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 373us/step - accuracy: 0.6337 - loss: 0.6345 - val_accuracy: 0.6419 - val_loss: 0.6264
Epoch 2/10
[1m111519/111519[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 358us/step - accuracy: 0.6443 - loss: 0.6241 - val_accuracy: 0.6437 - val_loss: 0.6237
Epoch 3/10
[1m111519/111519[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 357us/step - accuracy: 0.6462 - loss: 0.6220 - val_accuracy: 0.6474 - val_loss: 0.6210
Epoch 4/10
[1m111519/111519[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 358us/step - accuracy: 0.6474 - loss: 0.6211 - val_accuracy: 0.6471 - val_loss: 0.6215
Epoch 5/10
[1m111519/111519[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 357us/step - accuracy: 0.6481 - loss: 0.6206 - val_accuracy: 0.6477 - val_loss: 0.6204
Epoch 6/10
[1m111519/111519[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 356us/step - accuracy: 0.6486 - loss: 0.6198 - val_accuracy: 