# Binary Classification with a Software Defects Dataset
Playground Series - Season 3, Episode 23
https://www.kaggle.com/competitions/playground-series-s3e23/data

In [1]:
# Libraries
import numpy as np
import pandas as pd

from preprocessing import read_train, SMOTE_data, plot_dist

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, accuracy_score

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

import matplotlib.pyplot as plt

### Data

In [2]:
X, y = read_train()
X, y = SMOTE_data(X, y)

In [4]:
from collections import Counter
(counter := Counter(y))

Counter({False: 65608, True: 43739})

In [5]:
# As discover in the EDA, we will log transform all X for better distribution
log_X = np.log(X + .001)

In [6]:
# Transform data into Standard scale
ss = StandardScaler()
fit_X = ss.fit_transform(log_X)

In [7]:
# 
xtrain, xtest, ytrain, ytest = train_test_split(fit_X, y, test_size=.3)

### Models

In [8]:
# Imports
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm

rs = 93

In [12]:
# SVM
svc = svm.SVC().fit(xtrain, ytrain)
svc_pred = svc.predict(xtest)
print("Accuracy:", accuracy_score(ytest, svc_pred))
print("ROC/AUC Score:", roc_auc_score(ytest, svc_pred))

Accuracy: 0.738515470202713
ROC/AUC Score: 0.7082073542645551


In [13]:
# RandomForest
rf = RandomForestClassifier(n_estimators=200).fit(xtrain, ytrain)
rf_pred = rf.predict(xtest)
print("Accuracy:", accuracy_score(ytest, rf_pred))
print("ROC/AUC Score:", roc_auc_score(ytest, rf_pred))

In [10]:
# KNN
knn = KNeighborsClassifier(n_neighbors=5).fit(xtrain, ytrain)
knn_pred = knn.predict(xtest)
print("Accuracy:", accuracy_score(ytest, knn_pred))
print("ROC/AUC Score:", roc_auc_score(ytest, knn_pred))

Accuracy: 0.7387898186252095
ROC/AUC Score: 0.7213182226084673


In [9]:
# Logistic Regression
lr = LogisticRegression(random_state=rs, max_iter=1000).fit(xtrain, ytrain)
lr_pred = lr.predict(xtest)
print("Accuracy:", accuracy_score(ytest, lr_pred))
print("ROC/AUC Score:", roc_auc_score(ytest, lr_pred))

Accuracy: 0.7347050754458162
ROC/AUC Score: 0.7064375084555259


### DefectNet