In [None]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

from sklearn.feature_selection import VarianceThreshold, SelectKBest, f_classif

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier

In [90]:
# Load the dataset
df = pd.read_csv("first_500_rows.csv")

In [91]:
df.shape

(500, 592)

In [92]:
df.drop(columns = ["Time"], inplace = True)

In [93]:
# Fill remaining missing values with the mean
df.fillna(df.mean(), inplace = True)

In [94]:
# Split dataset into features (X) and target (y)
X = df.drop(columns = ["Pass/Fail"])
y = df["Pass/Fail"]

In [126]:
from sklearn.model_selection import train_test_split
# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [127]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((400, 590), (100, 590), (400,), (100,))

In [128]:
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy = "mean")
X_train = imputer.fit_transform(X_train)
X_test = imputer.transform(X_test)

In [129]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.fit_transform(X_test)

### 1. VarianceThreshold (removes low-variance features)

In [131]:
selector = VarianceThreshold()
X_train_var = selector.fit_transform(X_train)
X_test_var = selector.fit_transform(X_test)

In [132]:
print("Shape before Feature selection: ", X_train.shape, X_test.shape)
print("Shape after Feature selection: ", X_train_var.shape, X_test_var.shape)

Shape before Feature selection:  (400, 574) (100, 574)
Shape after Feature selection:  (400, 452) (100, 448)


### 2. SelectKBest (selects top 10 features based on ANOVA F-score)

In [134]:
selector = SelectKBest(score_func = f_classif, k = 10)
X_train_best = selector.fit_transform(X_train, y_train)
X_train_best = selector.transform(X_test)

In [137]:
print("Shape before Feature selection: ", X_train.shape, X_test.shape)
print("Shape after Feature selection: ", X_train_best.shape, X_train_best.shape)

Shape before Feature selection:  (400, 574) (100, 574)
Shape after Feature selection:  (400, 10) (400, 10)


### --------------------- DIMENSIONALITY REDUCTION USING PCA --------------------- #

In [138]:
# Apply PCA to reduce dimensions to 4 components (FIXED: Ensure PCA components do not exceed features)
from sklearn.decomposition import PCA
n_pca_components = 10
pca = PCA(n_components = n_pca_components)
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)

In [141]:
print("Shape before PCA: ", X_train.shape, X_test.shape)
print("Shape after PCA: ", X_train_pca.shape, X_test_pca.shape)

Shape before PCA:  (400, 574) (100, 574)
Shape after PCA:  (400, 10) (100, 10)


In [143]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
# Train Logistic Regression Model
log_reg = LogisticRegression()
log_reg.fit(X_train_pca, y_train)
y_pred_log = log_reg.predict(X_test_pca)
accuracy_log = accuracy_score(y_test, y_pred_log)
print(f"\nLogistic Regression Accuracy: {accuracy_log:.2f}")


Logistic Regression Accuracy: 0.86
