In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Load the extracted features. The first column holds the label; every
# remaining column is a feature.
df = pd.read_csv("extracted_features.csv")

X_raw = df.iloc[:, 1:].values
y = df.iloc[:, 0].values

# Split into an 80:20 train-test set *before* normalizing, so the scaler's
# mean and standard deviation are estimated from the training rows only and
# no test-set statistics leak into the training features.
X_train, X_test, y_train, y_test = train_test_split(
    X_raw, y, test_size=0.2, random_state=42
)

# Normalize: fit on the training split, then apply that same transform to
# both splits.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Full feature matrix standardized with the training-split statistics, kept
# under its original name `X`.
X = scaler.transform(X_raw)


In [None]:
import numpy as np
from scipy.stats import norm
from sklearn.model_selection import train_test_split

# PCA Implementation
def compute_pca(X, variance_retained=0.95):
    """Project X onto the leading principal components of its covariance.

    The data is mean-centered (it is NOT rescaled to unit variance), the
    covariance matrix is eigen-decomposed, and the smallest number of
    leading components whose cumulative explained-variance ratio reaches
    ``variance_retained`` is kept.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Input data, one sample per row.
    variance_retained : float, default 0.95
        Target cumulative explained-variance ratio. If the target cannot be
        reached (for example a value above 1, or rounding error at exactly
        1.0), every component is kept.

    Returns
    -------
    X_pca : ndarray of shape (n_samples, n_components)
        The centered data projected onto the kept components.
    principal_components : ndarray of shape (n_features, n_components)
        Kept eigenvectors as columns, ordered by decreasing eigenvalue.
    """
    X = np.asarray(X, dtype=float)

    # Center each feature on zero mean; no variance scaling is applied here.
    X_centered = X - np.mean(X, axis=0)

    # np.cov collapses to a 0-d array when there is a single feature, which
    # np.linalg.eigh rejects, so force a 2-D matrix.
    covariance_matrix = np.atleast_2d(np.cov(X_centered, rowvar=False))

    # eigh returns eigenvalues in ascending order; reorder to descending.
    eigenvalues, eigenvectors = np.linalg.eigh(covariance_matrix)
    order = np.argsort(eigenvalues)[::-1]
    eigenvalues = eigenvalues[order]
    eigenvectors = eigenvectors[:, order]

    # A covariance matrix is positive semi-definite, so any negative
    # eigenvalue is numerical noise; clip it out of the variance ratios.
    explained = np.clip(eigenvalues, 0.0, None)
    total_variance = explained.sum()

    if total_variance > 0:
        cumulative_variance = np.cumsum(explained) / total_variance
        reached = np.flatnonzero(cumulative_variance >= variance_retained)
        # argmax over an all-False mask would silently report one component;
        # when the target is never reached, keep everything instead.
        num_components = int(reached[0]) + 1 if reached.size else eigenvalues.size
    else:
        # Constant data: the variance ratio is undefined (0/0). Keep a single
        # component so the projection is still a 2-D array.
        num_components = 1

    principal_components = eigenvectors[:, :num_components]
    X_pca = np.dot(X_centered, principal_components)
    return X_pca, principal_components

In [None]:
# Naïve Bayes Implementation
class NaiveBayes:
    """Gaussian Naïve Bayes classifier.

    Each feature is modelled, per class, as an independent normal
    distribution. Prediction picks the class with the largest log-prior
    plus summed per-feature log-likelihood.

    Attributes set by ``fit``:
        classes: sorted array of the distinct labels seen in ``y``.
        means:   dict mapping class label -> per-feature mean.
        stds:    dict mapping class label -> per-feature standard deviation
                 (with a small constant added).
        priors:  dict mapping class label -> fraction of training rows.
    """

    def fit(self, X, y):
        """Estimate per-class feature means, standard deviations and priors.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
        y : array-like of shape (n_samples,)

        Returns
        -------
        self : NaiveBayes
            The fitted instance, so calls can be chained.
        """
        # Coerce to arrays so boolean-mask indexing also works for lists.
        X = np.asarray(X, dtype=float)
        y = np.asarray(y)

        self.classes = np.unique(y)
        self.means = {}
        self.stds = {}
        self.priors = {}

        for cls in self.classes:
            X_cls = X[y == cls]
            self.means[cls] = np.mean(X_cls, axis=0)
            # Small constant keeps the scale strictly positive for features
            # that are constant within a class.
            self.stds[cls] = np.std(X_cls, axis=0) + 1e-6
            # Prior probability = share of training rows in this class.
            self.priors[cls] = X_cls.shape[0] / X.shape[0]
        return self

    def predict(self, X):
        """Return the most probable class label for every row of X.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)

        Returns
        -------
        ndarray of shape (n_samples,)
            Predicted labels, drawn from ``self.classes``.
        """
        X = np.asarray(X, dtype=float)
        if X.ndim == 1:
            # Treat a 1-D input as one scalar observation per sample, which
            # is how a row-by-row loop over such an input behaves.
            X = X[:, np.newaxis]
        if X.shape[0] == 0:
            return np.array([], dtype=self.classes.dtype)

        # One row per class, one column per sample: log prior plus the sum of
        # per-feature Gaussian log-densities, evaluated for all samples at once.
        log_posteriors = np.stack([
            np.log(self.priors[cls])
            + norm.logpdf(X, self.means[cls], self.stds[cls]).sum(axis=1)
            for cls in self.classes
        ])
        # argmax returns the first maximum, so ties go to the lowest class.
        return self.classes[np.argmax(log_posteriors, axis=0)]


In [9]:
def accuracy(y_true, y_pred):
    """Return the fraction of positions where y_true and y_pred agree."""
    matches = y_true == y_pred
    return np.mean(matches)


# Synthetic stand-in data (swap in the real scaled dataset here).
np.random.seed(42)
X_scaled = np.random.rand(100, 5)         # 100 samples, 5 features
y_encoded = np.random.randint(0, 2, 100)  # binary labels

# Reduce dimensionality, keeping enough components for 95% of the variance.
# NOTE(review): the projection is computed from all rows, before the
# train/test split below.
X_pca, _ = compute_pca(X_scaled, variance_retained=0.95)

# Stratified 80:20 split of the projected data.
X_train, X_test, y_train, y_test = train_test_split(
    X_pca,
    y_encoded,
    test_size=0.2,
    random_state=42,
    stratify=y_encoded,
)

# Fit the hand-written Naïve Bayes model on the training split.
nb_model = NaiveBayes()
nb_model.fit(X_train, y_train)

# Predict on both splits and score them.
y_train_pred = nb_model.predict(X_train)
y_test_pred = nb_model.predict(X_test)
accuracy_train = accuracy(y_train, y_train_pred)
accuracy_test = accuracy(y_test, y_test_pred)

print(f"PCA + Naïve Bayes Train Accuracy: {accuracy_train:.4f}")
print(f"PCA + Naïve Bayes Test Accuracy: {accuracy_test:.4f}")

PCA + Naïve Bayes Train Accuracy: 0.6625
PCA + Naïve Bayes Test Accuracy: 0.5500


In [16]:
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score
# GaussianNB is used below, so it is imported here; without this import the
# cell raises NameError when run on its own.
from sklearn.naive_bayes import GaussianNB

# Apply PCA to reduce dimensions while keeping 95% variance.
# NOTE(review): X_scaled / y_encoded come from an earlier cell, and PCA is
# fitted on all rows before the split below -- confirm both are intended.
pca = PCA(n_components=0.95)
X_pca = pca.fit_transform(X_scaled)

# Split data again
X_train, X_test, y_train, y_test = train_test_split(X_pca, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded)

# Train Naïve Bayes Model
nb_model = GaussianNB()
nb_model.fit(X_train, y_train)

# Predict on training and test data (each split is predicted once).
y_train_pred = nb_model.predict(X_train)
y_test_pred = nb_model.predict(X_test)
# `y_pred` previously held a second, identical test-set prediction; keep the
# name as an alias in case other cells read it.
y_pred = y_test_pred

# Calculate accuracy
accuracy_train = accuracy_score(y_train, y_train_pred)
accuracy_test = accuracy_score(y_test, y_test_pred)

print(f"PCA + Naïve Bayes Train Accuracy: {accuracy_train:.4f}")
print(f"PCA + Naïve Bayes Test Accuracy: {accuracy_test:.4f}")


PCA + Naïve Bayes Train Accuracy: 0.7030
PCA + Naïve Bayes Test Accuracy: 0.4557
