In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
import time
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
import pickle
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

from tabulate import tabulate
import warnings

In [2]:
# Load the raw data from Wine.csv (no column is used as the index).
dataset1 = pd.read_csv("Wine.csv", index_col=None)

# One-hot encode any categorical columns (first level dropped, dummies as ints).
# get_dummies returns a new frame, so dataset1 itself is left untouched.
df2 = pd.get_dummies(dataset1, dtype=int, drop_first=True)

# Features are the first 13 columns; the target is the column at position 13.
# NOTE(review): assumes the encoded frame has the label in its 14th column —
# confirm against the Wine.csv schema.
indep_x = df2.iloc[:, :13].values
dep_y = df2.iloc[:, 13].values

In [3]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    indep_x,
    dep_y,
    test_size=0.3,
    random_state=42,
)

In [4]:
# Standardize features using statistics learned from the training split only,
# then apply that same fitted scaler to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [5]:
# Models to compare, keyed by the label used for each one in the results table.
# Built from (label, estimator) pairs; insertion order is the evaluation order.
classifiers = dict([
    ('KNN', KNeighborsClassifier(n_neighbors=3)),
    ('Random Forest', RandomForestClassifier(random_state=42)),
    ('SVM', SVC(kernel='linear', random_state=42)),
    ('SVMNL', SVC(kernel='rbf', random_state=0)),
    ('NAVIE', GaussianNB()),
    ('Decision Tree', DecisionTreeClassifier(random_state=42)),
    ('Logistic Regression', LogisticRegression(random_state=42)),
])


In [6]:
# Compare every classifier's test accuracy on the full standardized feature
# set against its accuracy on a PCA-reduced projection of the same data.

# Number of principal components kept by the PCA projection.
N_PCA_COMPONENTS = 5

# Initialize dictionary to store results (one list entry per classifier)
results = {'Model': [], 'Without PCA Accuracy': [], 'With PCA Accuracy': [], 'Before PCA Columns': [], 'After PCA Columns': [], 'PCA Components': [], 'EXP VAR' :[]}

# Fit PCA once, outside the loop: the projection depends only on the scaled
# training data, not on the classifier, so refitting it per model is wasted work.
pca = PCA(n_components=N_PCA_COMPONENTS)
X_train_pca = pca.fit_transform(X_train_scaled)  # Learn the projection on the training data
X_test_pca = pca.transform(X_test_scaled)        # Reuse the same projection for the test data
explained_variance = pca.explained_variance_ratio_

# Labels for the retained components (PC1, PC2, ...) and the per-component
# explained-variance ratios; both are identical for every row of the table.
selected_comp = [f'PC{i+1}' for i in range(pca.n_components_)]
pca_components_label = ', '.join(selected_comp)
explained_variance_label = ', '.join(f'{var:.4f}' for var in explained_variance)

# Evaluate each classifier with and without the PCA projection
for name, clf in classifiers.items():
    # Record the PCA description shared by all models
    results['Model'].append(name)
    results['PCA Components'].append(pca_components_label)
    results['Before PCA Columns'].append(X_train_scaled.shape[1])  # Feature count before PCA
    results['After PCA Columns'].append(X_train_pca.shape[1])      # Feature count after PCA
    results['EXP VAR'].append(explained_variance_label)

    # Train and evaluate the model without PCA (all standardized features)
    clf.fit(X_train_scaled, y_train)
    accuracy_before_pca = clf.score(X_test_scaled, y_test)

    # Refit the same estimator on the PCA-transformed data and evaluate again.
    # After the loop, each estimator in `classifiers` is left fitted on the PCA data.
    clf.fit(X_train_pca, y_train)
    accuracy_after_pca = clf.score(X_test_pca, y_test)

    # Store accuracies in the results
    results['Without PCA Accuracy'].append(accuracy_before_pca)
    results['With PCA Accuracy'].append(accuracy_after_pca)

# Convert the results dictionary to a DataFrame
results_df = pd.DataFrame(results)

# Display the results in a nice table format using tabulate
print("\nSummary of Model Accuracies with PCA Components:")
print(tabulate(results_df, headers='keys', tablefmt='pretty', showindex=False))



Summary of Model Accuracies with PCA Components:
+---------------------+----------------------+--------------------+--------------------+-------------------+-------------------------+----------------------------------------+
|        Model        | Without PCA Accuracy | With PCA Accuracy  | Before PCA Columns | After PCA Columns |     PCA Components      |                EXP VAR                 |
+---------------------+----------------------+--------------------+--------------------+-------------------+-------------------------+----------------------------------------+
|         KNN         |  0.9629629629629629  | 0.9629629629629629 |         13         |         5         | PC1, PC2, PC3, PC4, PC5 | 0.3620, 0.1876, 0.1166, 0.0758, 0.0704 |
|    Random Forest    |         1.0          | 0.9629629629629629 |         13         |         5         | PC1, PC2, PC3, PC4, PC5 | 0.3620, 0.1876, 0.1166, 0.0758, 0.0704 |
|         SVM         |  0.9814814814814815  | 0.9814814814814815 |   