In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split 
import time
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
import pickle
import matplotlib.pyplot as plt

In [2]:
def split_scalar(indep_X, dep_Y, test_size=0.25, random_state=0):
    """Split the data into train/test partitions and standardise the features.

    Parameters
    ----------
    indep_X : array-like
        Feature matrix.
    dep_Y : array-like
        Target values, aligned row-for-row with ``indep_X``.
    test_size : float or int, default 0.25
        Forwarded to ``train_test_split``. The default reproduces the
        75/25 split this helper always used.
    random_state : int or None, default 0
        Forwarded to ``train_test_split``. The default reproduces the
        fixed seed this helper always used, so repeated calls return the
        same partition.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``. The two feature blocks are
        the outputs of ``StandardScaler``; the targets are returned exactly
        as ``train_test_split`` produced them.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        indep_X, dep_Y, test_size=test_size, random_state=random_state
    )

    # The scaler is fitted on the training partition only and then re-used
    # on the test partition, so no test-set statistics leak into training.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    return X_train, X_test, y_train, y_test

In [3]:
def pcavariance(indep_X, dep_Y, n):
    """Return the share of projected variance carried by each kernel-PCA component.

    The data is split and standardised with ``split_scalar``, an RBF
    ``KernelPCA`` with ``n`` components is fitted on the training partition,
    and the variance of each projected training column is divided by the
    sum of those variances.

    Parameters
    ----------
    indep_X : array-like
        Feature matrix.
    dep_Y : array-like
        Target values; only needed so that ``split_scalar`` can be called.
    n : int
        Number of kernel-PCA components to keep.

    Returns
    -------
    numpy.ndarray
        One ratio per retained component. The ratios are relative to the
        retained components only, so they sum to 1 (or are all 0 when the
        projection has no variance at all).

    Notes
    -----
    ``KernelPCA`` does not expose ``explained_variance_ratio_`` (that
    attribute belongs to linear ``PCA``), so the ratio is derived from the
    projected training data instead.
    """
    from sklearn.decomposition import KernelPCA

    X_train, _X_test, _y_train, _y_test = split_scalar(indep_X, dep_Y)

    kpca = KernelPCA(n_components=n, kernel='rbf')
    X_train_kpca = kpca.fit_transform(X_train)

    component_variance = np.var(X_train_kpca, axis=0)
    total_variance = component_variance.sum()
    if total_variance == 0:
        # Degenerate projection: avoid a 0/0 division and report zeros.
        return np.zeros_like(component_variance)
    return component_variance / total_variance

In [4]:
def cm_prediction(classifier, X_test, y_test):
    """Score an already-fitted classifier on the held-out data.

    Parameters
    ----------
    classifier : object
        Fitted estimator exposing ``predict``.
    X_test : array-like
        Features to predict on.
    y_test : array-like
        True labels for ``X_test``.

    Returns
    -------
    tuple
        ``(classifier, accuracy, report, X_test, y_test, cm)`` where
        ``accuracy`` comes from ``accuracy_score``, ``report`` from
        ``classification_report`` and ``cm`` from ``confusion_matrix``.
        ``classifier``, ``X_test`` and ``y_test`` are handed back unchanged.
    """
    from sklearn.metrics import confusion_matrix
    from sklearn.metrics import accuracy_score, classification_report

    predicted = classifier.predict(X_test)

    matrix = confusion_matrix(y_test, predicted)
    score = accuracy_score(y_test, predicted)
    summary = classification_report(y_test, predicted)

    return classifier, score, summary, X_test, y_test, matrix

In [5]:
def logistic(X_train, y_train, X_test, y_test):
    """Fit a logistic-regression model and score it on the test partition.

    The model is created with ``random_state=0``, trained on
    ``(X_train, y_train)`` and then passed to ``cm_prediction``.

    Returns
    -------
    tuple
        ``(classifier, Accuracy, report, X_test, y_test, cm)`` exactly as
        produced by ``cm_prediction``.
    """
    from sklearn.linear_model import LogisticRegression

    # ``fit`` returns the estimator itself, so construction and training chain.
    model = LogisticRegression(random_state=0).fit(X_train, y_train)
    return cm_prediction(model, X_test, y_test)

In [6]:
def svm_linear(X_train, y_train, X_test, y_test):
    """Fit a linear-kernel SVC and score it on the test partition.

    The model is created with ``kernel='linear'`` and ``random_state=0``,
    trained on ``(X_train, y_train)`` and then passed to ``cm_prediction``.

    Returns
    -------
    tuple
        ``(classifier, Accuracy, report, X_test, y_test, cm)`` exactly as
        produced by ``cm_prediction``.
    """
    from sklearn.svm import SVC

    # ``fit`` returns the estimator itself, so construction and training chain.
    model = SVC(kernel='linear', random_state=0).fit(X_train, y_train)
    return cm_prediction(model, X_test, y_test)

In [7]:
def svm_NL(X_train, y_train, X_test, y_test):
    """Fit an RBF-kernel (non-linear) SVC and score it on the test partition.

    The model is created with ``kernel='rbf'`` and ``random_state=0``,
    trained on ``(X_train, y_train)`` and then passed to ``cm_prediction``.

    Returns
    -------
    tuple
        ``(classifier, Accuracy, report, X_test, y_test, cm)`` exactly as
        produced by ``cm_prediction``.
    """
    from sklearn.svm import SVC

    # ``fit`` returns the estimator itself, so construction and training chain.
    model = SVC(kernel='rbf', random_state=0).fit(X_train, y_train)
    return cm_prediction(model, X_test, y_test)

In [8]:
def Navie(X_train, y_train, X_test, y_test):
    """Fit a Gaussian naive-Bayes model and score it on the test partition.

    A default ``GaussianNB`` is trained on ``(X_train, y_train)`` and then
    passed to ``cm_prediction``.

    Returns
    -------
    tuple
        ``(classifier, Accuracy, report, X_test, y_test, cm)`` exactly as
        produced by ``cm_prediction``.
    """
    from sklearn.naive_bayes import GaussianNB

    # ``fit`` returns the estimator itself, so construction and training chain.
    model = GaussianNB().fit(X_train, y_train)
    return cm_prediction(model, X_test, y_test)

In [9]:
def knn(X_train, y_train, X_test, y_test):
    """Fit a k-nearest-neighbours classifier and score it on the test partition.

    The model uses 5 neighbours with the Minkowski metric at ``p=2``, is
    trained on ``(X_train, y_train)`` and then passed to ``cm_prediction``.

    Returns
    -------
    tuple
        ``(classifier, Accuracy, report, X_test, y_test, cm)`` exactly as
        produced by ``cm_prediction``.
    """
    from sklearn.neighbors import KNeighborsClassifier

    model = KNeighborsClassifier(n_neighbors=5, metric='minkowski', p=2)
    model.fit(X_train, y_train)
    return cm_prediction(model, X_test, y_test)

In [10]:
def Decision(X_train, y_train, X_test, y_test):
    """Fit a decision-tree classifier and score it on the test partition.

    The tree is created with ``criterion='entropy'`` and ``random_state=0``,
    trained on ``(X_train, y_train)`` and then passed to ``cm_prediction``.

    Returns
    -------
    tuple
        ``(classifier, Accuracy, report, X_test, y_test, cm)`` exactly as
        produced by ``cm_prediction``.
    """
    from sklearn.tree import DecisionTreeClassifier

    model = DecisionTreeClassifier(criterion='entropy', random_state=0)
    model.fit(X_train, y_train)
    return cm_prediction(model, X_test, y_test)

In [11]:
def random(X_train, y_train, X_test, y_test):
    """Fit a random-forest classifier and score it on the test partition.

    The forest is created with 10 trees, ``criterion='entropy'`` and
    ``random_state=0``, trained on ``(X_train, y_train)`` and then passed to
    ``cm_prediction``.

    Returns
    -------
    tuple
        ``(classifier, Accuracy, report, X_test, y_test, cm)`` exactly as
        produced by ``cm_prediction``.
    """
    from sklearn.ensemble import RandomForestClassifier

    model = RandomForestClassifier(
        n_estimators=10,
        criterion='entropy',
        random_state=0,
    )
    model.fit(X_train, y_train)
    return cm_prediction(model, X_test, y_test)

In [12]:
def pca_Classification(n_list, acclog, accsvml, accsvmnl, accknn, accnav, accdes, accrf):
    """Assemble per-classifier accuracies into one table, one row per PCA setting.

    Parameters
    ----------
    n_list : sequence
        Component counts that were evaluated; each becomes a row labelled
        ``'PCA n=<n>'``.
    acclog, accsvml, accsvmnl, accknn, accnav, accdes, accrf : sequence of numbers
        Accuracies for, respectively, the 'Logistic', 'SVMl', 'SVMnl',
        'KNN', 'Navie', 'Decision' and 'Random' columns. Entry ``i`` of each
        sequence belongs to ``n_list[i]``; entries beyond ``len(n_list)``
        are ignored.

    Returns
    -------
    pandas.DataFrame
        Float-valued table indexed by the ``'PCA n=<n>'`` labels.

    Raises
    ------
    IndexError
        If any accuracy sequence is shorter than ``n_list``.
    """
    row_labels = [f'PCA n={n}' for n in n_list]
    scores_by_model = {
        'Logistic': acclog,
        'SVMl': accsvml,
        'SVMnl': accsvmnl,
        'KNN': accknn,
        'Navie': accnav,
        'Decision': accdes,
        'Random': accrf,
    }

    # Build every column up front and create the frame in one step.
    # Cell-by-cell chained assignment (frame[col][row] = value) writes to a
    # temporary under pandas Copy-on-Write and would leave the table empty.
    n_rows = len(row_labels)
    data = {
        model: [float(scores[row]) for row in range(n_rows)]
        for model, scores in scores_by_model.items()
    }
    return pd.DataFrame(data, index=row_labels, columns=list(scores_by_model))

In [13]:
# Read the raw table from CKD.csv in the working directory; index_col=None
# means no column of the file is used as the row index.
dataset1 = pd.read_csv(filepath_or_buffer="CKD.csv", index_col=None)

In [14]:
# One-hot encode the raw table in a single step. drop_first=True omits the
# first level of every encoded column, so a two-level column becomes one
# indicator column. dataset1 itself is left untouched.
df2 = pd.get_dummies(dataset1, drop_first=True)

In [15]:
# Target: the 'classification_yes' indicator column of the encoded table.
dep_Y = df2['classification_yes']
# Features: every other encoded column.
indep_X = df2.drop(columns=['classification_yes'])

In [16]:
# Component counts to evaluate, in the order the result rows will appear.
n_list = [3, 4, 5]
pca_results = []

# One independent accuracy list per classifier; each receives one entry
# for every value in n_list.
acclog, accsvml, accsvmnl, accknn, accnav, accdes, accrf = ([] for _ in range(7))

In [17]:
from sklearn.decomposition import KernelPCA

# split_scalar uses a fixed random_state, so the partition and the scaling
# are identical for every n. Compute them once instead of once per
# iteration; the projections below are written to separate *_kpca names
# and never overwrite these arrays.
X_train, X_test, y_train, y_test = split_scalar(indep_X, dep_Y)

# Each classifier helper paired with the list that collects its accuracy.
# Every helper takes (X_train, y_train, X_test, y_test) and returns a tuple
# whose second element is the accuracy.
classifier_runs = [
    (logistic, acclog),      # Logistic Regression
    (svm_linear, accsvml),   # SVM, linear kernel
    (svm_NL, accsvmnl),      # SVM, RBF kernel
    (knn, accknn),           # K-nearest neighbours
    (Navie, accnav),         # Gaussian naive Bayes
    (Decision, accdes),      # Decision tree
    (random, accrf),         # Random forest
]

for n in n_list:
    print(f"--- Running PCA classification for n_components={n} ---")

    # No kernel is passed, so KernelPCA runs with its library default.
    kpca = KernelPCA(n_components=n)
    X_train_kpca = kpca.fit_transform(X_train)
    X_test_kpca = kpca.transform(X_test)

    for fit_and_score, accuracies in classifier_runs:
        # Index the result instead of unpacking into `_`, which would
        # clobber IPython's last-output variable.
        Accuracy = fit_and_score(X_train_kpca, y_train, X_test_kpca, y_test)[1]
        accuracies.append(Accuracy)

--- Running PCA classification for n_components=3 ---


  old_joblib = LooseVersion(joblib_version) < LooseVersion('0.12')
  old_joblib = LooseVersion(joblib_version) < LooseVersion('0.12')
  if _joblib.__version__ >= LooseVersion('0.12'):
  if _joblib.__version__ >= LooseVersion('0.12'):


--- Running PCA classification for n_components=4 ---
--- Running PCA classification for n_components=5 ---


  old_joblib = LooseVersion(joblib_version) < LooseVersion('0.12')
  old_joblib = LooseVersion(joblib_version) < LooseVersion('0.12')
  if _joblib.__version__ >= LooseVersion('0.12'):
  if _joblib.__version__ >= LooseVersion('0.12'):
  old_joblib = LooseVersion(joblib_version) < LooseVersion('0.12')
  old_joblib = LooseVersion(joblib_version) < LooseVersion('0.12')
  if _joblib.__version__ >= LooseVersion('0.12'):
  if _joblib.__version__ >= LooseVersion('0.12'):


In [18]:
# Collect the per-classifier accuracies into one table (one row per entry
# in n_list) and print it under a heading.
result = pca_Classification(
    n_list,
    acclog,
    accsvml,
    accsvmnl,
    accknn,
    accnav,
    accdes,
    accrf,
)
print("\nPCA Classification Results Matrix:", result, sep="\n")


PCA Classification Results Matrix:
        Logistic  SVMl SVMnl   KNN Navie Decision Random
PCA n=3     0.97     1  0.99  0.99  0.99     0.99   0.99
PCA n=4     0.98  0.99     1     1  0.99     0.99      1
PCA n=5     0.98  0.99     1  0.99     1     0.99      1
