In [22]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [23]:
# Train/test split followed by feature standardization
def split_scalar(indep_X, dep_Y):
    """Hold out 25% of the samples and standardize the feature columns.

    The split uses ``random_state=0`` so it is repeatable. The scaler is
    fitted on the training partition only and then applied to both
    partitions, so test-set statistics never influence the scaling.

    Args:
        indep_X: Feature matrix (independent variables).
        dep_Y: Target labels (dependent variable).

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)`` where both feature
        arrays have been standardized.
    """
    features_train, features_test, y_train, y_test = train_test_split(
        indep_X, dep_Y, test_size=0.25, random_state=0
    )
    scaler = StandardScaler().fit(features_train)
    return (
        scaler.transform(features_train),
        scaler.transform(features_test),
        y_train,
        y_test,
    )

In [7]:
# Disabled LDA helper, kept for reference only: the definition below is wrapped
# in a triple-quoted string, so `apply_lda` is never actually defined.
# `split_lda` carries out the same LDA fit/transform steps.
'''def apply_lda(X_train, y_train, X_test, n_components=2):
    lda = LDA(n_components=n_components)
    X_train_lda = lda.fit_transform(X_train, y_train)
    X_test_lda = lda.transform(X_test)
    return X_train_lda, X_test_lda'''

In [30]:
# Train/test split, standardization, then LDA projection
def split_lda(indep_X, dep_Y, n_components=2):
    """Split the data, standardize it, and project it onto LDA components.

    Steps, in order:
      1. Hold out 25% of the samples as a test set (``random_state=0``).
      2. Fit a ``StandardScaler`` on the training features and apply it to
         both partitions.
      3. Fit LDA on the scaled training features and their labels, then
         project both partitions onto ``n_components`` discriminant axes.

    Both the scaler and the LDA model are fitted on training data only.

    Args:
        indep_X: Feature matrix (independent variables).
        dep_Y: Class labels (dependent variable).
        n_components: Number of discriminant axes to keep. scikit-learn
            requires this to be at most ``min(n_classes - 1, n_features)``.

    Returns:
        Tuple ``(X_train_lda, X_test_lda, y_train, y_test)``.
    """
    raw_train, raw_test, y_train, y_test = train_test_split(
        indep_X, dep_Y, test_size=0.25, random_state=0
    )

    # Standardize using statistics learned from the training partition.
    scaler = StandardScaler().fit(raw_train)
    scaled_train = scaler.transform(raw_train)
    scaled_test = scaler.transform(raw_test)

    # LDA is supervised, so fitting needs the training labels as well.
    projector = LDA(n_components=n_components).fit(scaled_train, y_train)
    return (
        projector.transform(scaled_train),
        projector.transform(scaled_test),
        y_train,
        y_test,
    )

In [25]:
# Evaluate an already-fitted classifier on the held-out data
def cm_prediction(classifier, X_test, y_test):
    """Score a fitted classifier against the test split.

    Args:
        classifier: Fitted estimator exposing ``predict``.
        X_test: Test features passed to ``classifier.predict``.
        y_test: True labels for ``X_test``.

    Returns:
        Tuple ``(classifier, accuracy, report, confusion)`` where
        ``accuracy`` comes from ``accuracy_score``, ``report`` from
        ``classification_report`` and ``confusion`` from
        ``confusion_matrix``, each computed from ``y_test`` and the
        predictions.
    """
    predicted = classifier.predict(X_test)
    confusion = confusion_matrix(y_test, predicted)
    accuracy = accuracy_score(y_test, predicted)
    summary = classification_report(y_test, predicted)
    return classifier, accuracy, summary, confusion

In [26]:
# Classification models
def logistic(X_train, y_train, X_test, y_test):
    """Fit logistic regression on the training split and evaluate it.

    The estimator is created with ``random_state=0`` and fitted on
    ``(X_train, y_train)``.

    Returns:
        Whatever ``cm_prediction`` returns for the fitted model on
        ``(X_test, y_test)``: the classifier, accuracy, classification
        report and confusion matrix.
    """
    model = LogisticRegression(random_state=0).fit(X_train, y_train)
    return cm_prediction(model, X_test, y_test)

def svm_linear(X_train, y_train, X_test, y_test):
    """Fit a linear-kernel SVC on the training split and evaluate it.

    The estimator is created with ``kernel='linear'`` and
    ``random_state=0``.

    Returns:
        The result of ``cm_prediction`` for the fitted model on
        ``(X_test, y_test)``.
    """
    model = SVC(kernel='linear', random_state=0).fit(X_train, y_train)
    return cm_prediction(model, X_test, y_test)

def svm_nl(X_train, y_train, X_test, y_test):
    """Fit a non-linear (RBF-kernel) SVC on the training split and evaluate it.

    The estimator is created with ``kernel='rbf'`` and ``random_state=0``.

    Returns:
        The result of ``cm_prediction`` for the fitted model on
        ``(X_test, y_test)``.
    """
    model = SVC(kernel='rbf', random_state=0).fit(X_train, y_train)
    return cm_prediction(model, X_test, y_test)

def navie(X_train, y_train, X_test, y_test):
    """Fit Gaussian naive Bayes on the training split and evaluate it.

    The function name is a misspelling of "naive"; it is kept as-is because
    callers refer to it by this name.

    Returns:
        The result of ``cm_prediction`` for the fitted model on
        ``(X_test, y_test)``.
    """
    model = GaussianNB().fit(X_train, y_train)
    return cm_prediction(model, X_test, y_test)

def knn(X_train, y_train, X_test, y_test):
    """Fit a 5-nearest-neighbours classifier and evaluate it.

    The estimator uses the Minkowski metric with ``p=2``, i.e. Euclidean
    distance.

    Returns:
        The result of ``cm_prediction`` for the fitted model on
        ``(X_test, y_test)``.
    """
    model = KNeighborsClassifier(n_neighbors=5, metric='minkowski', p=2).fit(X_train, y_train)
    return cm_prediction(model, X_test, y_test)

def decision(X_train, y_train, X_test, y_test):
    """Fit an entropy-criterion decision tree and evaluate it.

    The estimator is created with ``criterion='entropy'`` and
    ``random_state=0``.

    Returns:
        The result of ``cm_prediction`` for the fitted model on
        ``(X_test, y_test)``.
    """
    model = DecisionTreeClassifier(criterion='entropy', random_state=0).fit(X_train, y_train)
    return cm_prediction(model, X_test, y_test)

def random(X_train, y_train, X_test, y_test):
    """Fit a 10-tree, entropy-criterion random forest and evaluate it.

    The estimator is created with ``n_estimators=10``,
    ``criterion='entropy'`` and ``random_state=0``.

    NOTE: this function's name would shadow the standard-library ``random``
    module if that module were imported into the same namespace; the name is
    kept because callers refer to it.

    Returns:
        The result of ``cm_prediction`` for the fitted model on
        ``(X_test, y_test)``.
    """
    model = RandomForestClassifier(
        n_estimators=10, criterion='entropy', random_state=0
    ).fit(X_train, y_train)
    return cm_prediction(model, X_test, y_test)

In [27]:
# Collect the per-model accuracies for one LDA setting into a one-row table
def lda_classification(acclog, accsvml, accsvmnl, accknn, accnav, accdes, accrf, n_components):
    """Build a one-row DataFrame of accuracies for a given LDA component count.

    Args:
        acclog: Accuracy value(s) for Logistic Regression.
        accsvml: Accuracy value(s) for the linear SVM.
        accsvmnl: Accuracy value(s) for the non-linear SVM.
        accknn: Accuracy value(s) for KNN.
        accnav: Accuracy value(s) for Naive Bayes.
        accdes: Accuracy value(s) for the Decision Tree.
        accrf: Accuracy value(s) for the Random Forest.
        n_components: Number of LDA components used; only used to build the
            row label ``LDA-{n_components}``.

    Each accuracy argument fills a single row, so pass one-element lists
    (as the evaluation loop does).

    Returns:
        DataFrame with index ``['LDA-{n_components}']`` and one column per
        classifier, in the order Logistic, SVMl, SVMnl, KNN, Navie, Decision,
        Random.
    """
    scores_by_model = {
        'Logistic': acclog,
        'SVMl': accsvml,
        'SVMnl': accsvmnl,
        'KNN': accknn,
        'Navie': accnav,
        'Decision': accdes,
        'Random': accrf,
    }
    return pd.DataFrame(scores_by_model, index=[f'LDA-{n_components}'])

In [28]:
# Load the Wine dataset: the first 13 columns are used as features and the
# 14th column (position 13) as the class label.
dataset = pd.read_csv("Wine.csv")
indep_X, dep_Y = dataset.iloc[:, :13].values, dataset.iloc[:, 13].values

In [12]:
# Sanity check: show the feature matrix extracted from the dataset.
print(indep_X)

[[1.423e+01 1.710e+00 2.430e+00 ... 1.040e+00 3.920e+00 1.065e+03]
 [1.320e+01 1.780e+00 2.140e+00 ... 1.050e+00 3.400e+00 1.050e+03]
 [1.316e+01 2.360e+00 2.670e+00 ... 1.030e+00 3.170e+00 1.185e+03]
 ...
 [1.327e+01 4.280e+00 2.260e+00 ... 5.900e-01 1.560e+00 8.350e+02]
 [1.317e+01 2.590e+00 2.370e+00 ... 6.000e-01 1.620e+00 8.400e+02]
 [1.413e+01 4.100e+00 2.740e+00 ... 6.100e-01 1.600e+00 5.600e+02]]


In [13]:
# Sanity check: show the class labels extracted from the dataset.
print(dep_Y)

[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3]


In [29]:
results = []

# Model runners, listed in the order their accuracies are handed to
# lda_classification (Logistic, SVMl, SVMnl, KNN, Navie, Decision, Random).
model_runners = [logistic, svm_linear, svm_nl, knn, navie, decision, random]

# Evaluate every model with 1 and 2 LDA components. range(1, 3) stops at 2,
# which is the most LDA can produce for the three wine classes
# (n_components <= n_classes - 1).
for n_components in range(1, 3):
    X_train_lda, X_test_lda, y_train, y_test = split_lda(indep_X, dep_Y, n_components=n_components)

    # Fit and score each model on the same LDA-projected split; every model
    # contributes a single-element accuracy list (one row in the result table).
    accuracy_lists = []
    for run_model in model_runners:
        classifier, Accuracy, report, cm = run_model(X_train_lda, y_train, X_test_lda, y_test)
        accuracy_lists.append([Accuracy])

    # Keep the original per-model variable names so any later cell that
    # inspects them keeps working.
    acclog, accsvml, accsvmnl, accknn, accnav, accdes, accrf = accuracy_lists

    # Storing the results for each n_components
    result = lda_classification(acclog, accsvml, accsvmnl, accknn, accnav, accdes, accrf, n_components)
    results.append(result)

# Combine the one-row tables into a single DataFrame (one row per LDA setting)
final_results = pd.concat(results)
print("LDA Results for different components:\n", final_results)


LDA Results for different components:
        Logistic      SVMl     SVMnl       KNN     Navie  Decision    Random
LDA-1  0.866667  0.866667  0.844444  0.866667  0.866667  0.933333  0.911111
LDA-2  1.000000  0.977778  1.000000  0.977778  0.977778  0.977778  0.977778
