In [42]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math
from scipy.stats import multivariate_normal

# <u>**Problem statement 1**</u>

### **Reading the data**

In [43]:
data_1=pd.read_csv('iris_train.csv')          # reading the train set
data_2=pd.read_csv('iris_test.csv')           # reading test set
true_classes_1=np.array(data_1['Species'])    # train set Classes
true_classes_2=np.array(data_2['Species'])    # test set Classes
data_1=data_1.drop(data_1.columns[0], axis=1)
data_2=data_2.drop(data_2.columns[0], axis=1)
att_data_1=data_1.drop(['Species'],axis=1)    # train feature data
att_data_2=data_2.drop(['Species'],axis=1)    # test Features data
columns_list=list(att_data_1.columns)         # features list

### **Applying PCA**

In [44]:
def mean_reduced_data(data,type=list):
    sum_list=sum(data)
    len_list=len(data)
    mean_list=sum_list/len_list
    list_1=[]
    for i in data:
        j=i-mean_list
        list_1.append(j)
    return list_1

def reduction(data):
# this is the data which has a mean value of 0 for all attributes
    msd=pd.DataFrame(columns=columns_list)
    for j in columns_list:                               # This loop will give the mean subtracted data
        reduced_feature=list(mean_reduced_data(data[j]))
        msd[j]=reduced_feature
    return msd

mean_sub_train_data=reduction(att_data_1)
mean_sub_test_data=reduction(att_data_2)

# transpose of mean subtrated data matrix
msd_train_transpose=mean_sub_train_data.transpose()  
msd_test_transpose=mean_sub_test_data.transpose()  

def correlation_matrix(data1,data2):        # data1 is mean subtracted data and data 2 is its transpoase

    # caluculating corelation matrix
    array_1=data1.to_numpy()
    array_2=data2.to_numpy()
    covariance_matrix= np.matmul(array_2, array_1)
    return covariance_matrix

train_corr_matrix=correlation_matrix(mean_sub_train_data,msd_train_transpose)
test_corr_matrix=correlation_matrix(mean_sub_test_data,msd_test_transpose)
 
def PCA(data1,data2):              # data1 is covariance matrix,data2 is mean subtracted data

    # eigen analysis
    eigenvalues, eigenvectors = np.linalg.eig(data1)      # a column is a eigen vector
    sel_vectors=eigenvectors[:,0:1].T

    # projection of data from corrected data
    array_3=data2.to_numpy()
    projected_data=np.dot(array_3,sel_vectors.T)

    # converting array to dataframe for better representaion
    reduced_data= pd.DataFrame(projected_data, columns=["PC1"])    # finale data
    return reduced_data,sel_vectors

PC_train,sel_vector_1=PCA(train_corr_matrix,mean_sub_train_data)    # train set (reduced data),selected eigenvectors
PC_test,sel_vector_2=PCA(test_corr_matrix,mean_sub_test_data)       # test set (reduced data),selected eigenvectors
PC_train['Class']=true_classes_1
PC_test['Class']=true_classes_2

### **Calculating gaussian parameters**

In [45]:
train_data = PC_train 
# Get unique classes
classes = train_data['Class'].unique()

# Dictionary to store class-wise parameters
gaussian_params = {}

for c in classes:    # this loop will store the mean and variance of each class from train set
    class_data = train_data[train_data['Class'] == c]['PC1']
    '''print(c)
    print(class_data)'''
    mean = np.mean(class_data)
    variance = np.var(class_data)
    gaussian_params[c] = {'mean': mean, 'variance': variance}

print(gaussian_params)


{'Iris-setosa': {'mean': np.float64(-2.614563160523108), 'variance': np.float64(0.05112070782562285)}, 'Iris-versicolor': {'mean': np.float64(0.5549893709957102), 'variance': np.float64(0.36819932401753164)}, 'Iris-virginica': {'mean': np.float64(2.0981528771820557), 'variance': np.float64(0.48466890188260164)}}


### **Applying baye's classifier**

In [46]:
prior_probabilities = {}
total_samples = len(train_data)

# calculating probability of class
for class_label in classes:
    prior_probabilities[class_label] = len(train_data[train_data['Class'] == class_label]) / total_samples 


# Gaussian likelihood function
def gaussian_likelihood(x, mean, variance):
    return (1 / math.sqrt(2 * math.pi * variance)) * math.exp(-((x - mean) ** 2) / (2 * variance))

# Classify each test sample
test_data = PC_test
predictions = []

for _, row in test_data.iterrows():     # _ is row index and row is data of each column in that row
    sample = row['PC1']
    #print(row)
    max_posterior = -float('inf')
    predicted_class = None

    for c, params in gaussian_params.items():    # c is key and params are value
        likelihood = gaussian_likelihood(sample, params['mean'], params['variance'])
        posterior = likelihood * prior_probabilities[c]
        if posterior > max_posterior:
            max_posterior = posterior
            predicted_class = c

    predictions.append(predicted_class)

test_data['Predicted_Class'] = predictions
print(test_data)


         PC1            Class  Predicted_Class
0   0.734116  Iris-versicolor  Iris-versicolor
1  -2.370212      Iris-setosa      Iris-setosa
2   3.611909   Iris-virginica   Iris-virginica
3   0.629719  Iris-versicolor  Iris-versicolor
4   1.148342  Iris-versicolor  Iris-versicolor
5  -2.586089      Iris-setosa      Iris-setosa
6  -0.357435  Iris-versicolor  Iris-versicolor
7   1.749157   Iris-virginica   Iris-virginica
8   0.751433  Iris-versicolor  Iris-versicolor
9  -0.051086  Iris-versicolor  Iris-versicolor
10  1.486336   Iris-virginica   Iris-virginica
11 -2.972558      Iris-setosa      Iris-setosa
12 -2.800284      Iris-setosa      Iris-setosa
13 -2.857652      Iris-setosa      Iris-setosa
14 -2.760069      Iris-setosa      Iris-setosa
15  0.918746  Iris-versicolor  Iris-versicolor
16  2.170605   Iris-virginica   Iris-virginica
17 -0.146838  Iris-versicolor  Iris-versicolor
18  0.453908  Iris-versicolor  Iris-versicolor
19  1.977330   Iris-virginica   Iris-virginica
20 -2.815783 

### **Confusion matrix**

In [47]:
def confusion_matrix(true_labels, predicted_labels, class_labels):
    # Initialize the confusion matrix with zeros
    matrix = np.zeros((len(class_labels), len(class_labels)), dtype=int)
    
    # Map class labels to indices for matrix positioning
    label_to_index = {label: i for i, label in enumerate(class_labels)}
    
    # Fill the confusion matrix
    for true_label, predicted_label in zip(true_labels, predicted_labels):
        true_index = label_to_index[true_label]
        predicted_index = label_to_index[predicted_label]
        matrix[true_index, predicted_index] += 1
    
    return matrix

def accuracy(true_labels, predicted_labels):

    correct_predictions = sum(t == p for t, p in zip(true_labels, predicted_labels))
    total_samples = len(true_labels)
    return (correct_predictions / total_samples) 


In [48]:
# Confusion matrix
confu_matrix = confusion_matrix(test_data['Class'], test_data['Predicted_Class'],classes)
confusion_daf = pd.DataFrame(confu_matrix, index=classes, columns=classes)
print(confusion_daf)
"""row are actual classes and columns are predicted classes"""
accuracy_score_1 = accuracy(test_data['Class'], test_data['Predicted_Class'])
print(f"Accuracy: {accuracy_score_1 * 100:.2f}%")

                 Iris-setosa  Iris-versicolor  Iris-virginica
Iris-setosa               10                0               0
Iris-versicolor            0                9               0
Iris-virginica             0                1              10
Accuracy: 96.67%


# <u>**Problem statement 2**</u>

In [49]:

#  Estimate the parameters (mean vector and covariance matrix) for each class
gaussian_params = {}  # Store mean and covariance for each class
prior_probabilities = {}

train_data=data_1
test_data=data_2
classes = train_data['Species'].unique()
total_samples = len(train_data)

for class_label in classes:
    class_data = train_data[train_data['Species'] == class_label].drop('Species', axis=1)  # Drop class column to keep features
    mean_vector = class_data.mean(axis=0).values  # Mean of each feature
    covariance_matrix = np.cov(class_data.T)  # Covariance matrix
    gaussian_params[class_label] = {'mean': mean_vector, 'covariance': covariance_matrix}
    
    # Calculate prior probabilities
    prior_probabilities[class_label] = len(class_data) / total_samples

# Classify each test sample based on likelihood and prior
predictions = []


for _, row in test_data.iterrows():
    sample = row.drop('Species').values  # Get the four-dimensional test sample (without the class label)
    max_posterior = -float('inf')
    predicted_class = None
    
    for class_label, params in gaussian_params.items():
        # Likelihood calculation using scipy.stats.multivariate_normal
        likelihood = multivariate_normal.pdf(sample, mean=params['mean'], cov=params['covariance'])
        
        # Multiply the likelihood by the prior probability
        posterior = likelihood * prior_probabilities[class_label]
        
        # Find the class with the highest posterior
        if posterior > max_posterior:
            max_posterior = posterior
            predicted_class = class_label
    
    # Store the predicted class
    predictions.append(predicted_class)

# Evaluate the model
true_labels = test_data['Species'].values
conf_matrix = confusion_matrix(true_labels, predictions,classes)
confusion_df = pd.DataFrame(conf_matrix, index=classes, columns=classes)
accuracy_score_2 = accuracy(true_labels, predictions)
# Print the confusion matrix and accuracy
print("Confusion Matrix:")
print(confusion_df)
print(f"Accuracy: {accuracy_score_2 * 100:.2f}%")


Confusion Matrix:
                 Iris-setosa  Iris-versicolor  Iris-virginica
Iris-setosa               10                0               0
Iris-versicolor            0                8               1
Iris-virginica             0                0              11
Accuracy: 96.67%


In [50]:
# Estimate the parameters (mean vector and covariance matrix) for each class
gaussian_params = {}  # Store mean and covariance for each class
prior_probabilities = {}

train_data = data_1
test_data = data_2
classes = train_data['Species'].unique()
total_samples = len(train_data)

for class_label in classes:
    class_data = train_data[train_data['Species'] == class_label].drop('Species', axis=1)

    class_data = class_data.apply(pd.to_numeric, errors='coerce')
    
    mean_vector = class_data.mean(axis=0).values  
    covariance_matrix = np.cov(class_data.T)  
    
    gaussian_params[class_label] = {
        'mean': np.array(mean_vector, dtype=np.float64), 
        'covariance': np.array(covariance_matrix, dtype=np.float64)  # Ensure covariance is a float array
    }
    
    prior_probabilities[class_label] = len(class_data) / total_samples

# Define the multivariate Gaussian likelihood function
def multivariate_gaussian_likelihood(x, mean, covariance):
    """ Calculate the likelihood of x under the multivariate Gaussian distribution. """
    n = mean.shape[0]
    diff = x - mean
    
    # Ensure the inputs are float64
    diff = np.array(diff, dtype=np.float64)
    
    # Calculate the exponent
    exponent = -0.5 * np.dot(diff.T, np.linalg.solve(covariance, diff))
    
    determinant = np.linalg.det(covariance)
    likelihood = (1 / np.sqrt((2 * np.pi) ** n * determinant)) * np.exp(exponent)
    return likelihood

# Classify each test sample based on likelihood and prior
predictions = []

for _, row in test_data.iterrows():
    sample = row.drop('Species').values  # Get the four-dimensional test sample (without the class label)
    
    # Ensure sample is a float array
    sample = np.array(sample, dtype=np.float64)
    
    max_posterior = -float('inf')
    predicted_class = None
    
    for class_label, params in gaussian_params.items():
        likelihood = multivariate_gaussian_likelihood(sample, params['mean'], params['covariance'])
        posterior = likelihood * prior_probabilities[class_label]
        
        if posterior > max_posterior:
            max_posterior = posterior
            predicted_class = class_label
    
    predictions.append(predicted_class)

true_labels = test_data['Species'].values

def confusion_matrix(true_labels, predicted_labels, classes):
    matrix = np.zeros((len(classes), len(classes)), dtype=int)
    
    for true, pred in zip(true_labels, predicted_labels):
        true_index = np.where(classes == true)[0][0]
        pred_index = np.where(classes == pred)[0][0]
        matrix[true_index][pred_index] += 1
        
    return matrix

def accuracy(true_labels, predicted_labels):
    correct_predictions = np.sum(true_labels == predicted_labels)
    total_predictions = len(true_labels)
    return correct_predictions / total_predictions

conf_matrix = confusion_matrix(true_labels, predictions, classes)
accuracy_score_2 = accuracy(true_labels, predictions)

confusion_df = pd.DataFrame(conf_matrix, index=classes, columns=classes)

print("Confusion Matrix:")
print(confusion_df)
print(f"Accuracy: {accuracy_score_2 * 100:.2f}%")


Confusion Matrix:
                 Iris-setosa  Iris-versicolor  Iris-virginica
Iris-setosa               10                0               0
Iris-versicolor            0                8               1
Iris-virginica             0                0              11
Accuracy: 96.67%


# <u>**Problem statement 3**</u>

In [51]:
accuracy_diff=math.sqrt(abs((accuracy_score_1-accuracy_score_2)**2)/2)
print(accuracy_diff)

0.0
