# LAB 03 machine learning

In [13]:
import pandas as pd
import numpy as np

In [14]:
df = pd.read_csv("iris.data", header=None)
print(df.head())
print(df.shape)

     0    1    2    3            4
0  5.1  3.5  1.4  0.2  Iris-setosa
1  4.9  3.0  1.4  0.2  Iris-setosa
2  4.7  3.2  1.3  0.2  Iris-setosa
3  4.6  3.1  1.5  0.2  Iris-setosa
4  5.0  3.6  1.4  0.2  Iris-setosa
(150, 5)


In [15]:
random_data = df.sample(frac=0.2, random_state=42)
X_test = np.array(random_data.values[:,:4])
y_test = np.array(random_data.values[:,4])
random_data = df.sample(frac=0.8, random_state=42)
X_train = np.array(random_data.values[:,:4])
y_train = np.array(random_data.values[:,4])

print(X_train.size, X_test.size)
print(y_train.size, y_test.size)

480 120
120 30


In [16]:
class KNearestNeighbors:
    def __init__(self , k, distance_metric="euclidean"):
        self.k = k
        self.distance_metric = distance_metric

    def fit(self , X, y):
        """
        Store the 'prior knowledge 'of you model that will be used
        to predict new labels.
        :param X : input data points , ndarray , shape = (R,C).
        :param y : input labels , ndarray , shape = (R,).
        """
        self.X_train = X
        self.y_train = y

    def calculate_distance(self, p:np.array, q:np.array): 
        if self.distance_metric == "euclidean":
            return np.sqrt(np.sum((p-q)**2))
        elif self.distance_metric == "cosine":
            cosine = np.sum(p*q) / (np.sqrt(np.sum(p**2)) * np.sqrt(np.sum(q**2)))
            return 1 - np.abs(cosine)
        elif self.distance_metric == "manhattan":
            return np.sum(np.abs(p-q))
        else:
            raise TypeError

    def predict(self , X):
        """
        Run the KNN classification on X.
        :param X: input data points , ndarray , shape = (N,C).
        :return: labels : ndarray , shape = (N,).
        """
        N, C = X.shape
        predictions = []
        for i in range(N):
            distance = np.array([
                self.calculate_distance(X[i],row_train)
                for row_train in self.X_train])
            neighbors_idx = np.argsort(distance)[:self.k]
            neighbor_labels = self.y_train[neighbors_idx]
            unique_labels, counts = np.unique(neighbor_labels, return_counts=True)
            majority_label = unique_labels[np.argmax(counts)]
            predictions.append(majority_label)
        
        return np.array(predictions)


In [17]:
KNN = KNearestNeighbors(k=4, distance_metric="euclidean")
KNN.fit(X_train, y_train)
y_pred = KNN.predict(X_test)
tp = 0
for i in range(y_pred.size):
    if y_pred[i] == y_test[i]:
        tp = tp + 1

accuracy_percent = (tp / y_pred.size) * 100
print("Euclidean Accuracy:", accuracy_percent, "%")

Euclidean Accuracy: 100.0 %


In [18]:
KNN = KNearestNeighbors(k=4, distance_metric="cosine")
KNN.fit(X_train, y_train)
y_pred = KNN.predict(X_test)
tp = 0
for i in range(y_pred.size):
    if y_pred[i] == y_test[i]:
        tp = tp + 1

accuracy_percent = (tp / y_pred.size) * 100
print("Cosine Accuracy:", accuracy_percent, "%")

Cosine Accuracy: 96.66666666666667 %


In [19]:
KNN = KNearestNeighbors(k=4, distance_metric="manhattan")
KNN.fit(X_train, y_train)
y_pred = KNN.predict(X_test)
tp = 0
for i in range(y_pred.size):
    if y_pred[i] == y_test[i]:
        tp = tp + 1

accuracy_percent = (tp / y_pred.size) * 100
print("Manhattan Accuracy:", accuracy_percent, "%")

Manhattan Accuracy: 100.0 %


In [20]:
KNN = KNearestNeighbors(k=6, distance_metric="cosine")
KNN.fit(X_train, y_train)
y_pred = KNN.predict(X_test)
tp = 0
for i in range(y_pred.size):
    if y_pred[i] == y_test[i]:
        tp = tp + 1

accuracy_percent = (tp / y_pred.size) * 100
print("Cosine Accuracy:", accuracy_percent, "%")

Cosine Accuracy: 100.0 %


In [21]:
df = pd.read_csv("mnist_test.csv", header=None)
print(df.head())
print(df.shape)

random_data = df.sample(frac=0.2, random_state=42)
X_test = np.array(random_data.values[:,:4])
y_test = np.array(random_data.values[:,4])
random_data = df.sample(frac=0.8, random_state=42)
X_train = np.array(random_data.values[:,:4])
y_train = np.array(random_data.values[:,4])

print(X_train.size, X_test.size)
print(y_train.size, y_test.size)

   0    1    2    3    4    5    6    7    8    9    ...  775  776  777  778  \
0    7    0    0    0    0    0    0    0    0    0  ...    0    0    0    0   
1    2    0    0    0    0    0    0    0    0    0  ...    0    0    0    0   
2    1    0    0    0    0    0    0    0    0    0  ...    0    0    0    0   
3    0    0    0    0    0    0    0    0    0    0  ...    0    0    0    0   
4    4    0    0    0    0    0    0    0    0    0  ...    0    0    0    0   

   779  780  781  782  783  784  
0    0    0    0    0    0    0  
1    0    0    0    0    0    0  
2    0    0    0    0    0    0  
3    0    0    0    0    0    0  
4    0    0    0    0    0    0  

[5 rows x 785 columns]
(10000, 785)
32000 8000
8000 2000


In [22]:
KNN = KNearestNeighbors(k=4, distance_metric="euclidean")
KNN.fit(X_train, y_train)
y_pred = KNN.predict(X_test)
tp = 0
for i in range(y_pred.size):
    if y_pred[i] == y_test[i]:
        tp = tp + 1

accuracy_percent = (tp / y_pred.size) * 100
print("Euclidean Accuracy on different dataset:", accuracy_percent, "%")

Euclidean Accuracy on different dataset: 100.0 %


In [23]:
KNN = KNearestNeighbors(k=6, distance_metric="cosine")
KNN.fit(X_train, y_train)
y_pred = KNN.predict(X_test)
tp = 0
for i in range(y_pred.size):
    if y_pred[i] == y_test[i]:
        tp = tp + 1

accuracy_percent = (tp / y_pred.size) * 100
print("Cosine Accuracy on different dataset:", accuracy_percent, "%")

  cosine = np.sum(p*q) / (np.sqrt(np.sum(p**2)) * np.sqrt(np.sum(q**2)))


Cosine Accuracy on different dataset: 100.0 %
