<a href="https://colab.research.google.com/github/Ghiles1010/checkpoints/blob/main/Checkpoint_5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [14]:
import pandas as pd

In [15]:
# header=None makes pandas treat the first line of the file as data rather than
# column names; the descriptive labels are attached right after reading.
df = pd.read_csv("iris.data.txt", header=None).set_axis(
    ["sep_len", "sep_wid", "pet_len", "pet_wid", "category"], axis=1
)

In [16]:
# Preview the first rows to check that the column labels were applied.
df.head()

Unnamed: 0,sep_len,sep_wid,pet_len,pet_wid,category
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


# Data preprocessing

Encoding categories

In [17]:
from sklearn.preprocessing import LabelEncoder

# Swap the textual class names in `category` for integer codes.
# The fitted encoder stays available as `le`.
le = LabelEncoder()
le.fit(df["category"])
df["category"] = le.transform(df["category"])

In [18]:
# Preview again: `category` should now hold integer codes instead of class names.
df.head()

Unnamed: 0,sep_len,sep_wid,pet_len,pet_wid,category
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


Standardizing the remaining features (z-score: subtract the column mean, divide by the column standard deviation)

In [19]:
# Every column except the target is a numeric feature.
features = df.columns.difference(["category"])

# z-score each feature column: subtract its mean, then divide by its standard
# deviation (pandas' default `std()`).
# NOTE(review): the statistics are computed on the full data set, before the
# train/test split further down, so the test rows influence the scaling.
for name in features:
    values = df[name]
    df[name] = (values - values.mean()) / values.std()

In [20]:
# Preview again: the four feature columns should now hold standardized values.
df.head()

Unnamed: 0,sep_len,sep_wid,pet_len,pet_wid,category
0,-0.897674,1.028611,-1.336794,-1.308593,0
1,-1.1392,-0.12454,-1.336794,-1.308593,0
2,-1.380727,0.33672,-1.39347,-1.308593,0
3,-1.50149,0.10609,-1.280118,-1.308593,0
4,-1.018437,1.259242,-1.336794,-1.308593,0


In [21]:
from sklearn.model_selection import train_test_split

# Separate the feature matrix (x) from the target column (y).
x = df.drop(columns=["category"])
y = df["category"]

# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split identical from run to run.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=0
)

# KNN class

In [22]:
class Knn:
  """k-nearest-neighbours classifier operating on pandas objects.

  Usage: build with the number of neighbours ``k`` and a distance metric,
  call ``fit`` with the training data, then ``predict`` on new rows.

  Supported metrics are "euclidian" (historical spelling, kept as the
  default for backward compatibility), its alias "euclidean", and
  "manhattan".
  """

  def __init__(self, k, distance_metric = "euclidian"):
    """Store the hyper-parameters after validating them.

    Raises:
        ValueError: if ``k`` is not a positive whole number or the metric
            is not one of the supported names.
    """
    self.__verifications(k, distance_metric)
    self.k = k
    self.distance_metric = distance_metric


  def __verifications(self, k, distance_metric):
    """ Validate the constructor arguments, raising ValueError on bad input """
    if int(k) != k or k < 1:
        raise ValueError("K must be a positive integer")
    if distance_metric not in ["euclidian", "euclidean", "manhattan"]:
        raise ValueError("Distance metric is invalid")


  def fit(self, x_train, y_train):
    """Remember the training data (no computation happens here).

    Args:
        x_train: DataFrame with one row per training instance.
        y_train: Series with the label of each training instance, in the
            same order as ``x_train``.

    Raises:
        ValueError: if the two inputs do not have the same number of rows.
    """
    if len(x_train) != len(y_train):
        raise ValueError("x_train and y_train must have the same length")
    self.x_train = x_train
    self.y_train = y_train


  def predict(self, X):
    """Predict a label for every row of ``X``.

    Args:
        X: DataFrame with the same feature columns as the training data.

    Returns:
        A list with one predicted label per row of ``X``.

    Raises:
        ValueError: if ``k`` exceeds the number of training instances.
    """
    # k is bounded by the size of the training set, not by how many rows
    # are being predicted: a single row can be classified with any valid k.
    if self.k > len(self.x_train):
        raise ValueError("K is greater than number of instances")

    predictions = []

    for i in range(len(X)):
        neighbors = self.__getNeighbors(self.x_train, self.y_train, X.iloc[i], self.k)
        predictions.append(self.__getResponse(neighbors))

    return predictions


  def __Distance(self, instance1, instance2):
    """Distance between two feature vectors under the configured metric.

    Both vectors are used in full; they must contain features only
    (no label) and have the same length.
    """
    from scipy.spatial import distance

    if self.distance_metric in ("euclidian", "euclidean"):
        return distance.euclidean(instance1, instance2)
    elif self.distance_metric == "manhattan":
        return distance.cityblock(instance1, instance2)
    else :
        raise ValueError("Invalid distance metric")


  def __getNeighbors(self, x_train, y_train, inst, k):
    """Return the ``k`` training instances closest to ``inst``.

    Returns:
        A list of ``(distance, label)`` pairs ordered from nearest to
        farthest.
    """
    # A list of pairs (rather than a dict keyed by distance) keeps training
    # rows that happen to lie at exactly the same distance as separate votes.
    scored = [
        (self.__Distance(inst, x_train.iloc[idx]), y_train.iloc[idx])
        for idx in range(len(x_train))
    ]

    # list.sort is stable: among equal distances the earlier training row
    # comes first, which makes the selection deterministic.
    scored.sort(key = lambda pair: pair[0])

    return scored[:int(k)]


  def __getResponse(self, neighbors):
    """Majority vote over ``(distance, label)`` pairs.

    When several labels share the highest count, the one whose first
    occurrence is nearest wins (dict insertion order follows the
    nearest-first order of ``neighbors``).
    """
    classVotes = {}

    for _, vote in neighbors:
        classVotes[vote] = classVotes.get(vote, 0) + 1

    return max(classVotes, key = classVotes.get)


  def getAccuracy(self, predictions, y_test):
    """Percentage (0-100) of predictions equal to the true labels.

    Args:
        predictions: sequence of predicted labels.
        y_test: sequence or Series of true labels, in the same order.

    Raises:
        ValueError: if ``y_test`` is empty or the lengths differ.
    """
    total = len(y_test)
    if total == 0:
        raise ValueError("y_test is empty")
    if len(predictions) != total:
        raise ValueError("predictions and y_test must have the same length")

    # Iterating a Series yields its values in positional order, so this
    # works for both pandas objects and plain sequences.
    correct = sum(1 for predicted, actual in zip(predictions, y_test) if predicted == actual)
    return (correct/float(total)) * 100.0

    


# Main

In [23]:
# Start below any reachable accuracy so the first k tried always becomes the
# current best; this guarantees best_k is bound even if every accuracy is 0.
max_acc = -1.0
best_k = None

# Only odd values of k are tried (1, 3, 5), which makes tied votes less likely.
for k in range(1, 6, 2):

    print(k)
    k_c = Knn(k = k, distance_metric = "manhattan")
    k_c.fit(x_train, y_train)
    predictions = k_c.predict(x_test)
    acc = k_c.getAccuracy(predictions, y_test)

    # Strict comparison: on equal accuracy the smaller k is kept.
    if acc > max_acc:
        max_acc = acc
        best_k = k


print("Best result : ")
print("\t k =", best_k)
print("\t accuracy =", max_acc)

1
3
5
Best result : 
	 k = 5
	 accuracy = 100.0
