In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings(action='ignore')

In [2]:
Name = ['Ajay', 'Mark', 'Sara', 'Zaira', 'Sachin', 'Rahul', 'Pooja', 'Smith', 'Laxmi', 'Michael']
Age = [32, 40, 16, 34, 55, 40, 21, 15, 55, 15]
Gender = [0, 0, 1, 1, 0, 0, 1, 0, 1, 0]
Sport = ['F', 'N', 'C', 'C', 'N', 'C', 'F', 'C', 'F', 'F']

In [3]:
df = pd.DataFrame({'Name': Name,
                   'Age': Age,
                   'Gender': Gender,
                   'Sport': Sport})
df

Unnamed: 0,Name,Age,Gender,Sport
0,Ajay,32,0,F
1,Mark,40,0,N
2,Sara,16,1,C
3,Zaira,34,1,C
4,Sachin,55,0,N
5,Rahul,40,0,C
6,Pooja,21,1,F
7,Smith,15,0,C
8,Laxmi,55,1,F
9,Michael,15,0,F


In [4]:
Age = 5
Gender = 1

In [5]:
df['Distance'] = np.sqrt((df['Age'] - 5)**2 + (df['Gender'] - 1)**2)
df

Unnamed: 0,Name,Age,Gender,Sport,Distance
0,Ajay,32,0,F,27.018512
1,Mark,40,0,N,35.014283
2,Sara,16,1,C,11.0
3,Zaira,34,1,C,29.0
4,Sachin,55,0,N,50.009999
5,Rahul,40,0,C,35.014283
6,Pooja,21,1,F,16.0
7,Smith,15,0,C,10.049876
8,Laxmi,55,1,F,50.0
9,Michael,15,0,F,10.049876


In [6]:
df['Weight'] = 1/(df['Distance'] * df['Distance'])
df

Unnamed: 0,Name,Age,Gender,Sport,Distance,Weight
0,Ajay,32,0,F,27.018512,0.00137
1,Mark,40,0,N,35.014283,0.000816
2,Sara,16,1,C,11.0,0.008264
3,Zaira,34,1,C,29.0,0.001189
4,Sachin,55,0,N,50.009999,0.0004
5,Rahul,40,0,C,35.014283,0.000816
6,Pooja,21,1,F,16.0,0.003906
7,Smith,15,0,C,10.049876,0.009901
8,Laxmi,55,1,F,50.0,0.0004
9,Michael,15,0,F,10.049876,0.009901


In [7]:
df.sort_values(by=['Weight'], ascending=False)

Unnamed: 0,Name,Age,Gender,Sport,Distance,Weight
7,Smith,15,0,C,10.049876,0.009901
9,Michael,15,0,F,10.049876,0.009901
2,Sara,16,1,C,11.0,0.008264
6,Pooja,21,1,F,16.0,0.003906
0,Ajay,32,0,F,27.018512,0.00137
3,Zaira,34,1,C,29.0,0.001189
1,Mark,40,0,N,35.014283,0.000816
5,Rahul,40,0,C,35.014283,0.000816
8,Laxmi,55,1,F,50.0,0.0004
4,Sachin,55,0,N,50.009999,0.0004


### KNN Algorithm

1. Calculate the Euclidean distance between test point and all training points.
2. Sort by distance (ascending order).
3. Pick the top K entries.
4. Output the label with maximum frequency.

In [7]:
def knn(data, label_name, k=1):
    """
    Accepts dataframe, columns to use to calculate distance, neighbors.
    """
    assert k%2 == 1, 'k should be an odd value.'
    
    # list of columns to use for knn algorithm
    print('Note: While entering categorical column names, please ensure they are converted into numeric.')
    columns = input('Enter appropriate column names separated by single spaces.\n').split(' ')
    
    test_data = {}
    
    #Accept the test data
    for c in columns:
        test_data[c] = float(input(f'Enter {c}\n'))
    #print(test_data)
    
    # calculate the euclidean distance
    data['Distance'] = 0
    for i in range(len(data)):
        res = 0
        for key, value in test_data.items():
            res += np.square(data.iloc[i][key] - value)
        data.iloc[i, data.columns.get_loc('Distance')] = np.sqrt(res)
    
    # Extract the top k points
    top_k_points = data.sort_values(by='Distance')[:k]
    
    # Output the label with maximum frequency
    predicted_label = top_k_points[label_name].value_counts().sort_values(ascending=False).index[0]
    
    return predicted_label

In [9]:
print(f"Predicted Label: {knn(df, 'Sport', k=7)}")

Note: While entering categorical column names, please ensure they are converted into numeric.
Enter appropriate column names separated by single spaces.
Age Gender
Enter Age
15
Enter Gender
0
Predicted Label: C


## KNN on Iris dataset

### Import the dataset

In [91]:
iris = pd.read_csv('/Users/anike/ML/Datasets/Iris.csv')
iris.drop('Id', axis=1, inplace=True)
iris.head()

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


Shuffling the data.

In [64]:
iris = iris.sample(frac=1).reset_index(drop=True)

Creating train-test split

In [65]:
from sklearn.model_selection import train_test_split

np.random.seed(10)

X = iris.drop('Species', axis=1)
y = iris['Species']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

### KNN from scratch

In [92]:
class KNN:

    def __init__(self):
        pass

    def predict(self, data, test_data, label_name, neighbors=3):
        """
        data: Data to used for training or similarity purpose,
        test_data: data on which function will make predictions,
        label_name: which column is the target,
        neighbors: number of neighbors to use for predictions default=3
        """
        
        assert neighbors % 2 == 1, 'neighbors cannot be an even value.'
        
        predictions = []
        data2 = data.copy()
        for i in range(len(test_data)):
            data2['Distance'] = 0
            data2['Distance'] = self.get_distance(data, test_data.iloc[i])

            # Extract the top k points
            top_k_points = data2.sort_values(by='Distance')[:neighbors]

            # Output the label with maximum frequency
            predictions.append(top_k_points[label_name].value_counts().sort_values(ascending=False).index[0])

        return predictions

    def get_distance(self, data, single_data_point):
        distance = []
        dic = dict(single_data_point)

        for i in range(len(data)):
            res = 0
            for key, value in dic.items():
                res += np.square(int(data.iloc[i][key]) - int(value))
            distance.append(np.sqrt(res))

        return distance

In [93]:
myknn = KNN()
predictions = myknn.predict(pd.concat([X_train, y_train], axis=1), X_test, 'Species', neighbors=1)

Comparing true labels with predicted labels

In [94]:
result = pd.DataFrame({'True': y_test,
                       'Predicted': predictions})
result['Incorrect'] = result['True'] == result['Predicted']
result.head()

Unnamed: 0,True,Predicted,Incorrect
87,Iris-setosa,Iris-setosa,True
111,Iris-setosa,Iris-setosa,True
10,Iris-virginica,Iris-virginica,True
91,Iris-setosa,Iris-setosa,True
49,Iris-setosa,Iris-setosa,True


#### Accuracy for `KNN` and `sklearn.neighbors.KNeighborsClassifier()`

In [95]:
result['Incorrect'].value_counts() / len(result) * 100

True     93.333333
False     6.666667
Name: Incorrect, dtype: float64

In [96]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=1)
knn.fit(X_train, y_train)
knn.score(X_test, y_test)

0.9333333333333333

## KNN on Titanic Dataset

### Import the data

In [52]:
titanic = pd.read_csv('Titanic_cleaned.csv')
titanic.drop('PassengerId', axis=1, inplace=True)
titanic.head()

Unnamed: 0,Survived,Age,SibSp,Parch,Fare,Family,IsAlone,Pclass_First,Pclass_Second,Pclass_Third,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S,Title_Master,Title_Miss,Title_Mr,Title_Mrs,Title_Other
0,0.0,22.0,1,0,7.25,2,0,0,0,1,0,1,0,0,1,0,0,1,0,0
1,1.0,38.0,1,0,71.2833,2,0,1,0,0,1,0,1,0,0,0,0,0,1,0
2,1.0,26.0,0,0,7.925,1,1,0,0,1,1,0,0,0,1,0,1,0,0,0
3,1.0,35.0,1,0,53.1,2,0,1,0,0,1,0,0,0,1,0,0,0,1,0
4,0.0,35.0,0,0,8.05,1,1,0,0,1,0,1,0,0,1,0,0,1,0,0


Creating train-test split

In [86]:
from sklearn.model_selection import train_test_split

np.random.seed(10)

X2 = titanic.drop('Survived', axis=1)
y2 = titanic['Survived']

X_train2, X_test2, y_train2, y_test2 = train_test_split(X2, y2, test_size=0.2)

In [87]:
%%time
myknn2 = KNN()
predictions2 = myknn2.predict(data=pd.concat([X_train2, y_train2], axis=1),
                            test_data=X_test2,
                            label_name='Survived',
                            neighbors=5)

Wall time: 9min 18s


Comparing predictions with true labels

In [88]:
result2 = pd.DataFrame({'True': y_test2,
                       'Predicted': predictions2})
result2['Incorrect'] = result2['True'] == result2['Predicted']
result2.head()

Unnamed: 0,True,Predicted,Incorrect
590,0.0,0.0,True
131,0.0,0.0,True
628,0.0,0.0,True
195,1.0,1.0,True
230,1.0,1.0,True


#### Accuracy for `KNN` and `sklearn.neighbors.KNeighborsClassifier()`

In [99]:
result2['Incorrect'].value_counts() / len(result2) * 100

True     73.184358
False    26.815642
Name: Incorrect, dtype: float64

In [100]:
knn2 = KNeighborsClassifier(n_neighbors=5)
knn2.fit(X_train2, y_train2)
knn2.score(X_test2, y_test2)

0.7486033519553073