## Assignments

### 1. train test split from scratch

Create a function my_train_test_split() that takes inputs X, y and the fraction of data to use for training, and outputs a list or tuple containing the splits.

In [30]:
import numpy as np
import pandas as pd
import sklearn as sk
from sklearn import metrics

In [31]:
#Clue 1: Splitting the data sequentially for a given fraction

# Each row is one sample: the last column is the label, the rest are features.
data = np.array([[1, 2, 0], [3, 4, 1], [5, 6, 1], [7, 8, 0], [9, 10, 1], [11, 12, 0]])
print('data:')
print(data)

# Train part of the data: the first 80% of the rows (int() truncates).
split = int(0.8*data.shape[0])
X_train = data[:split, :-1]
y_train = data[:split, -1]

print('\nX_train:')
print(X_train)
print('\ny_train:')
print(y_train)

# The test part of the data: every row from `split` onwards.
X_test = data[split:, :-1]
y_test = data[split:, -1]

# This label previously read 'y_train:' although the value printed is X_test.
print('\nX_test:')
print(X_test)

print('\ny_test:')
print(y_test)

data:
[[ 1  2  0]
 [ 3  4  1]
 [ 5  6  1]
 [ 7  8  0]
 [ 9 10  1]
 [11 12  0]]

X_train:
[[1 2]
 [3 4]
 [5 6]
 [7 8]]

y_train:
[0 1 1 0]

y_train:
[[ 9 10]
 [11 12]]

y_test:
[1 0]


In [32]:
#Clue2: splitting data randomly
data = np.array([[1, 2, 0], [3, 4, 1], [5, 6, 1], [7, 8, 0]])
num_samples = len(data)

# Drawing every row index exactly once (no replacement) yields a random ordering.
ind = np.random.choice(num_samples, size=num_samples, replace=False)
print(ind)
print(type(ind))

# Number of rows that go to the training part (80%, truncated by int()).
split = int(num_samples * 0.8)
print(split)

# The first `split` shuffled indices select the training rows.
ind[:split]

[2 3 0 1]
<class 'numpy.ndarray'>
3


array([2, 3, 0])

In [33]:
## TODO: Your function definition goes here
def new_train_test_split(X, y, test_size=0.2, random_state=None):
    """Randomly split samples and their labels into train and test subsets.

    Parameters
    ----------
    X : array-like
        Samples; the first axis indexes the samples.
    y : array-like
        Labels; must have the same length as ``X``.
    test_size : float, default 0.2
        Fraction of samples placed in the test subset. The test count is
        ``int(test_size * n_samples)``, i.e. it is truncated, not rounded.
    random_state : int or None, default None
        Seed for a reproducible shuffle. When ``None`` the global NumPy
        random state is used.

    Returns
    -------
    X_train, X_test, y_train, y_test : numpy.ndarray

    Raises
    ------
    ValueError
        If ``X`` and ``y`` differ in length or ``test_size`` is outside [0, 1].
    """
    # Convert up front so lists and pandas objects can be row-indexed below.
    X = np.asarray(X)
    y = np.asarray(y)

    num_samples = X.shape[0]
    if y.shape[0] != num_samples:
        raise ValueError(
            f"X and y must have the same number of samples, got {num_samples} and {y.shape[0]}"
        )
    if not 0.0 <= test_size <= 1.0:
        raise ValueError(f"test_size must be between 0 and 1, got {test_size}")

    # A private RandomState seeded with `random_state` produces the same
    # shuffle as np.random.seed(random_state) would, but leaves the global
    # NumPy RNG untouched for the rest of the notebook.
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    num_test = int(test_size * num_samples)

    indices = rng.choice(num_samples, num_samples, replace=False)
    test_indices = indices[:num_test]
    train_indices = indices[num_test:]

    X_train, X_test = X[train_indices], X[test_indices]
    y_train, y_test = y[train_indices], y[test_indices]

    return X_train, X_test, y_train, y_test

In [34]:
## TODO: Your function invocation goes here
# Toy dataset: eight 2-feature samples with a binary label each.
X = np.array([[1, 2], [2, 3], [3, 4], [4, 5], [0, 1], [6, 7], [8, 9], [5, 6]])
y = np.array([0, 1, 0, 1, 0, 1, 1, 0])

# Fixed random_state so the split is the same on every run.
X_train, X_test, y_train, y_test = new_train_test_split(
    X, y, test_size=0.2, random_state=10
)

print("X_train:", X_train)
print(type(X_train))
print("X_test:", X_test)
print("y_train:", y_train)
print("y_test:", y_test)

X_train: [[4 5]
 [8 9]
 [5 6]
 [1 2]
 [0 1]
 [6 7]
 [2 3]]
<class 'numpy.ndarray'>
X_test: [[3 4]]
y_train: [1 1 0 0 0 1 1]
y_test: [0]


<a id='knn-scratch'></a>
<hr/>

### 2. kNN from scratch

In [35]:
# KNN class that allows setting the number of neighbours and weight=uniform or distance
class KNN:
    """k-nearest-neighbours classifier using Euclidean distance.

    Parameters
    ----------
    k : int, default 5
        Number of nearest training samples that vote on each prediction.
    weights : {'uniform', 'distance'}, default 'uniform'
        'uniform' gives every neighbour one vote; 'distance' weights each
        neighbour's vote by the inverse of its distance to the query point.
    """

    def __init__(self, k=5, weights='uniform'):
        self.n_neighbors = k
        self.weights = weights

    def fit(self, X, y):
        """Store the training samples ``X`` and their labels ``y``."""
        self.X_train = np.asarray(X)
        self.y_train = np.asarray(y)

    def predict(self, X):
        """Return the predicted label for every row of ``X``.

        Ties between labels are resolved in favour of the smallest label,
        for both weighting schemes.

        Raises
        ------
        ValueError
            If ``self.weights`` is neither 'uniform' nor 'distance'.
        """
        if self.weights not in ('uniform', 'distance'):
            raise ValueError("Invalid")

        y_pred = []
        for x in np.asarray(X):
            # Euclidean distance from x to every training sample.
            distance = np.linalg.norm(self.X_train - x, axis=1)

            k_indices = np.argsort(distance)[:self.n_neighbors]
            neighbor_labels = self.y_train[k_indices]

            if self.weights == 'uniform':
                votes = np.ones(len(k_indices))
            else:
                # The small constant avoids division by zero when a training
                # sample coincides with the query point.
                votes = 1 / (distance[k_indices] + 1e-6)

            # Sum each neighbour's own vote per label. The earlier dict-based
            # version kept only the last weight seen for a label and added
            # that single value once per occurrence, giving wrong totals.
            unique_labels, inverse = np.unique(neighbor_labels, return_inverse=True)
            totals = np.bincount(inverse.ravel(), weights=votes)
            y_pred.append(unique_labels[np.argmax(totals)])

        return np.array(y_pred)

In [36]:
# Build a KNN with its defaults (k=5, weights='uniform') and store the
# current X_train / y_train in it.
knn = KNN()
knn.fit(X_train, y_train)

In [37]:
# Predict a label for each row of X_test and report the fraction matching y_test.
# NOTE: the toy split printed above has a single test sample, so this
# accuracy can only be 0.0 or 1.0.
y_pred = knn.predict(X_test)
accuracy = metrics.accuracy_score(y_test, y_pred)
print(f"Accuracy = {accuracy}")

Accuracy = 1.0


<a id='gridsearch-scratch'></a>
<hr/>

### 3. GridSearch from scratch

In [38]:
# Clue: Look at itertools.product() functionality in python.
# It will allow you to create Cartesian products needed for multiple hyperparam tuning
# Use it in a loop to write your custom Grid Search

In [40]:
import itertools

# Candidate values for each hyperparameter.
n_neighbors_options = [3, 5, 7]
weights_options = ['uniform', 'distance']

# Every (n_neighbors, weights) pair: the Cartesian product of the two lists.
hyperparameter_combinations = list(itertools.product(n_neighbors_options, weights_options))


def evaluate_model(k, weights):
    """Fit a KNN with the given settings and return its accuracy on the test split.

    Reads X_train, y_train, X_test and y_test from the notebook's globals at
    call time.
    """
    model = KNN(k=k, weights=weights)
    model.fit(X_train, y_train)
    return metrics.accuracy_score(y_test, model.predict(X_test))


best_hyperparameters = None
best_performance = float('-inf')

# Strict '>' keeps the first combination that reaches the top score.
for candidate in hyperparameter_combinations:
    score = evaluate_model(*candidate)
    if score > best_performance:
        best_hyperparameters, best_performance = candidate, score

print("Best Hyperparameters:", best_hyperparameters)
print("Best Performance:", best_performance)

Best Hyperparameters: (5, 'uniform')
Best Performance: 1.0


<a id='integrate'></a>
<hr/>

### 4. Integrate your custom code

1. Create a dataframe of iris dataset
2. Use your custom train test split function to split into train and test
3. Use your custom GridSearch on your customKNN class to identify the best k and best weight for iris dataset

In [41]:
import numpy as np
from sklearn.datasets import load_iris
from sklearn.metrics import accuracy_score
import itertools

# Step 1: build a dataframe of the iris dataset (one column per feature plus
# a 'target' column holding the class index).
iris = load_iris()
iris_df = pd.DataFrame(iris.data, columns=iris.feature_names)
iris_df['target'] = iris.target

# Step 2: pull the features and labels back out as NumPy arrays and split
# them with the custom function; the fixed random_state keeps the split
# reproducible.
X = iris_df[iris.feature_names].to_numpy()
y = iris_df['target'].to_numpy()

X_train, X_test, y_train, y_test = new_train_test_split(X, y, test_size=0.2, random_state=42)


In [42]:
# Candidate values for each hyperparameter.
n_neighbors_options = [3, 5, 7]
weights_options = ['uniform', 'distance']

# Every (k, weights) pair: the Cartesian product of the two lists.
hyperparameter_combinations = list(itertools.product(n_neighbors_options, weights_options))

best_hyperparameters = None
best_performance = float('-inf')

for hyperparameters in hyperparameter_combinations:
    k, weights = hyperparameters
    # Reuse evaluate_model() from the GridSearch section instead of repeating
    # its fit/predict/score steps. It looks up X_train, y_train, X_test and
    # y_test when called, so it scores against the iris split created above.
    accuracy = evaluate_model(k, weights)

    # Strict '>' keeps the first combination that reaches the top score.
    if accuracy > best_performance:
        best_hyperparameters = hyperparameters
        best_performance = accuracy

print("Best Hyperparameters:", best_hyperparameters)
print("Best Performance:", best_performance)

Best Hyperparameters: (3, 'uniform')
Best Performance: 1.0


In [43]:
# Refit a KNN with the best (k, weights) pair chosen by the grid search and
# score it on X_test / y_test. The grid search scored against the same test
# split, so this reproduces its best score.
best_n_neighbors, best_weights = best_hyperparameters
knn = KNN(k=best_n_neighbors, weights=best_weights)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy :", accuracy)

Accuracy : 1.0
