In [77]:
import numpy as np
from knn import KNN

############################################################################
# DO NOT MODIFY ABOVE CODES
############################################################################


# TODO: implement F1 score
def f1_score(real_labels, predicted_labels):
    """
    Compute the F1 score of binary predictions.

    Information on F1 score - https://en.wikipedia.org/wiki/F1_score

    Any non-zero label is treated as the positive class. The score is computed
    as 2*TP / (2*TP + FP + FN), which is algebraically the harmonic mean of
    precision and recall but needs only a single zero-denominator check.

    :param real_labels: List[int] ground-truth labels
    :param predicted_labels: List[int] predicted labels, same length as real_labels
    :return: float F1 score in [0, 1]; 0.0 when there are no true positives,
        false positives or false negatives at all (score undefined)
    """
    assert len(real_labels) == len(predicted_labels)

    real = np.asarray(real_labels) != 0
    predicted = np.asarray(predicted_labels) != 0

    true_positives = int(np.count_nonzero(real & predicted))
    false_positives = int(np.count_nonzero(~real & predicted))
    false_negatives = int(np.count_nonzero(real & ~predicted))

    denominator = 2 * true_positives + false_positives + false_negatives
    if denominator == 0:
        # Neither the labels nor the predictions contain a positive sample.
        return 0.0
    return 2 * true_positives / denominator


class Distances:
    """Distance functions between two equal-length numeric vectors."""

    @staticmethod
    def canberra_distance(point1, point2):
        """
        Canberra distance: sum of |a_i - b_i| / (|a_i| + |b_i|).

        Coordinates where both values are zero contribute 0 to the sum
        instead of producing a 0/0 NaN.

        :param point1: List[float]
        :param point2: List[float]
        :return: float
        """
        a = np.asarray(point1, dtype=float)
        b = np.asarray(point2, dtype=float)
        numerator = np.abs(a - b)
        denominator = np.abs(a) + np.abs(b)
        # Only divide where the denominator is non-zero; other terms stay 0.
        terms = np.divide(
            numerator,
            denominator,
            out=np.zeros_like(numerator),
            where=denominator != 0,
        )
        return float(np.sum(terms))

    @staticmethod
    def minkowski_distance(point1, point2):
        """
        Minkowski distance is the generalized version of Euclidean Distance
        It is also known as L-p norm (where p>=1).
        For this assignment p=3.
        Information on Minkowski distance - https://en.wikipedia.org/wiki/Minkowski_distance

        :param point1: List[float]
        :param point2: List[float]
        :return: float
        """
        a = np.asarray(point1, dtype=float)
        b = np.asarray(point2, dtype=float)
        cubed = np.abs(a - b) ** 3
        return float(np.sum(cubed) ** (1 / 3))

    @staticmethod
    def euclidean_distance(point1, point2):
        """
        Euclidean (L2) distance between the two points.

        :param point1: List[float]
        :param point2: List[float]
        :return: float
        """
        diff = np.asarray(point1, dtype=float) - np.asarray(point2, dtype=float)
        return float(np.sqrt(np.dot(diff, diff)))

    @staticmethod
    def inner_product_distance(point1, point2):
        """
        Inner (dot) product of the two points.

        :param point1: List[float]
        :param point2: List[float]
        :return: float
        """
        a = np.asarray(point1, dtype=float)
        b = np.asarray(point2, dtype=float)
        return float(np.dot(a, b))

    @staticmethod
    def cosine_similarity_distance(point1, point2):
        """
        Cosine distance: 1 - cos(angle between the two points).

        If either point has zero norm the cosine is undefined; the similarity
        is then taken as 0 and the distance returned is 1.0 instead of NaN.

        :param point1: List[float]
        :param point2: List[float]
        :return: float
        """
        a = np.asarray(point1, dtype=float)
        b = np.asarray(point2, dtype=float)
        norm_product = np.linalg.norm(a) * np.linalg.norm(b)
        if norm_product == 0:
            return 1.0
        return float(1 - np.dot(a, b) / norm_product)

    @staticmethod
    def gaussian_kernel_distance(point1, point2):
        """
        Negated Gaussian kernel: -exp(-0.5 * ||a - b||^2).

        :param point1: List[float]
        :param point2: List[float]
        :return: float
        """
        diff = np.asarray(point1, dtype=float) - np.asarray(point2, dtype=float)
        return float(-np.exp(-0.5 * np.dot(diff, diff)))


class HyperparameterTuner:
    """
    Grid search over k, distance function and (optionally) feature scaler for
    a KNN model, keeping the combination with the best validation F1 score.
    """

    def __init__(self):
        self.best_k = None
        self.best_distance_function = None
        self.best_scaler = None
        self.best_model = None

    def tuning_without_scaling(self, distance_funcs, x_train, y_train, x_val, y_val):
        """
        Try every distance function with every odd k from 1 to 29 and keep the
        model with the highest f1-score on the validation set.

        :param distance_funcs: dictionary mapping a name to a distance function.
            NOTE(review): tie-breaking relies on this dictionary being iterated
            in priority order - confirm against the caller (test.py).
        :param x_train: List[List[int]] training data set to train the KNN model
        :param y_train: List[int] train labels to train the KNN model
        :param x_val: List[List[int]] validation data set passed to KNN.predict
        :param y_val: List[int] validation labels
        :return: self

        Assigns the best k, distance function name and model (an instance of
        KNN) to self.best_k, self.best_distance_function and self.best_model.
        self.best_scaler is set to None.

        NOTE: When there is a tie, the model is chosen by these priorities:
        distance function [canberra > minkowski > euclidean > gaussian > inner_prod > cosine_dist],
        then, for the same distance function, the smaller k.
        """
        self.best_k = None
        self.best_distance_function = None
        self.best_scaler = None
        self.best_model = None

        # Start below any achievable score so a candidate is always selected,
        # even when every F1 score is 0.
        best_f1 = -1
        # Distance function is the outer loop and k the inner one so that the
        # strict ">" comparison keeps the earlier distance function first and
        # the smaller k second when scores tie.
        for name, function in distance_funcs.items():
            for k in range(1, 30, 2):
                model = KNN(k, function)
                model.train(x_train, y_train)
                f1 = f1_score(y_val, model.predict(x_val))
                if f1 > best_f1:
                    best_f1 = f1
                    self.best_k = k
                    self.best_distance_function = name
                    self.best_model = model
        return self

    def tuning_with_scaling(self, distance_funcs, scaling_classes, x_train, y_train, x_val, y_val):
        """
        Same search as tuning_without_scaling, but the training and validation
        data are first transformed by each scaler, giving three
        hyperparameters: k, distance function and scaler.

        :param distance_funcs: dictionary mapping a name to a distance function.
        :param scaling_classes: dictionary mapping a name to a scaler class; a
            fresh instance is created per class and called on the training data
            first, then on the validation data.
            NOTE(review): tie-breaking relies on both dictionaries being
            iterated in priority order - confirm against the caller (test.py).
        :param x_train: List[List[int]] training data set to train the KNN model
        :param y_train: List[int] train labels to train the KNN model
        :param x_val: List[List[int]] validation data set passed to KNN.predict
        :param y_val: List[int] validation labels
        :return: self

        Assigns the best k, distance function name, scaler name and model (an
        instance of KNN) to self.best_k, self.best_distance_function,
        self.best_scaler and self.best_model.

        NOTE: When there is a tie, the model is chosen by these priorities:
        normalization [min_max_scale > normalize],
        then distance function [canberra > minkowski > euclidean > gaussian > inner_prod > cosine_dist],
        then, for the same distance function, the smaller k.
        """
        self.best_k = None
        self.best_distance_function = None
        self.best_scaler = None
        self.best_model = None

        best_f1 = -1
        # Loop nesting mirrors the tie-break priority: scaler, distance, k.
        for scaling_name, scaling_class in scaling_classes.items():
            # The scaled data depends only on the scaler, so compute it once.
            scaler = scaling_class()
            scaled_x_train = scaler(x_train)
            scaled_x_val = scaler(x_val)

            for distance_name, distance_func in distance_funcs.items():
                for k in range(1, 30, 2):
                    model = KNN(k, distance_func)
                    model.train(scaled_x_train, y_train)
                    f1 = f1_score(y_val, model.predict(scaled_x_val))
                    if f1 > best_f1:
                        best_f1 = f1
                        self.best_k = k
                        self.best_distance_function = distance_name
                        self.best_scaler = scaling_name
                        self.best_model = model
        return self


class NormalizationScaler:
    """Scale every sample (row) to unit Euclidean length."""

    def __init__(self):
        pass

    def __call__(self, features):
        """
        Normalize features for every sample

        Example
        features = [[3, 4], [1, -1], [0, 0]]
        return [[0.6, 0.8], [0.707107, -0.707107], [0, 0]]

        A sample whose norm is zero is returned as all zeros.

        :param features: List[List[float]]
        :return: List[List[float]]
        """
        normalized_features = []
        for sample in features:
            # Cast to float so integer input is divided and squared as floats.
            vector = np.asarray(sample, dtype=float)
            norm = np.sqrt(np.dot(vector, vector))
            if norm == 0:
                normalized_features.append([0.0] * len(vector))
            else:
                normalized_features.append((vector / norm).tolist())
        return normalized_features


class MinMaxScaler:
    """
    Please follow this link to know more about min max scaling
    https://en.wikipedia.org/wiki/Feature_scaling

    The per-feature min and max are computed on the first __call__ (assumed to
    be the training set) and reused for every later call on the same object.

    Example:
        train_features = [[0, 10], [2, 0]]
        test_features = [[20, 1]]

        scaler1 = MinMaxScaler()
        train_features_scaled = scaler1(train_features)
        # train_features_scaled should be equal to [[0, 1], [1, 0]]

        test_features_scaled = scaler1(test_features)
        # test_features_scaled should be equal to [[10, 0.1]]

        new_scaler = MinMaxScaler() # creating a new scaler
        _ = new_scaler([[1, 1], [0, 0]]) # new train features
        test_features_scaled = new_scaler(test_features)
        # now test_features_scaled should be [[20, 1]]

    """

    def __init__(self):
        # True until the first non-empty call has fixed self.min / self.max.
        self.first_call = True
        self.max = []
        self.min = []

    def __call__(self, features):
        """
        Min-max scale the feature vector of each sample. For example,
        if the input features = [[2, -1], [-1, 5], [0, 0]],
        the output should be [[1, 0], [0, 1], [0.333333, 0.16667]]

        A feature whose stored max equals its stored min is mapped to 0 for
        every sample instead of dividing by zero.

        :param features: List[List[float]]
        :return: List[List[float]]
        """
        if len(features) == 0:
            return []

        data = np.asarray(features, dtype=float)
        if self.first_call:
            self.first_call = False
            self.max = np.max(data, axis=0)
            self.min = np.min(data, axis=0)

        span = self.max - self.min
        shifted = data - self.min
        # Divide only where the feature range is non-zero; others stay 0.
        scaled = np.divide(
            shifted,
            span,
            out=np.zeros_like(shifted),
            where=span != 0,
        )
        return scaled.tolist()



In [None]:
import numpy as np
from collections import Counter


class KNN:
    """k-nearest-neighbours classifier using a caller-supplied distance function."""

    def __init__(self, k, distance_function):
        """
        :param k: int number of neighbours that vote on a prediction
        :param distance_function: callable taking two points and returning a number
        """
        self.k = k
        self.distance_function = distance_function

    def train(self, features, labels):
        """
        Store the training data; for KNN, training is just loading the data.

        For example, if Student 1 has features age 25, grade 3.8 and label 0,
        and Student 2 has features age 22, grade 3.0 and label 1, then features
        is [ [25.0, 3.8], [22.0, 3.0] ] and labels is [0, 1].

        :param features: List[List[float]]
        :param labels: List[int]
        """
        self.train_features = features
        self.train_labels = labels

    def predict(self, features):
        """
        Predict a label for every test point as the most common label among
        its k nearest training neighbours (see get_k_neighbors).

        :param features: List[List[float]]
        :return: List[int] one predicted label per test point
        """
        return [
            Counter(self.get_k_neighbors(point)).most_common(1)[0][0]
            for point in features
        ]

    def get_k_neighbors(self, point):
        """
        Find the k training points closest to `point` under
        self.distance_function and return their labels, nearest first.

        :param point: List[float]
        :return: List[int] labels of the k nearest neighbours (fewer if the
            training set holds fewer than k points)
        """
        distances = [
            self.distance_function(train_point, point)
            for train_point in self.train_features
        ]
        # Stable sort so equally distant points keep their training order.
        nearest = np.argsort(distances, kind="stable")[: self.k]
        return [self.train_labels[index] for index in nearest]


# When this file is executed as a script, print the NumPy version in use.
if __name__ == '__main__':
    print(np.__version__)



In [117]:
np.divide(0,0)

  """Entry point for launching an IPython kernel.


nan

In [120]:
np.power(2.4,5)

79.62623999999998

In [126]:
a = [0,9,0,2,3,5]
a[a==0] = 1

In [127]:
a

[1, 9, 0, 2, 3, 5]

In [128]:
np.asarray(a)

array([1, 9, 0, 2, 3, 5])

In [129]:
a[a==0] = 1
a

[1, 9, 0, 2, 3, 5]

In [130]:
point1 = [1,3,2,0,5]
point2 = [2,3,4,0,1]

In [131]:
denominator = np.abs(np.asarray(point1))+np.abs(np.asarray(point2))

In [132]:
denominator

array([3, 6, 6, 0, 6])

In [135]:
for i in range(denominator.shape[0]):
    if (denominator[i] == 0):
        denominator[i] = 1

In [136]:
denominator

array([3, 6, 6, 1, 6])

In [138]:
[0]*5

[0, 0, 0, 0, 0]

In [141]:
b=[[1,2],[3,4],[0,0]]

In [140]:
if ([0] == [0,0]):
    print('yes')
else:
    print('no')

no


In [142]:
b[0]

[1, 2]

In [143]:
c = [2,3,1]

In [145]:
d = np.max(b, axis=0)

In [146]:
e = np.min(b, axis=0)

In [147]:
d-e

array([3, 4])

In [148]:
(d-e)[1]

4

In [183]:
difference

array([1, 1, 0])

In [184]:
min

array([1, 3, 0])

In [186]:
difference

array([1, 1, 0])

In [168]:
n = np.divide(np.subtract(features[0], min[0]), difference[0], dtype=float)

In [166]:
normalized_features=[]
normalized_features.append(y)

In [170]:
normalized_features

[array([0.33333333, 0.66666667]), array([0.33333333, 0.66666667])]

In [169]:
normalized_features.append(n)

In [187]:
normalized_features=[]
for i in range(len(features)):
            if (difference[0] == 0):
                normalized_feature[i] = [0]*len(features[i])
            else:
                normalized_feature[i] = np.divide(np.subtract(features[i], min[i]), difference[i], dtype=float)
            
            normalized_features.append(normalized_feature[i])

ValueError: setting an array element with a sequence.

In [155]:
range(len(features))

range(0, 3)

In [156]:
for i in range(len(features)):
    print(i)

0
1
2


In [199]:
normalized_features = []
for i in range(len(features)):
    normalized_feature = np.divide(np.subtract(features[i], min), difference, dtype=float)
    normalized_features.append(normalized_feature)

In [177]:
normalized_features

[array([0.33333333, 0.5       ]), array([1., 1.]), array([0., 0.])]

In [191]:
a = [3,3,4]

In [193]:
a[0]

3

In [195]:
min = []
max = []
for d in range(0,len(features[0])): 
    sFeatures = sorted(features, key=lambda x: x[d])
                # Get min
    min.append(sFeatures[0][d])
                # Get max 
    max.append(sFeatures[len(features)-1][d])

In [196]:
min

[0, 0]

In [197]:
max

[3, 4]

In [213]:
features = [[1,2],[3,4],[0,0],[4,7]]
max = np.max(features, axis=0)
min= np.min(features, axis=0)
difference = max-min
difference

array([4, 7])

In [214]:
normalized_features=[]
for i in range(len(features)):
    nf = []
    for j in range(len(features[i])):
        if (difference[j] == 0):
            nf.append(0)
        else:
            nf.append(np.divide(np.subtract(features[i][j], min[j]), difference[j], dtype=float))
            
    normalized_features.append(nf)

In [215]:
normalized_features

[[0.25, 0.2857142857142857],
 [0.75, 0.5714285714285714],
 [0.0, 0.0],
 [1.0, 1.0]]

In [None]:
for i in range(len(features)):
            nf = []
            for j in range(len(features[i])):
                if (difference[j] == 0):
                    nf.append(0)
                else:
                    nf.append(np.divide(np.subtract(
                        features[i][j], self.min[j]), difference[j], dtype=float))

                normalized_features.append(nf)

In [None]:
result=list()
for i in range(len(features)):
            temp=[]
            for j in range(len(features[0])):
                if(diff[j]==0):
                   # print('diff is zero')
                    temp.append(0)
                else:
                    div=(features[i][j]-self.min_a[j])/diff[j]
                    temp.append(div)
            result.append(temp)

In [217]:
from collections import Counter
Counter([0,0,0,1,1,0]).most_common()[0][0]

0

In [218]:
Counter([0,0,0,1,1,0])

Counter({0: 4, 1: 2})