In [34]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from collections import Counter
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

## **Question 1**

In [None]:
# Question 1: visualise the 1-nearest-neighbour decision regions of D2z.txt.
# The file is read as space-separated rows "x1 x2 y" with no header line.
data = pd.read_table('D2z.txt', sep=" ", header=None, names=["x1", "x2", "y"])

X, y = data.drop([data.columns[-1]], axis=1), data[data.columns[-1]]

# Evaluation grid: 41 evenly spaced values per axis over [-2, 2].
# np.linspace fixes both end points and the point count exactly, whereas
# np.arange with a float step can gain or lose the last element to rounding.
grid_axis = np.linspace(-2.0, 2.0, 41)
# indexing='ij' keeps x1 as the slow-varying coordinate, i.e. the same row
# order as a nested loop with x1 outside and x2 inside.
grid_x1, grid_x2 = np.meshgrid(grid_axis, grid_axis, indexing='ij')
test = pd.DataFrame({
    'x1': grid_x1.ravel(),
    'x2': grid_x2.ravel(),
    'y': -1,  # placeholder label, overwritten below
})
X_test = test.drop([test.columns[-1]], axis=1)

# All grid-to-training distances in one broadcast instead of a Python double
# loop over pandas rows: (n_grid, 1, 2) - (1, n_train, 2) -> (n_grid, n_train, 2).
train_points = X.to_numpy(dtype=float)
grid_points = X_test.to_numpy(dtype=float)
pairwise = np.linalg.norm(grid_points[:, None, :] - train_points[None, :, :], axis=2)
# argmin returns the first minimum, so ties go to the earliest training row.
nearest = pairwise.argmin(axis=1)
test['y'] = y.to_numpy()[nearest]

# Faint dots: predicted label of each grid point; crosses: the training points.
plt.scatter(test['x1'], test['x2'], c=['r' if i == 1 else 'y' for i in test['y']], alpha=0.2)
plt.scatter(data['x1'], data['x2'], c=['r' if i == 1 else 'y' for i in data['y']], marker='x')
plt.xlabel("x1")
plt.ylabel("x2")
plt.show()


## **Question 2**

In [None]:
def knn(train, test, fold):
    """Classify every row of ``test`` with 1-nearest-neighbour and score the result.

    Both frames must share one column layout: the first column is skipped
    (presumably a row identifier -- confirm against the data file), the last
    column is the label, and every column in between is a numeric feature.

    Parameters
    ----------
    train : pandas.DataFrame
        Rows that form the neighbour pool.
    test : pandas.DataFrame
        Held-out rows to classify.
    fold : int
        Not used by the computation; kept so existing callers keep working.

    Returns
    -------
    tuple
        ``(precision, accuracy, recall)`` -- note that accuracy is the middle
        element.
    """
    X_train = train.iloc[:, 1:-1].to_numpy()
    y_train = train.iloc[:, -1].to_numpy()
    X_test = test.iloc[:, 1:-1].to_numpy()
    y_test = test.iloc[:, -1].to_numpy()

    y_hat = []
    for row in X_test:
        # Euclidean distance from this test row to every training row at once.
        dists = np.linalg.norm(X_train - row, axis=1)
        # argmin picks the first minimum, so ties go to the earliest training row.
        y_hat.append(y_train[np.argmin(dists)])

    precision = precision_score(y_test, y_hat)
    recall = recall_score(y_test, y_hat)
    accuracy = accuracy_score(y_test, y_hat)
    return precision, accuracy, recall

# Question 2: score 1-NN on five contiguous, 1000-row folds of emails.csv.
data = pd.read_csv('emails.csv', sep=',')
for i, fold_start in enumerate(range(0, 5000, 1000)):
    # Rows [fold_start, fold_start + 1000) are held out; all others train.
    fold_stop = fold_start + 1000
    test = data.iloc[fold_start:fold_stop, :]
    train = data.drop(test.index)
    precision, accuracy, recall = knn(train, test, fold_start)
    # Report with 1-based fold numbers.
    print("Fold ", i + 1)
    print("Accuracy: ", accuracy)
    print("Precision: ", precision)
    print("Recall: ", recall)
    print("---------------------------------")

## **Question 3**

In [None]:
class LogisticRegression():
    """Binary logistic regression fitted by full-batch gradient descent.

    ``alpha`` is the step size and ``iterations`` the number of gradient
    steps that ``fit`` performs.
    """

    def __init__(self, alpha, iterations):
        self.alpha, self.iterations = alpha, iterations

    def fit(self, X, Y):
        """Start from zero weights and bias, run the gradient steps, return self."""
        self.m, self.n = X.shape
        self.W, self.b = np.zeros(self.n), 0
        self.X, self.Y = X, Y
        for _ in range(self.iterations):
            self.weights_update()
        return self

    def weights_update(self):
        """Apply one gradient-descent step to ``W`` and ``b``; returns self."""
        logits = self.X.dot(self.W) + self.b
        probabilities = 1 / (1 + np.exp(-logits))

        # Prediction error per sample, flattened to length m so that a
        # column-shaped Y works the same as a 1-D one.
        residual = np.reshape(probabilities - self.Y.T, self.m)
        grad_W = np.dot(self.X.T, residual) / self.m
        grad_b = np.sum(residual) / self.m

        self.W = self.W - self.alpha * grad_W
        self.b = self.b - self.alpha * grad_b

        return self

    def predict(self, X):
        """Return 1 where the predicted probability is strictly above 0.5, else 0."""
        logits = X.dot(self.W) + self.b
        probabilities = 1 / (1 + np.exp(-logits))
        return np.where(probabilities > 0.5, 1, 0)



# Question 3: 5-fold cross validation of the gradient-descent logistic
# regression on emails.csv, using contiguous 1000-row folds.
data = pd.read_csv('emails.csv', sep=',')
for i in range(0, 5):
    test = data.iloc[i*1000:i*1000 + 1000, :]
    train = data.drop(test.index)

    # Column 0 is skipped and the last column is the label; everything in
    # between is used as a feature.
    X_train = train.iloc[:, 1:-1].values
    X_test = test.iloc[:, 1:-1].values
    # Both label vectors are 1-D so train and test labels have the same shape.
    y_train = train.iloc[:, -1].values
    y_test = test.iloc[:, -1].values

    clf = LogisticRegression(alpha=0.01, iterations=3000)
    clf.fit(X_train, y_train)
    y_hat = clf.predict(X_test)
    precision = precision_score(y_test, y_hat)
    recall = recall_score(y_test, y_hat)
    accuracy = accuracy_score(y_test, y_hat)

    # Folds are reported 1-based, matching the kNN cross-validation output.
    print("Fold ", i + 1)
    print("Accuracy: ", accuracy)
    print("Precision: ", precision)
    print("Recall: ", recall)
    print("---------------------------------")

## **Question 4**

In [None]:
def knn(train, test, fold, k):
    """Classify every row of ``test`` by majority vote of its ``k`` nearest rows.

    Both frames must share one column layout: the first column is skipped
    (presumably a row identifier -- confirm against the data file), the last
    column is the label, and every column in between is a numeric feature.

    Parameters
    ----------
    train : pandas.DataFrame
        Rows that form the neighbour pool.
    test : pandas.DataFrame
        Held-out rows to classify.
    fold : int
        Not used by the computation; kept so existing callers keep working.
    k : int
        Number of neighbours that vote; must satisfy ``1 <= k <= len(train)``.
        An odd ``k`` avoids tied votes when there are two classes.

    Returns
    -------
    tuple
        ``(precision, accuracy, recall)`` -- note that accuracy is the middle
        element, so callers that only want accuracy must unpack it.

    Raises
    ------
    ValueError
        If ``k`` is outside ``1 .. len(train)``.
    """
    X_train = train.iloc[:, 1:-1].to_numpy()
    y_train = train.iloc[:, -1].to_numpy()
    X_test = test.iloc[:, 1:-1].to_numpy()
    y_test = test.iloc[:, -1].to_numpy()

    if not 1 <= k <= len(X_train):
        raise ValueError(
            "k must be between 1 and the number of training rows "
            "(%d), got %r" % (len(X_train), k)
        )

    y_hat = []
    for row in X_test:
        # Euclidean distance from this test row to every training row at once.
        dists = np.linalg.norm(X_train - row, axis=1)
        # argpartition places the k smallest distances first without a full sort.
        nearest = np.argpartition(dists, k - 1)[:k]
        votes = Counter(y_train[nearest])
        y_hat.append(votes.most_common(1)[0][0])

    precision = precision_score(y_test, y_hat)
    recall = recall_score(y_test, y_hat)
    accuracy = accuracy_score(y_test, y_hat)
    return precision, accuracy, recall
           

# Question 4: mean 5-fold cross-validation accuracy of kNN for several k.
data = pd.read_csv('emails.csv', sep=',')

K = [1, 3, 5, 7]
Accuracy = []
for k in K:
    accuracy_array = []
    for i in range(0, 5):
        test = data.iloc[i*1000:i*1000 + 1000, :]
        train = data.drop(test.index)
        # knn returns (precision, accuracy, recall). Unpack it so that only
        # accuracy is averaged; appending the whole tuple would blend all
        # three metrics into one number.
        precision, accuracy, recall = knn(train, test, i*1000, k)
        accuracy_array.append(accuracy)
    Accuracy.append(np.average(accuracy_array))

plt.title("kNN 5-fold cross validation")
plt.xlabel("k")
# This is the y axis; a second xlabel call would overwrite the "k" label.
plt.ylabel("Accuracy")
plt.plot(K, Accuracy)
plt.show()

## **Question 5**

In [None]:
def knn(X_test, X_train, y_test, y_train, threshold, k=5):
    """Score a thresholded k-nearest-neighbour classifier on one test set.

    A test row is predicted positive (1) when the fraction of its ``k``
    nearest training rows labelled 1 is at least ``threshold``.

    Parameters
    ----------
    X_test, X_train : array-like, shape (n_rows, n_features)
        Numeric feature matrices. They are used as given: no identifier or
        label column is stripped here, so callers must pass features only.
    y_test, y_train : array-like
        Labels (0 or 1) for the corresponding rows. Column-shaped (n, 1)
        input is flattened.
    threshold : float
        Minimum positive-vote fraction needed to predict 1.
    k : int, default 5
        Number of neighbours that vote; must satisfy ``1 <= k <= len(X_train)``.

    Returns
    -------
    tuple
        ``(tpr, fpr)``: true-positive rate and false-positive rate.

    Raises
    ------
    ValueError
        If ``k`` is outside ``1 .. len(X_train)``.
    """
    X_train = np.asarray(X_train)
    X_test = np.asarray(X_test)
    # Flatten so that indexing yields scalar labels even for (n, 1) input.
    y_train = np.asarray(y_train).ravel()
    y_test = np.asarray(y_test).ravel()

    if not 1 <= k <= len(X_train):
        raise ValueError(
            "k must be between 1 and the number of training rows "
            "(%d), got %r" % (len(X_train), k)
        )

    y_hat = []
    for row in X_test:
        # Euclidean distance from this test row to every training row at once.
        dists = np.linalg.norm(X_train - row, axis=1)
        # argpartition places the k smallest distances first without a full sort.
        nearest = np.argpartition(dists, k - 1)[:k]
        fraction = np.count_nonzero(y_train[nearest] == 1) / k
        y_hat.append(1 if fraction >= threshold else 0)

    # labels=[0, 1] keeps the matrix 2x2 even when only one class is present,
    # so the four-way unpacking below cannot fail.
    tn, fp, fn, tp = confusion_matrix(y_test, y_hat, labels=[0, 1]).ravel()
    tpr = tp / (tp + fn)
    fpr = fp / (tn + fp)
    return tpr, fpr

class LogisticRegression():
    """Binary logistic regression fitted by full-batch gradient descent.

    Parameters
    ----------
    alpha : float
        Step size of each gradient update.
    iterations : int
        Number of gradient updates performed by ``fit``.
    """

    def __init__(self, alpha, iterations):
        self.alpha = alpha
        self.iterations = iterations

    def fit(self, X, Y):
        """Train on ``X`` (m rows, n features) and 0/1 labels ``Y``.

        Weights and bias restart from zero on every call. ``Y`` may be 1-D or
        a single column. Returns ``self``.
        """
        self.m, self.n = X.shape
        self.W = np.zeros(self.n)
        self.b = 0
        self.X = X
        self.Y = Y
        for _ in range(self.iterations):
            self.weights_update()
        return self

    def weights_update(self):
        """Apply one gradient-descent step to ``W`` and ``b``; returns ``self``."""
        A = 1 / (1 + np.exp(- (self.X.dot(self.W) + self.b)))

        # Per-sample prediction error; labels are flattened so that an (m, 1)
        # column and an (m,) vector give the same length-m residual.
        residual = A - np.ravel(self.Y)
        dW = np.dot(self.X.T, residual) / self.m
        db = np.sum(residual) / self.m

        self.W = self.W - self.alpha * dW
        self.b = self.b - self.alpha * db

        return self

    def predict(self, X, threshold=0.5):
        """Return 1 where the predicted probability is ``>= threshold``, else 0.

        ``threshold`` defaults to 0.5 so that code calling ``predict(X)``
        with no threshold keeps working with this class.
        """
        P = 1 / (1 + np.exp(- (X.dot(self.W) + self.b)))
        Y = np.where(P >= threshold, 1, 0)
        return Y


def main():
    """Plot ROC-style curves for kNN and logistic regression on one split.

    Rows 4000-4999 of emails.csv are held out as the test set and all other
    rows are used for training. Each classifier is evaluated at eleven
    decision thresholds from 0.0 to 1.0, and its (FPR, TPR) points are drawn
    as one curve.
    """
    # data
    data = pd.read_csv('emails.csv', sep=',')
    test = data.iloc[4000:5000, :]
    train = data.drop(test.index)

    # Column 0 is skipped and the last column is the label.
    X_train = train.iloc[:, 1:-1].values
    X_test = test.iloc[:, 1:-1].values
    y_train = train.iloc[:, -1].values
    y_test = test.iloc[:, -1].values

    # Rounded so that e.g. 0.6 is the same float as the vote fraction 3/5;
    # a float-step arange yields 0.6000000000000001, which a 3-of-5 vote
    # would silently fail to reach.
    thresholds = np.round(np.linspace(0.0, 1.0, 11), 1)

    # Logistic regression
    clf = LogisticRegression(alpha=0.01, iterations=3000)
    clf.fit(X_train, y_train)
    logistic_tpr = []
    logistic_fpr = []
    for threshold in thresholds:
        y_hat = clf.predict(X_test, threshold)
        # labels=[0, 1] keeps the matrix 2x2 even if one class is absent.
        tn, fp, fn, tp = confusion_matrix(y_test, y_hat, labels=[0, 1]).ravel()
        logistic_tpr.append(tp / (tp + fn))
        logistic_fpr.append(fp / (tn + fp))

    # KNN
    knn_tpr = []
    knn_fpr = []
    for threshold in thresholds:
        tpr, fpr = knn(X_test, X_train, y_test, y_train, threshold)
        knn_tpr.append(tpr)
        knn_fpr.append(fpr)

    # Plot
    plt.plot(knn_fpr, knn_tpr, color='blue')
    plt.plot(logistic_fpr, logistic_tpr, color='orange')

    plt.xlabel("False Positive Rate (Positive label: 1)")
    plt.ylabel("True Positive Rate (Positive label: 1)")
    plt.legend(['KNeighborsClassifier', 'LogisticRegression'], loc=4)
    plt.grid()
    plt.show()
main()