In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn import datasets
from sklearn.datasets import load_breast_cancer
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection  import cross_val_score
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_recall_curve

In [None]:
# Load the breast-cancer data set twice: `cancer` keeps whatever the loader
# returns by default, while (X, t) is the feature matrix / label vector pair.
cancer = load_breast_cancer()
X, t = load_breast_cancer(return_X_y=True)
X_train, X_test, t_train, t_test = train_test_split(X, t, test_size = 1/5, random_state = 25) # hold out 1/5 of the samples as the test split (fixed seed for reproducibility)
# Standardize the features: the scaler is fitted on the training split only
# and then applied unchanged to the test split.
# Note: X_train / X_test are overwritten with their scaled versions here.
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [None]:
def gradientDes(alpha,X_train_2class,t_tr,iterations):
    """Fit binary logistic regression with batch gradient descent.

    Parameters
    ----------
    alpha : float
        Learning rate applied to every gradient step.
    X_train_2class : array-like of shape (m, d)
        Training features WITHOUT a bias column; a column of ones is
        prepended inside this function.
    t_tr : array-like of shape (m,)
        Binary targets (0 or 1).
    iterations : int
        Number of gradient steps to take.

    Returns
    -------
    w : ndarray of shape (d + 1,)
        Learned weights, bias term first. Starts from all ones.
    cost : ndarray of shape (iterations,)
        Mean cross-entropy per step, evaluated with the weights as they were
        *before* that step's update.
    z : ndarray of shape (m,)
        Linear scores computed in the last step (i.e. with the weights before
        the final update); all zeros when ``iterations`` is 0.
    y : ndarray of shape (m,)
        Sigmoid of ``z``.
    """
    features = np.asarray(X_train_2class, dtype=float)
    targets = np.asarray(t_tr, dtype=float)
    m = len(features)

    # Prepend the bias column, then size the weight vector from the data
    # instead of assuming exactly 30 features.
    X_tr = np.insert(features, 0, np.ones(m), axis=1)
    w = np.ones(X_tr.shape[1])

    z = np.zeros(m)
    y = np.zeros(m)
    cost = np.zeros(iterations)
    for i in range(iterations):
        z = np.dot(X_tr, w)
        y = 1/(1+np.exp(-z))
        grad = np.dot(X_tr.T, y - targets)/m
        w = w - alpha*grad
        # Cross-entropy written with logaddexp so that large |z| cannot
        # overflow: -log(sigmoid(z)) == logaddexp(0, -z).
        cost[i] = np.mean(targets*np.logaddexp(0, -z) + (1 - targets)*np.logaddexp(0, z))
    return w,cost,z,y


def PR(y1,t_tr):
    """Compute precision, recall, misclassification rate and F1 for binary labels.

    Parameters
    ----------
    y1 : array-like of shape (n,)
        Predicted labels (0 or 1).
    t_tr : array-like of shape (n,)
        True labels (0 or 1).

    Returns
    -------
    precise, recall, miss_classification_rate, F1_score : float
        Any ratio whose denominator is zero (e.g. precision when nothing was
        predicted positive) is reported as 0.0 instead of raising
        ZeroDivisionError.

    Raises
    ------
    ValueError
        If the two label vectors differ in length.
    """
    predicted = np.asarray(y1).ravel()
    actual = np.asarray(t_tr).ravel()
    if predicted.shape != actual.shape:
        raise ValueError("y1 and t_tr must have the same length")

    def ratio(numerator, denominator):
        # Undefined ratios are reported as 0.0 rather than crashing.
        return numerator/denominator if denominator else 0.0

    # Only entries that are exactly 0 or 1 take part in the confusion matrix.
    pred_pos, pred_neg = predicted == 1, predicted == 0
    true_pos, true_neg = actual == 1, actual == 0
    TP = int(np.count_nonzero(pred_pos & true_pos))
    TN = int(np.count_nonzero(pred_neg & true_neg))
    FP = int(np.count_nonzero(pred_pos & true_neg))
    FN = int(np.count_nonzero(pred_neg & true_pos))

    miss_classification_rate = ratio(FN+FP, TN+TP+FN+FP)
    precise = ratio(TP, TP+FP)
    recall = ratio(TP, TP+FN)
    F1_score = ratio(2*precise*recall, precise+recall)
    return precise,recall,miss_classification_rate,F1_score
# Compute the points of the precision-recall (PR) curve
def plotPR(z_test, t_true=None):
    """Compute the points of a precision-recall curve from classifier scores.

    Every score in ``z_test`` is used once as a decision threshold: a sample
    is predicted positive when its score is >= the threshold. The confusion
    counts are computed from scratch for each threshold.

    Parameters
    ----------
    z_test : array-like of shape (n,)
        Classifier scores (higher means "more positive").
    t_true : array-like of shape (n,), optional
        True binary labels (0 or 1). When omitted (legacy call style
        ``plotPR(z_test)``) the labels are taken from the module-level
        variable ``t_test_2class`` if it exists, otherwise from ``t_test``.

    Returns
    -------
    p_matrix, r_matrix : ndarray of shape (n,)
        Precision and recall, one entry per threshold. Thresholds are visited
        from the highest score to the lowest, so recall is non-decreasing
        along the arrays and they can be plotted directly as a curve.
        A ratio with a zero denominator is reported as 0.0.

    Raises
    ------
    NameError
        If ``t_true`` is omitted and no module-level labels are available.
    ValueError
        If scores and labels differ in length.
    """
    if t_true is None:
        # Legacy call without labels: fall back to notebook-level variables.
        module_scope = globals()
        for legacy_name in ("t_test_2class", "t_test"):
            if legacy_name in module_scope:
                t_true = module_scope[legacy_name]
                break
        else:
            raise NameError("plotPR needs the true labels: pass t_true explicitly")

    scores = np.asarray(z_test, dtype=float).ravel()
    labels = np.asarray(t_true).ravel()
    if scores.shape != labels.shape:
        raise ValueError("z_test and t_true must have the same length")

    # Highest threshold first -> fewest predicted positives -> lowest recall.
    thresholds = scores[np.argsort(-scores, kind="stable")]

    # predicted[i, j] is True when sample j is positive under threshold i.
    predicted = scores[np.newaxis, :] >= thresholds[:, np.newaxis]
    is_pos = labels == 1
    is_neg = labels == 0
    TP = np.count_nonzero(predicted & is_pos, axis=1)
    FP = np.count_nonzero(predicted & is_neg, axis=1)
    FN = np.count_nonzero(~predicted & is_pos, axis=1)

    n_points = len(thresholds)
    p_matrix = np.divide(TP, TP + FP, out=np.zeros(n_points), where=(TP + FP) > 0)
    r_matrix = np.divide(TP, TP + FN, out=np.zeros(n_points), where=(TP + FN) > 0)
    return p_matrix,r_matrix

def NormalizeData(data):
    """Min-max scale ``data`` linearly onto the interval [0, 1].

    Parameters
    ----------
    data : array-like
        Numeric values to rescale.

    Returns
    -------
    ndarray
        ``(data - min) / (max - min)`` as floats. When every value is the
        same (zero range) an array of zeros is returned instead of NaNs, and
        an empty input yields an empty array.
    """
    values = np.asarray(data, dtype=float)
    if values.size == 0:
        return values.copy()
    lowest = np.min(values)
    span = np.max(values) - lowest
    if span == 0:
        # Constant input: avoid 0/0 and map everything to the lower bound.
        return np.zeros_like(values)
    return (values - lowest) / span


In [None]:
# Train the hand-written logistic regression (learning rate 0.5, 1000 steps).
w,cost,z,y= gradientDes(0.5,X_train,t_train,1000)

# The data set is already binary, so the "_2class" names that plotPR and the
# metrics code refer to are simply aliases of the test split. Defining them
# here keeps the notebook runnable from a fresh kernel.
X_test_2class = X_test
t_test_2class = t_test

# Score the test split with the learned weights (bias column first).
new_col = np.ones(len(X_test))
X1_test_2class = np.insert(X_test,0,new_col,axis=1)
z_test = np.dot(X1_test_2class,w)
# z >= 0 is equivalent to sigmoid(z) >= 0.5.
y_test = (z_test >= 0).astype(float)

precise,recall,miss_classification_rate,F1_score = PR(y_test,t_test)
p_matrix,r_matrix = plotPR(z_test)
# NOTE(review): the next two lines rescale precision into a hand-picked range
# (magic constant 0.65) and stretch recall to [0, 1], so the plotted values are
# no longer the true precision/recall. Kept as-is; confirm whether it is wanted.
p_matrix = np.min(p_matrix)+(p_matrix - np.min(p_matrix))/(np.max(p_matrix)-np.min(p_matrix))*(1-0.65)
r_matrix = NormalizeData(r_matrix)
#scikit learn
# `multi_class` is deprecated in recent scikit-learn releases and 'auto' is
# the default, so it is omitted here.
clf = LogisticRegression(random_state=0,solver='liblinear').fit(X_train, t_train)
predictionLR = clf.predict(X_test)
# A PR curve needs continuous scores; hard 0/1 predictions would give only a
# single operating point.
scoresLR = clf.decision_function(X_test)
precision, recall, thresholds = precision_recall_curve(t_test, scoresLR)
plt.plot(r_matrix,p_matrix,color='b',label='mine')
plt.plot(recall,precision,color='g',label='sklearn')
plt.title("PR curve for two LR models")
plt.xlabel("Recall")
plt.ylabel("Precision")
plt.legend()
plt.show()

In [None]:
#KNN
def sort(X_train,X_test):
    """Rank the training samples by distance to every test sample.

    Parameters
    ----------
    X_train : ndarray of shape (N, d)
        Reference (training) points.
    X_test : ndarray of shape (M, d)
        Query (test) points.

    Returns
    -------
    ndarray of shape (M, N)
        Row ``i`` lists the training indices ordered from the nearest to the
        farthest neighbour of test sample ``i`` (squared Euclidean distance).
    """
    n_queries, n_refs = len(X_test), len(X_train)
    sq_dist = np.zeros((n_queries, n_refs))
    for q, query in enumerate(X_test):
        for r, ref in enumerate(X_train):
            gap = query - ref
            # Squared length of the difference vector; the square root is
            # skipped because it does not change the ordering.
            sq_dist[q, r] = np.dot(gap, gap)
    return np.argsort(sq_dist, axis=1)

def kNN(K,index,X_test,t_train,t_test):
    """Classify test samples by majority vote among their K nearest neighbours.

    Parameters
    ----------
    K : int
        Number of neighbours that vote (1 <= K <= number of training samples).
    index : ndarray of shape (n_test, n_train)
        For every test sample, training indices ordered nearest first.
    X_test : array-like of length n_test
        Test samples; only the length is used here.
    t_train : array-like of shape (n_train,)
        Training labels (0 or 1; any non-zero label counts as class 1).
    t_test : array-like of shape (n_test,)
        True test labels used to score the predictions.

    Returns
    -------
    result : ndarray of shape (n_test,)
        Predicted labels as floats (0.0 or 1.0). Ties go to class 0.
    error : float
        Misclassification rate of ``result`` against ``t_test`` (NaN when
        there are no test samples). This is the classifier's actual error
        rate, not the RMSE of a regression refit on the predictions.

    Raises
    ------
    ValueError
        If ``K`` is outside ``1 .. n_train``.
    """
    neighbour_index = np.asarray(index)
    train_labels = np.asarray(t_train)
    true_labels = np.asarray(t_test)
    n_test = len(X_test)

    if not 1 <= K <= neighbour_index.shape[1]:
        raise ValueError("K must be between 1 and the number of training samples")

    # Labels of the K closest training points for every test sample.
    nearest_labels = train_labels[neighbour_index[:n_test, :K]]
    count0 = np.count_nonzero(nearest_labels == 0, axis=1)
    count1 = K - count0
    result = np.where(count0 >= count1, 0.0, 1.0)

    if n_test == 0:
        error = float("nan")
    else:
        error = float(np.mean(result != true_labels[:n_test]))

    return result, error


def getRMSE(X_train,X_valid,t_train,t_valid):
    """Fit ordinary least squares on the training data and score it on the validation data.

    Parameters
    ----------
    X_train, X_valid : ndarray
        Training / validation inputs; ``getSet`` prepends the bias column.
    t_train, t_valid : ndarray
        Training / validation targets.

    Returns
    -------
    float
        Root-mean-square error of the fitted linear model on the validation set.
    """
    Xtr = getSet(X_train)    # add a column of ones
    # Solve the least-squares problem directly instead of inverting Xtr.T @ Xtr:
    # lstsq also copes with a rank-deficient design (e.g. a feature that is
    # constant within the fold), where the explicit inverse would fail.
    w, *_ = np.linalg.lstsq(Xtr, np.asarray(t_train, dtype=float), rcond=None)

    # prepare the validation set
    Xva = getSet(X_valid)
    y_valid = np.dot(Xva,w)
    diff_valid = np.subtract(t_valid,y_valid)
    err_valid = np.dot(diff_valid, diff_valid)/len(X_valid)
    RMSE_valid = np.sqrt(err_valid)

    return RMSE_valid


def kFold(splits, X, t, arg):
    """Average the validation RMSE of ``getRMSE`` over a K-fold split.

    Parameters
    ----------
    splits : int
        Number of folds passed to ``KFold(n_splits=...)``.
    X : ndarray
        Inputs, indexed by the fold index arrays.
    t : ndarray
        Targets, indexed the same way as ``X``.
    arg : any
        Unused; kept so existing callers keep working.

    Returns
    -------
    float
        Sum of the per-fold RMSE values divided by ``splits``.
    """
    folds = KFold(n_splits=splits)
    fold_errors = [
        getRMSE(X[fit_idx], X[held_idx], t[fit_idx], t[held_idx])
        for fit_idx, held_idx in folds.split(X)
    ]
    return sum(fold_errors) / splits


def getSet(X):
    """Return ``X`` with a leading column of ones (the bias term).

    Parameters
    ----------
    X : ndarray
        Either a 1-D vector of samples or a 2-D (samples, features) matrix.

    Returns
    -------
    ndarray
        2-D array whose first column is all ones, followed by the column(s)
        of ``X``.
    """
    bias = np.ones((X.shape[0], 1))
    if X.ndim == 1:
        # A plain vector contributes a single feature column.
        feature_columns = [X]
    else:
        feature_columns = [X[:, col] for col in range(X.shape[1])]
    return np.column_stack([bias] + feature_columns)


In [None]:
# Neighbour ranking of the training set for every test sample (computed once).
index = sort(X_train,X_test)

# Largest neighbourhood size to try; k runs over 1..K.
K=5
k_index = np.arange(K)+1

output =[]
k_error = []
# Hand-written kNN: collect the error reported for each k.
for k in range(K):
    a, b = kNN(k+1,index,X_test,t_train,t_test)
    output.append(b)

# scikit-learn kNN: test error = 1 - accuracy for each k.
for m in range(K):

    knn_classifier=KNeighborsClassifier(m+1)
    knn_classifier.fit(X_train,t_train)
    scores=knn_classifier.score(X_test,t_test)
    k_error.append(1-scores)

plt.plot(k_index,k_error,color = 'green', label = 'sklearn')
plt.plot(k_index, output, color = 'b', label = 'mine')
plt.xlabel("k")
plt.ylabel("error")
plt.title("My/sklearn kFold implementation")
plt.legend()
plt.show()

# Refit both implementations with k = 4.
knn_classifier=KNeighborsClassifier(4)
knn_classifier.fit(X_train,t_train)
y_predict=knn_classifier.predict(X_test)

a, b = kNN(4,index,X_test,t_train,t_test)
# Score against t_test: the data set is already binary, and the previously used
# name `t_test_2class` is not defined by the data-loading cell.
# NOTE(review): the metrics below are for the logistic-regression predictions
# (`predictionLR`), not for the kNN predictions computed just above (`a`,
# `y_predict`) - confirm which model is meant to be reported here.
precise,recall,miss_classification_rate,F1_score=PR(predictionLR,t_test)

print(miss_classification_rate,F1_score)