In [None]:
# load packages

import re

import pandas as pd
import numpy as np

from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter, XMLConverter, HTMLConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from io import BytesIO

from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import NearestNeighbors
from sklearn.model_selection import train_test_split

from sklearn import preprocessing

import operator

import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use("ggplot")

In [None]:
pwd

In [None]:
# read data from the pdf file

def read_pdf_file(path,format='text', codec='utf-8', password=''):
    """Extract the text content of a PDF file with pdfminer.

    Parameters
    ----------
    path : str
        Location of the PDF file; it is opened in binary mode.
    format : str, optional
        Unused. Only plain-text extraction (``TextConverter``) is implemented;
        the parameter is kept so existing callers keep working.
    codec : str, optional
        Codec name handed to ``TextConverter``.
    password : str, optional
        Password forwarded to ``PDFPage.get_pages``.

    Returns
    -------
    str
        Everything the converter wrote to its buffer, decoded to ``str``.
    """
    rsrcmgr = PDFResourceManager()
    laparams = LAParams()

    # The context managers guarantee that the buffer and the file handle are
    # released even when pdfminer raises part-way through the document.
    with BytesIO() as retstr, open(path, 'rb') as fp:
        device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
        try:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            # Empty page set and maxpages=0: no page restriction is requested.
            pages = PDFPage.get_pages(fp, set(), maxpages=0, password=password,
                                      caching=True, check_extractable=False)
            for page in pages:
                interpreter.process_page(page)
            # Read the buffer before it is closed by the `with` block.
            return retstr.getvalue().decode()
        finally:
            device.close()

In [None]:
# NOTE(review): absolute, machine-specific location; the notebook only runs
# where this exact path exists.
pdf_path = '/Users/Jenny/Documents/Jupyter/data.pdf'
pdf_text = read_pdf_file(pdf_path)

In [None]:
# extract the data from the text data

def extract_data(pdf_text):
    """Turn the raw text extracted from the PDF into a table of string fields.

    The text is cut into chunks at blank lines (``'\\n\\n'``); only chunks
    whose first character is a digit are kept. The kept chunks are joined,
    cut into records at ``' \\n'`` (each record ends with a space before the
    line break) and every record is split on commas.

    Parameters
    ----------
    pdf_text : str
        Text as returned by ``read_pdf_file``.

    Returns
    -------
    pandas.DataFrame
        One row per record, one column per comma-separated field, all values
        still strings. Empty when the text contains no numeric chunk.
    """
    # split the data by the '\n\n' into list
    chunks = pdf_text.split('\n\n')

    # keep the chunks that start with a number
    # (raw string: "\d" in a plain literal is an invalid escape sequence)
    starts_with_digit = re.compile(r"^\d")
    chunks = [chunk for chunk in chunks if starts_with_digit.match(chunk)]

    # one string per record; blank records are dropped so that text without
    # any data yields an empty frame instead of a single row holding ''
    raw = '\n'.join(chunks).strip()
    records = [record for record in raw.split(' \n') if record]

    # final dataset
    data = [record.split(',') for record in records]

    return pd.DataFrame(data)

In [None]:
# Parse the extracted text and give the first five positional columns
# readable names: r1..r4 plus the class label.
data = extract_data(pdf_text).rename(
    columns={0: 'r1', 1: 'r2', 2: 'r3', 3: 'r4', 4: 'label'}
)

data.head()

In [None]:
# Persist the parsed table (without the row index) next to the notebook.
data.to_csv(path_or_buf='data.csv', index=False)

In [None]:
# The parser returns strings; convert the four r-columns to floats.
feature_columns = ['r1', 'r2', 'r3', 'r4']
data[feature_columns] = data[feature_columns].astype('float')
data.info()

# KNN Classification by euclidean_distance 

In [None]:
# build the dataset: the four r-columns are the features, `label` is the target
x = data[['r1','r2','r3','r4']]
y = data.label

# utilize the sklearn to split the data into a train and a test part.
# A fixed random_state makes the split, and every accuracy computed from it,
# identical on each Restart & Run All; without it the "best k" changes per run.
x_train,x_test,y_train,y_test = train_test_split(x, y, random_state=42)

In [None]:
# https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html
# p selects the Minkowski distance between points; p = 2 is euclidean_distance
knn_classifier = KNeighborsClassifier(n_neighbors=6, p=2)
knn_classifier.fit(x_train, y_train)
y_predict = knn_classifier.predict(x_test)

# the accuracy: share of test rows whose predicted label equals the true label
n_correct = sum(y_predict == y_test)
n_correct / len(y_test)

In [None]:
# Test accuracy of euclidean KNN for every neighbourhood size from 2 to 10.
acc = []
for n_neighbors in range(2, 11):
    knn_classifier = KNeighborsClassifier(n_neighbors, p=2)
    knn_classifier.fit(x_train, y_train)
    y_predict = knn_classifier.predict(x_test)
    n_correct = sum(y_predict == y_test)
    acc.append(n_correct / len(y_test))

In [None]:
# Pair every tested k (2..10) with its accuracy and draw accuracy against k.
acc_df = pd.DataFrame({'k': list(range(2, 11)), 'acc': acc})
acc_df.plot(x='k', y='acc')

- From the result above, choose k = 3, which maximizes the accuracy with Euclidean distance.

In [None]:
# write the model manually 

def knn_class(newX, dataset, labels, k):
    """Classify one point by majority vote of its k nearest neighbours.

    Distances are Euclidean. Inputs are converted to NumPy arrays first, so
    rows are always addressed by position; a pandas index that is not
    0..n-1 (for example after a shuffled train/test split) cannot misalign
    the points and their labels.

    Parameters
    ----------
    newX : array-like, shape (n_features,)
        The point to classify.
    dataset : array-like, shape (n_samples, n_features)
        Reference points.
    labels : array-like, shape (n_samples,)
        Class of each reference point, in the same row order as ``dataset``.
    k : int
        Number of neighbours that vote; must satisfy 1 <= k <= n_samples.

    Returns
    -------
    The most frequent label among the k nearest points. On a tie the class
    whose first vote came from the closer neighbour wins.

    Raises
    ------
    ValueError
        If ``k`` is out of range or ``labels`` and ``dataset`` differ in length.
    """
    points = np.asarray(dataset, dtype=float)
    targets = np.asarray(labels)
    query = np.asarray(newX, dtype=float)

    # sample size
    dataSetSize = points.shape[0]
    if targets.shape[0] != dataSetSize:
        raise ValueError("labels and dataset must have the same number of rows")
    if not 1 <= k <= dataSetSize:
        raise ValueError(
            "k must be between 1 and the number of samples ({}), got {}".format(dataSetSize, k)
        )

    # Euclidean distance from the query to every reference point
    # (broadcasting replaces the explicit np.tile copy)
    distance = np.sqrt(((points - query) ** 2).sum(axis=1))

    # positions of the k closest points; a stable sort keeps equal distances
    # in their original row order, so the result is deterministic
    nearest = distance.argsort(kind='stable')[:k]

    # classify newX as the majority class of the k nearest points
    classCount = {}
    for voteIlabel in targets[nearest].tolist():
        classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1

    # max() returns the first key with the highest count, i.e. insertion order
    # (nearest first) breaks ties
    return max(classCount, key=classCount.get)

In [None]:
# use the best k = 3
knn_classifier = KNeighborsClassifier(3, p=2)
knn_classifier.fit(x_train,y_train)
y_predict = knn_classifier.predict(x_test)

# The manual model has to vote with the same reference points sklearn was
# fitted on. Passing the full x / y would let every test row find itself as
# its own nearest neighbour, so the two models would not be comparable.
# Plain arrays make knn_class address rows by position, independent of the
# shuffled pandas index of the training split.
train_points = x_train.to_numpy()
train_labels = y_train.to_numpy()

result = [
    knn_class(x_test.iloc[i], train_points, train_labels, 3)
    for i in range(x_test.shape[0])
]

# share of test rows on which both implementations predict the same label
sim = np.mean(np.asarray(result) == y_predict)

print('the result of sklearn and model written manually are {} % same'.format(sim*100))

# Normalization

- Standardize each feature: $(X - \text{mean}) / \text{std}$

In [None]:
# Standardize every feature column: (X - mean) / std.
# NOTE(review): the mean and std are computed over all rows, including the
# ones that end up in the test part; consider fitting the scaling on the
# training rows only.
x_norm = preprocessing.scale(x)

# A fixed random_state keeps the split, and the accuracy derived from it,
# reproducible across kernel restarts.
x_train_norm,x_test_norm,y_train_norm,y_test_norm = train_test_split(x_norm, y, random_state=42)

In [None]:
# KNN (k = 7, euclidean distance) trained on the standardized features.
knn_classifier_norm = KNeighborsClassifier(7, p=2)
knn_classifier_norm.fit(x_train_norm,y_train_norm)
# Predict with the model fitted on the standardized data; `knn_classifier`
# was fitted on the unscaled features and must not be used here.
y_predict_norm = knn_classifier_norm.predict(x_test_norm)

# the accuracy
sum(y_predict_norm == y_test_norm)/len(y_test_norm)

# KNN Classification by manhattan_distance

In [None]:
# https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html
# p represents the method to calculate the distance among points, p = 1 is manhattan_distance
# (this cell previously passed p=2, i.e. it silently repeated the euclidean run)
knn_classifier = KNeighborsClassifier(6, p=1)
knn_classifier.fit(x_train,y_train)
y_predict = knn_classifier.predict(x_test)

# the accuracy
sum(y_predict == y_test)/len(y_test)

In [None]:
# Test accuracy of manhattan KNN for every neighbourhood size from 2 to 10.
acc = []
for n_neighbors in range(2, 11):
    knn_classifier = KNeighborsClassifier(n_neighbors, p=1)
    knn_classifier.fit(x_train, y_train)
    y_predict = knn_classifier.predict(x_test)
    n_correct = sum(y_predict == y_test)
    acc.append(n_correct / len(y_test))

In [None]:
# Pair every tested k (2..10) with its accuracy and draw accuracy against k.
acc_df = pd.DataFrame({'k': list(range(2, 11)), 'acc': acc})
acc_df.plot(x='k', y='acc')

- From the result above, choose k = 3, which maximizes the accuracy with Manhattan distance.