In [1]:
import numpy as np
import pandas as pd 
import math
import operator

In [2]:
# Read the training table and the held-out testing table from CSV.
data = pd.read_csv('atomsradii.csv')
testing = pd.read_csv('testing.csv')

# Training set: columns 0 and 1 are kept as separate arrays (named rWC and rCh
# here) and also stacked into one (n_rows, 2) array; column 3 is wrapped in its
# own single-column DataFrame (presumably the class label -- confirm against
# the CSV header).
data_rWC, data_rCh = data.iloc[:, 0].values, data.iloc[:, 1].values
data_type = pd.DataFrame(data.iloc[:, 3].values)
df_data = np.column_stack((data_rWC, data_rCh))

# Testing set: same layout as the training set.
testing_rWC, testing_rCh = testing.iloc[:, 0].values, testing.iloc[:, 1].values
testing_type = pd.DataFrame(testing.iloc[:, 3].values)
df_testing = np.column_stack((testing_rWC, testing_rCh))

In [3]:
def EuclideanDistance(instance1, instance2, dims=2):
    """Return the Euclidean distance between two points.

    Parameters
    ----------
    instance1, instance2 : indexable sequences of numbers
        The two points. Only their first ``dims`` coordinates are read, so
        any trailing entries are ignored.
    dims : int, optional
        Number of leading coordinates to include in the distance. Defaults
        to 2, which matches the two-feature data used in this notebook.

    Returns
    -------
    float
        Square root of the summed squared coordinate differences.

    Raises
    ------
    IndexError
        If either point has fewer than ``dims`` coordinates.
    """
    squared_sum = sum((instance1[x] - instance2[x]) ** 2 for x in range(dims))
    return math.sqrt(squared_sum)

In [12]:
def test_EuclideanDistance():
    """Check EuclideanDistance on a 3-4-5 right triangle."""
    point_a, point_b = [1, 1], [4, 5]
    result = EuclideanDistance(point_a, point_b)
    assert result == 5, "Result of euclidean calculation is wrong"

In [13]:
def SortedEuclidean(test_data,training_data):
    """Rank every training row by its Euclidean distance to ``test_data``.

    Parameters
    ----------
    test_data : sequence of numbers
        The query point, passed as the first argument to EuclideanDistance.
    training_data : iterable of points
        Training rows. Row ``i`` is paired with row ``i`` of the module-level
        ``data_type`` frame, so the two must be in the same order.

    Returns
    -------
    pandas.DataFrame
        Columns ``euclidean_distance``, ``data_point`` and ``type``, sorted
        by ascending distance with a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If ``training_data`` is empty.
    """
    distances = [(EuclideanDistance(test_data, item), item) for item in training_data]
    if not distances:
        raise ValueError("training_data must contain at least one row")

    # Build, label and sort the table once, after all distances are known,
    # instead of redoing the whole merge and sort for every training row.
    df_distances = pd.DataFrame(distances)
    # Index-on-index join: attaches the label stored at the same row position.
    d = pd.merge(df_distances,data_type,left_index=True,right_index=True,how='outer')
    d.columns = ['euclidean_distance','data_point','type']
    sorted_list = d.sort_values(by='euclidean_distance',ascending=True)
    return sorted_list.reset_index(drop=True)

In [14]:
def test_SortedEulidean():
    """Check that SortedEuclidean puts the nearest training row first.

    Compares the distance in the first row with the distance in the second
    row of the result for the query point [1, 1] against ``df_data``.
    """
    test_data = [1,1]
    training_data = df_data
    # Compute the ranking once and reuse it for both lookups.
    sorted_df = SortedEuclidean(test_data,training_data)
    # Two-axis .iloc avoids integer lookup on a label-indexed row Series.
    # Ties are allowed: equal distances are still a valid ascending order.
    assert sorted_df.iloc[0, 0] <= sorted_df.iloc[1, 0], "The sorted order is wrong"
    return

In [5]:
def GetNeighbors(test_data,training_data, k):
    """Collect the k nearest training rows to ``test_data``, grouped by type.

    Parameters
    ----------
    test_data : sequence of numbers
        The query point.
    training_data : iterable of points
        Training rows, forwarded to SortedEuclidean.
    k : int
        Number of nearest neighbours to keep. If fewer than ``k`` rows are
        available, all of them are used.

    Returns
    -------
    dict
        Keys ``'PT'``, ``'TM'`` and ``'Alk'``. Each value is a list holding
        the distances of the neighbours of that type (nearest first),
        followed by one final element: the number of such neighbours.

    Raises
    ------
    KeyError
        If a neighbour's type is not one of the three keys above.
    """
    TrainingData = SortedEuclidean(test_data,training_data)
    NeighborData = TrainingData.head(k)
    # One empty bucket per known type, in a fixed order.
    neighbors_dict = {key: [] for key in ('PT', 'TM', 'Alk')}
    # head(k) can return fewer than k rows, so never index past what exists.
    for i in range(min(k, len(NeighborData))):
        neighbor_distance = NeighborData.iloc[i,0]
        neighbor_type = NeighborData.iloc[i,2]
        neighbors_dict[neighbor_type].append(neighbor_distance)
    # Append each bucket's size as its last element; callers read it via [-1].
    for bucket in neighbors_dict.values():
        bucket.append(len(bucket))
    return neighbors_dict

In [15]:
def test_GetNeighbors():
    """Check the 'Alk' neighbour count for a known query point with k=3."""
    neighbors = GetNeighbors([0.51, 1.12], df_data, 3)
    alk_count = neighbors['Alk'][-1]
    assert alk_count == 3, "Count of neighbor points is wrong"

In [6]:
def ClassPrediction(test_data,training_data, k):
    """Predict the class of ``test_data`` by majority vote of its k neighbours.

    Parameters
    ----------
    test_data : sequence of numbers
        The query point.
    training_data : iterable of points
        Training rows, forwarded to GetNeighbors.
    k : int
        Number of nearest neighbours that vote.

    Returns
    -------
    str
        ``'PT'``, ``'TM'`` or ``'Alk'``: the type with the most neighbours.
        On a tie the first of the tied types in the order PT, TM, Alk wins.
    """
    neighbors = GetNeighbors(test_data,training_data, k)
    # The last element of each list is the neighbour count for that type.
    type_counts = {label: neighbors[label][-1] for label in ('PT', 'TM', 'Alk')}
    return max(type_counts,key=type_counts.get)

In [16]:
def test_ClassPrediction():
    """Check that a known query point is classified as 'Alk' with k=5."""
    predicted = ClassPrediction([0.51, 1.12], df_data, 5)
    assert predicted == 'Alk', "Class Prediction is wrong"

In [20]:
def accuracy(test_dataframe,test_data,training_data,k):
    """Return the classification accuracy for a given k as a percentage string.

    Parameters
    ----------
    test_dataframe : pandas.DataFrame
        Test table whose last column holds the true class of each row.
    test_data : sequence of points
        Feature rows, in the same order as ``test_dataframe``.
    training_data : iterable of points
        Training rows, forwarded to ClassPrediction.
    k : int
        Number of nearest neighbours that vote.

    Returns
    -------
    str
        ``repr`` of the percentage followed by ``"%"``, e.g. ``'60.0%'``.

    Raises
    ------
    ValueError
        If ``test_data`` is empty, since the accuracy is then undefined.
    """
    total = len(test_data)
    if total == 0:
        raise ValueError("test_data must contain at least one row")

    correct = 0
    for i in range(len(test_dataframe)):
        # Two-axis .iloc reads the last column by position.
        actual = test_dataframe.iloc[i, -1]
        if actual == ClassPrediction(test_data[i],training_data, k):
            correct += 1
    # Computed once after the loop, so zero correct predictions gives '0.0%'.
    percentage = (correct/float(total)) * 100
    return repr(percentage) + "%"

In [27]:
def test_accuracy():
    """Check the accuracy string reported on the testing set for k=3."""
    result = accuracy(testing, df_testing, df_data, 3)
    assert result == '60.0%', "Accuracy calculated wrong"

In [37]:
def k_accuracy(test_dataframe=None, test_data=None, training_data=None):
    """Return a dict mapping each candidate k to its accuracy string.

    Every k from 1 up to the number of training rows is evaluated, so the
    user can compare them and pick one.

    Parameters
    ----------
    test_dataframe : pandas.DataFrame, optional
        Test table with the true class in its last column. Defaults to the
        module-level ``testing`` frame.
    test_data : sequence of points, optional
        Feature rows for the test table. Defaults to ``df_testing``.
    training_data : sequence of points, optional
        Training rows. Defaults to ``df_data``.

    Returns
    -------
    dict
        ``{k: accuracy string}`` for ``k`` in ``1 .. len(training_data)``.
    """
    # These names used to be read as globals, but the notebook never defines
    # them at top level, so a fresh-kernel run failed with NameError.
    # NOTE(review): the defaults mirror what test_accuracy passes -- confirm.
    if test_dataframe is None:
        test_dataframe = testing
    if test_data is None:
        test_data = df_testing
    if training_data is None:
        training_data = df_data

    return {
        k: accuracy(test_dataframe,test_data,training_data,k)
        for k in range(1,len(training_data)+1)
    }

In [38]:
def test_k_accuracy():
    """Check that k_accuracy returns a plain dict."""
    result = k_accuracy()
    assert type(result) == dict,"The output type is wrong"

In [39]:
# Run the k_accuracy check; an AssertionError here means it did not return a dict.
test_k_accuracy()