In [None]:
## Use K Nearest Neighbor to classify survival based on Fare

# Package import
import math
import random
from sklearn.neighbors import KNeighborsClassifier

if __name__ == "__main__":

    # Initialization
    TRAINING_FRACTION = 0.7 # Share of the data rows that goes into the training set
    data_file_name = "data/clean_train.csv"
    raw_data = [] # Dataset before dividing into train and test sets
    train_data = []
    test_data = []

    # Read data: every non-blank line becomes a list of string fields.
    # The `with` statement closes the file on exit, so no explicit close() is needed.
    with open(data_file_name, 'r') as data_file_ptr:
        for in_item in data_file_ptr:
            in_item = in_item.strip()
            if not in_item:
                continue # A blank line has no columns and would break the lookups below
            raw_data.append(in_item.split(','))

    # Separate raw_data into train and test sets.
    # Randomly select a subset of row indices (rand_sample). Rows whose index is in the
    # subset go to train_data; the others go to test_data.
    # Row 0 is never sampled or assigned (presumably the CSV header -- confirm against the file).
    random.seed(1) # Makes sure the split is the same for each run (facilitates comparison)
    rand_sample = random.sample(range(1, len(raw_data)), int(TRAINING_FRACTION * (len(raw_data) - 1)))
    train_index_set = set(rand_sample) # Set membership is O(1); the list lookup made the loop quadratic
    for i in range(1, len(raw_data)):
        # Copy the row: predictions are appended to the test rows further down, and
        # sharing the list objects with raw_data would silently alter raw_data too.
        row = list(raw_data[i])
        if i in train_index_set:
            train_data.append(row)
        else:
            test_data.append(row)

    # Build KNN classifier inputs
    train_y = [row[1] for row in train_data] # The class value (column 1)
    train_fare = [[float(row[7])] for row in train_data] # The feature value (column 7)
    test_y = [row[1] for row in test_data] # The true class of each test instance
    test_fare = [[float(row[7])] for row in test_data] # The test data with only the feature value
    # Fit and predict with the classifier's default settings
    # (scikit-learn documents the defaults as n_neighbors=5, weights='uniform').
    neigh = KNeighborsClassifier()
    neigh.fit(train_fare, train_y)
    res = neigh.predict(test_fare)

    # Append the predictions to the (copied) data instances to better compare them
    for row, predicted in zip(test_data, res):
        row.append(predicted)

    # Evaluation measurements.
    # Counts are taken from the (true label, prediction) pairs directly rather than
    # from a fixed column position in the annotated rows.
    print('When TRAINING_FRACTION = ' + str(TRAINING_FRACTION) + ':')
    outcomes = list(zip(test_y, res))
    TP = sum(1 for actual, predicted in outcomes if actual == '1' and predicted == '1')
    FP = sum(1 for actual, predicted in outcomes if actual == '0' and predicted == '1')
    FN = sum(1 for actual, predicted in outcomes if actual == '1' and predicted == '0')
    TN = sum(1 for actual, predicted in outcomes if actual == '0' and predicted == '0')
    print("True positive: " + str(TP))
    print("False positive: " + str(FP))
    print("False negative: " + str(FN))
    print("True negative: " + str(TN))
    # A ratio with an empty denominator is reported as nan instead of raising ZeroDivisionError
    undefined = float('nan')
    accuracy = (TP + TN) / len(test_data) if test_data else undefined
    sensitivity = TP / (TP + FN) if (TP + FN) else undefined
    specificity = TN / (FP + TN) if (FP + TN) else undefined
    precision = TP / (TP + FP) if (TP + FP) else undefined
    print("Accuracy of prediction: " + str(round(accuracy, 4)))
    print("Sensitivity of prediction: " + str(round(sensitivity, 4)))
    print("Specificity of prediction: " + str(round(specificity, 4)))
    print("Precision of prediction: " + str(round(precision, 4)))

When TRAINING_FRACTION = 0.7:
True positive: 50
False positive: 34
False negative: 61
True negative: 123
Accuracy of prediction: 0.6455
Sensitivity of prediction: 0.4505
Specificity of prediction: 0.7834
Precision of prediction: 0.5952


In [None]:
# More exploration into the data
# Compare uniform weight versus distance

# Start from empty splits. train_data still holds the rows collected by the
# previous cell, so appending to it again would duplicate every training row.
train_data = []
test_data = []

# Separate raw_data into train and test sets.
# Randomly select a subset of row indices (rand_sample). Rows whose index is in the
# subset go to train_data; the others go to test_data.
random.seed(1) # Makes sure the split is the same for each run (facilitates comparison)
rand_sample = random.sample(range(1, len(raw_data)), int(TRAINING_FRACTION * (len(raw_data) - 1)))
train_index_set = set(rand_sample) # Set membership is O(1); the list lookup made the loop quadratic
for i in range(1, len(raw_data)):
    # Copy the row so that appending a prediction below never alters raw_data.
    # Rows in raw_data may already carry a prediction appended by an earlier cell.
    row = list(raw_data[i])
    if i in train_index_set:
        train_data.append(row)
    else:
        test_data.append(row)

# Derive labels and features from THIS split instead of reusing variables left over
# from an earlier cell, so the cell stays correct if the split parameters change.
train_y = [row[1] for row in train_data] # The class value (column 1)
train_fare = [[float(row[7])] for row in train_data] # The feature value (column 7)
test_y = [row[1] for row in test_data] # The true class of each test instance
test_fare = [[float(row[7])] for row in test_data] # The test data with only the feature value

# Weight function used in prediction: closer neighbors get a larger vote
neigh = KNeighborsClassifier(weights = 'distance')
neigh.fit(train_fare, train_y)
res = neigh.predict(test_fare)

# Append the predictions to the (copied) data instances to better compare them
for row, predicted in zip(test_data, res):
    row.append(predicted)

# Print evaluation measurements.
# The counts pair each true label with this cell's prediction directly. Reading a
# fixed column (index 9) picked up the stale prediction of the earlier classifier
# whenever the row had already been annotated, reproducing the earlier results.
print('When TRAINING_FRACTION = ' + str(TRAINING_FRACTION) + ':')
outcomes = list(zip(test_y, res))
TP = sum(1 for actual, predicted in outcomes if actual == '1' and predicted == '1')
FP = sum(1 for actual, predicted in outcomes if actual == '0' and predicted == '1')
FN = sum(1 for actual, predicted in outcomes if actual == '1' and predicted == '0')
TN = sum(1 for actual, predicted in outcomes if actual == '0' and predicted == '0')
print("True positive: " + str(TP))
print("False positive: " + str(FP))
print("False negative: " + str(FN))
print("True negative: " + str(TN))
# A ratio with an empty denominator is reported as nan instead of raising ZeroDivisionError
undefined = float('nan')
accuracy = (TP + TN) / len(test_data) if test_data else undefined
sensitivity = TP / (TP + FN) if (TP + FN) else undefined
specificity = TN / (FP + TN) if (FP + TN) else undefined
precision = TP / (TP + FP) if (TP + FP) else undefined
print("Accuracy of prediction: " + str(round(accuracy, 4)))
print("Sensitivity of prediction: " + str(round(sensitivity, 4)))
print("Specificity of prediction: " + str(round(specificity, 4)))
print("Precision of prediction: " + str(round(precision, 4)))

# TODO: Check how the results differ when the k value (n_neighbors) is changed, for both uniform and distance weighting.

When TRAINING_FRACTION = 0.7:
True positive: 50
False positive: 34
False negative: 61
True negative: 123
Accuracy of prediction: 0.6455
Sensitivity of prediction: 0.4505
Specificity of prediction: 0.7834
Precision of prediction: 0.5952
