In [4]:
from collections import Counter

import os
import numpy as np

In [5]:
# for google colab drive mounting
# from google.colab import drive
# drive.mount('/content/drive/')

In [6]:
# Google colab base path
# base_path = '/content/drive/My Drive/Colab/Processed_Dataset'
# base_path

In [7]:
# Toy corpus for sanity-checking the distance functions and kNN by hand.
# Each training document is a list of tokens; Y_D holds the label of the
# document at the same index in D.
D = [
    ["cricket", "very", "small", "insect"],
    ["play", "music"],
    ["play", "play", "cricket", "football"],
    ["like", "singing"],
]
Y_D = ["Biology", "Music", "Sports", "Music"]

# Unlabelled query document.
d = ["want", "play", "cricket"]

In [8]:
def hamming_distance(p1 : Counter,p2 : Counter) -> int:
    """Count the words that are present in only one of the two documents.

    Args:
        p1: word counts of the first document.
        p2: word counts of the second document.

    Returns:
        The number of keys in the union ``p1 | p2`` whose count is zero in
        ``p1`` or in ``p2``.
    """
    vocabulary = p1 | p2  # every word seen in either document
    # A word is a match only when both documents have a non-zero count for it.
    return sum(1 for word in vocabulary if not (p1[word] and p2[word]))

In [9]:
def euclidean_distance(p1 : Counter,p2 : Counter):
    """Euclidean distance between two bag-of-words count vectors.

    Args:
        p1: word counts of the first document.
        p2: word counts of the second document.

    Returns:
        ``np.sqrt`` of the summed squared count differences, taken over
        every word in the union ``p1 | p2``.
    """
    squared_gaps = ((p1[word] - p2[word]) ** 2 for word in p1 | p2)
    return np.sqrt(sum(squared_gaps))

In [10]:
def get_tf_format(document):
    """Return the term frequency of every distinct word in ``document``.

    Args:
        document: sequence of tokens.

    Returns:
        dict mapping each word to (occurrences of the word) / (number of
        tokens in the document). An empty document gives an empty dict.
    """
    total_words = len(document)
    return {
        word: occurrences / total_words
        for word, occurrences in Counter(document).items()
    }


def get_tf_idf_format(doc,docs):
    """Return the TF-IDF weights of ``doc`` with respect to the corpus ``docs``.

    Args:
        doc: sequence of tokens to weight.
        docs: corpus as a sequence of token collections; used for the
            document frequencies. Tokens must be hashable.

    Returns:
        dict mapping each word of ``doc`` that occurs in at least one corpus
        document to ``TF * IDF``, where TF comes from ``get_tf_format`` and
        IDF is ``log10(n_docs / Cw)`` with ``Cw`` the number of corpus
        documents containing the word. Words absent from the whole corpus
        are left out.
    """
    eps = 0.000001
    n_docs = len(docs) # number of documents

    # Membership tests on token lists are linear scans; build one set per
    # corpus document up front so each lookup below is constant time.
    doc_vocabularies = [set(doc_i) for doc_i in docs]

    doc_tf_idf = {}
    for w, tf in get_tf_format(doc).items():
        # Cw = number of corpus documents that contain w
        Cw = sum(1 for vocabulary in doc_vocabularies if w in vocabulary)
        if Cw == 0:
            # new word -> ignore
            continue
        # log10(n_docs/Cw) is 0 when the word is in every document; eps is
        # used instead in that case.
        # lecture note: np.log10(n_docs/(1 + Cw))
        idf = eps if Cw == n_docs else np.log10(n_docs/Cw)
        doc_tf_idf[w] = tf * idf

    return doc_tf_idf


# docs_tf_idf = [get_tf_idf_format(doc,D) for doc in D]
# print(docs_tf_idf)

# d_tf_idf = get_tf_idf_format(d,D)
# print(d_tf_idf)

In [11]:
def norm(a):
    """Euclidean (L2) norm of a sparse vector stored as a mapping.

    Args:
        a: mapping whose values are the vector components.

    Returns:
        ``np.sqrt`` of the sum of the squared values.
    """
    squared_total = sum((component * component for component in a.values()), .0)
    return np.sqrt(squared_total)


# p = dict({'moaz': 1 ,'mahmud': 1})
# print(f"norm({p})",norm(p))


def cosine_similarity(p1,p2):
    """Cosine similarity between two sparse vectors stored as mappings.

    Args:
        p1: mapping from word to weight for the first document.
        p2: mapping from word to weight for the second document.

    Returns:
        ``dot(p1, p2) / (norm(p1) * norm(p2))``, or ``0.0`` when either
        vector has zero norm (for example an empty mapping).
    """
    # Only words present in both vectors contribute to the dot product.
    xi_both = set(p1).intersection(set(p2))
    dot = .0
    for xi in xi_both:
        dot += p1[xi]*p2[xi]

    denominator = norm(p1)*norm(p2)
    if denominator == 0:
        # A zero vector has no direction. Dividing would give NaN, and NaN
        # compares False against everything, which corrupts any sort that
        # ranks neighbours by similarity. Report "no similarity" instead.
        return 0.0
    return dot/denominator

In [12]:
def find_dist(D,d,dist_function):
    """Print ``dist_function`` between the query ``d`` and every document in ``D``.

    Documents are converted to TF-IDF dicts (weighted against ``D``) when
    ``dist_function`` is ``cosine_similarity`` and to ``Counter`` objects
    otherwise. One line is printed per document, numbered from 1.
    """
    if dist_function == cosine_similarity:
        to_vector = lambda tokens: get_tf_idf_format(tokens, D)
    else:
        to_vector = Counter

    docs = [to_vector(tokens) for tokens in D]
    test_doc = to_vector(d)

    for position, doc in enumerate(docs, start=1):
        print(f"{dist_function.__name__}(t,{position})", dist_function(test_doc,doc))
        
        
# find_dist(D,d,hamming_distance)
# find_dist(D,d,euclidean_distance)
# find_dist(D,d,cosine_similarity)

In [13]:
def kNN_predict(X_train,Y_train,X_test,k=3,distance_function=cosine_similarity):
    """Predict the class of one sample by unweighted k-nearest-neighbour voting.

    Args:
        X_train: training samples, already in the format ``distance_function`` expects.
        Y_train: class labels aligned with ``X_train``.
        X_test: the single sample to classify (passed as the first argument
            of ``distance_function``).
        k: number of nearest neighbours that vote.
        distance_function: callable scoring a pair of samples.

    Returns:
        The label with the most votes among the ``k`` nearest neighbours.
        On a tie, the label that appears first in nearest-first order wins.
    """
    scored = [
        (label, distance_function(X_test, sample))
        for sample, label in zip(X_train, Y_train)
    ]

    # Cosine similarity grows with closeness, so the nearest neighbours are
    # the largest scores; for the distance functions they are the smallest.
    largest_first = (distance_function == cosine_similarity)
    ranked = sorted(scored, key=lambda pair: pair[1], reverse=largest_first)

    # unweighted voting: every neighbour contributes one vote to its class
    tally = Counter(label for label, _ in ranked[:k])
    return max(tally, key=tally.get)


# print(D,Y_D,d)
# kNN_predict(D,Y_D,d,k=4,distance_function=hamming_distance)
# kNN_predict(D,Y_D,d,k=4,distance_function=euclidean_distance)        
# kNN_predict(D,Y_D,d,k=4,distance_function=cosine_similarity)
# kNN_predict(D,Y_D,d)

In [14]:
def performance_evaluation(X_train, Y_train, X_test , Y_test,
                           k_vals ,distance_function, X_train_tf_idf=None):
    """Measure kNN accuracy on a labelled test set for several values of k.

    Args:
        X_train: training documents as token lists.
        Y_train: labels aligned with ``X_train``.
        X_test: test documents as token lists.
        Y_test: labels aligned with ``X_test``.
        k_vals: iterable of k values to evaluate.
        distance_function: distance/similarity function given to ``kNN_predict``.
        X_train_tf_idf: optional precomputed TF-IDF form of ``X_train``; only
            used when ``distance_function`` is ``cosine_similarity``.

    Returns:
        List with one accuracy percentage per entry of ``k_vals``. An empty
        test set is reported as ``0.0`` instead of raising ZeroDivisionError.
    """
    # get the proper input format for distance function
    if distance_function == cosine_similarity:
        if X_train_tf_idf is None:
            X_train_tf_idf = [get_tf_idf_format(doc,X_train) for doc in X_train]
        else:
            print("Already in TF-IDF form")
        # The test documents are weighted against the raw training corpus,
        # so this must happen before X_train is rebound to its TF-IDF form.
        X_test  = [get_tf_idf_format(doc,X_train) for doc in X_test ]
        X_train = X_train_tf_idf
    else:
        X_train = [Counter(data) for data in X_train]
        X_test  = [Counter(data) for data in X_test ]

    total = len(X_test)
    interval = max(total//5,1) # report progress roughly every 20%

    stat = []
    for k in k_vals:
        print(f"---k={k}---{distance_function.__name__}---")
        correct = 0
        for cur,(doc,actual_class) in enumerate(zip(X_test, Y_test), start=1):
            prediction = kNN_predict(X_train, Y_train,doc,k,distance_function)
            if prediction == actual_class:
                correct += 1
            if cur % interval == 0:
                print(f"Completed: {cur*100/total:.1f}%")
        # Guard the division: with no test documents there is nothing to score.
        accuracy = correct*100/total if total else 0.0
        print( "--------------------")
        print(f"Correct : {correct}")
        print(f"Total   : {total}")
        print(f"Accuracy: {accuracy:.2f}%")
        print( "--------------------")
        stat.append(accuracy)
    return stat

In [15]:
def get_X_Y_from(file):
    """Load a labelled dataset from a UTF-16 text file.

    Each non-blank line is split on whitespace; the last token is the class
    label and the preceding tokens are the document's words.

    Args:
        file: path of the file to read.

    Returns:
        Tuple ``(X, Y)``: ``X`` is a list of token lists and ``Y`` the list
        of labels, aligned by index.
    """
    with open(file, 'r',encoding='utf16') as f:
        # Blank or whitespace-only lines (e.g. a trailing empty line) split
        # to [] and have no label; skip them instead of failing on doc[-1].
        docs = [tokens for tokens in (line.split() for line in f) if tokens]
    X = [doc[:-1] for doc in docs]
    Y = [doc[-1]  for doc in docs]
    return X,Y

In [16]:
# Input files are looked up in the current working directory.
train_input_file, validation_input_file = (
    os.path.join(os.getcwd(), file_name)
    for file_name in ("train.in", "validation.in")
)
# Colab: train_input_file = base_path + "/train.in"
# Colab: validation_input_file = base_path + "/validation.in"

In [None]:
# Load the training split and report (number of documents, number of labels).
X_train, Y_train = get_X_Y_from(train_input_file)
print(f"{len(X_train)} {len(Y_train)}")

# Same for the validation split.
X_validation, Y_validation = get_X_Y_from(validation_input_file)
print(f"{len(X_validation)} {len(Y_validation)}")

5500 5500
2200 2200


In [None]:
# generate stat in markdown for kNN

# hyper parameters
k_vals = [1,3,5]
functions = [hamming_distance,euclidean_distance,cosine_similarity]

# create a markdown file
# The encoding is given explicitly so topic names containing non-ASCII
# characters are written the same way on every platform.
with open("kNN_stat.md","w",encoding="utf-8") as out:
    print("# k Nearest Neighbor (kNN)", file=out)
    print("\n## Topics", file=out)
    # sorted(): iteration order of a set of strings can change between
    # interpreter runs, which would make the report non-reproducible.
    for topic in sorted(set(Y_train)):
        print(f"- {topic}", file=out)

    print(f"\n## Training Data Size\n- **{len(Y_train)}** documents", file=out)
    print(f"\n## Validation Data Size\n- **{len(Y_validation)}** documents", file=out)
    print("\n## Methodologies and k Matrix", file=out)

    # table header: an empty corner cell followed by one column per k
    print("|    |" + "".join(f" {k} |" for k in k_vals), file=out)
    # markdown separator row: one cell per column, including the corner
    print("| " + " --- |"*(len(k_vals) + 1), file=out)

    for distance_function in functions:
        # performance evaluation return accuracy for each k values
        accuracy_vals = performance_evaluation(X_train,Y_train,X_validation,Y_validation, k_vals,distance_function)

        row = f"| {distance_function.__name__} | "
        row += "".join(f"{accuracy:.2f}% | " for accuracy in accuracy_vals)
        print(row, file=out)
print("DONE kNN Validation")

In [None]:
# download file from colab
# from google.colab import files
# files.download('kNN_stat.md') 

In [21]:
# test the best performing kNN
best_k = 5
best_dist_func = cosine_similarity
n_iter = 50

# Directory holding the test_itr_<i>.in files. base_path only exists when the
# Colab cell that defines it has been enabled; otherwise fall back to the
# working directory, where the train/validation inputs are read from.
try:
    test_dir = base_path
except NameError:
    test_dir = os.getcwd()

kNN_test_itr_accuracy = []

# As training Set is the same. Precalculate the TF-IDF for all iteration
# Stays None for the other distance functions, so the call below is valid
# whichever function is selected.
X_train_tf_idf = None
if best_dist_func == cosine_similarity:
    print("Processing TF-IDF...")
    X_train_tf_idf = [get_tf_idf_format(doc, X_train) for doc in X_train]
    print("DONE Processing TF-IDF.")

for itr in range(n_iter):
    print(f"---Test---Iteration {itr + 1}---")
    input_file = os.path.join(test_dir, f"test_itr_{itr}.in")
    X_test,Y_test = get_X_Y_from(input_file)
    print("Test Size", len(Y_test))
    accuracy_vals = performance_evaluation(X_train, Y_train, X_test , Y_test ,
                                           [best_k] , best_dist_func, X_train_tf_idf)
    kNN_test_itr_accuracy.append(accuracy_vals[0])
print("best_k", best_k)
print("best_dist_func",best_dist_func.__name__)
print("kNN_test_itr_accuracy = ",kNN_test_itr_accuracy)

Processing TF-IDF...
DONE Processing TF-IDF.
---Test---Iteration 1---
Test Size 110
Already in TF-IDF form
---k=5---cosine_similarity---
Completed: 20.0%
Completed: 40.0%
Completed: 60.0%
Completed: 80.0%
Completed: 100.0%
--------------------
Correct : 95
Total   : 110
Accuracy: 86.36%
--------------------
---Test---Iteration 2---
Test Size 110
Already in TF-IDF form
---k=5---cosine_similarity---
Completed: 20.0%
Completed: 40.0%
Completed: 60.0%
Completed: 80.0%
Completed: 100.0%
--------------------
Correct : 97
Total   : 110
Accuracy: 88.18%
--------------------
---Test---Iteration 3---
Test Size 110
Already in TF-IDF form
---k=5---cosine_similarity---
Completed: 20.0%
Completed: 40.0%
Completed: 60.0%
Completed: 80.0%
Completed: 100.0%
--------------------
Correct : 96
Total   : 110
Accuracy: 87.27%
--------------------
---Test---Iteration 4---
Test Size 110
Already in TF-IDF form
---k=5---cosine_similarity---
Completed: 20.0%
Completed: 40.0%
Completed: 60.0%
Completed: 80.0%
Com

In [22]:
# Recap of the selected hyper-parameters and the per-iteration test accuracies.
print(f"best_k {best_k}")
print(f"best_dist_func {best_dist_func.__name__}")
print(f"kNN_test_itr_accuracy =  {kNN_test_itr_accuracy}")

best_k 5
best_dist_func cosine_similarity
kNN_test_itr_accuracy =  [86.36363636363636, 88.18181818181819, 87.27272727272727, 82.72727272727273, 86.36363636363636, 88.18181818181819, 87.27272727272727, 87.27272727272727, 81.81818181818181, 82.72727272727273, 81.81818181818181, 85.45454545454545, 77.27272727272727, 80.0, 83.63636363636364, 76.36363636363636, 80.9090909090909, 78.18181818181819, 75.45454545454545, 78.18181818181819, 80.0, 88.18181818181819, 86.36363636363636, 79.0909090909091, 83.63636363636364, 83.63636363636364, 82.72727272727273, 78.18181818181819, 80.9090909090909, 80.0, 84.54545454545455, 81.81818181818181, 85.45454545454545, 82.72727272727273, 82.72727272727273, 80.0, 80.9090909090909, 78.18181818181819, 88.18181818181819, 89.0909090909091, 78.18181818181819, 88.18181818181819, 80.0, 80.9090909090909, 84.54545454545455, 83.63636363636364, 82.72727272727273, 80.9090909090909, 80.9090909090909, 84.54545454545455]
