In [125]:
import collections
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [194]:
# data preprocessing

train_set=[]
test_set = []

with open('reviewstrain.txt') as f:
    for l in f.readlines():
        label = int(l[0])
        train_set.append((label,l[2:]))

with open('reviewstest.txt') as f:
     for l in f.readlines():
        label = int(l[0])
        comment = l[2:]
        test_set.append((label, comment))

In [220]:
def get_token_dict(X):
    token_dict = collections.defaultdict(set)
    idx = 0
    for i in range(len(X)):
        for c in X[i][1].split():
            token_dict[c].add(i)
    return token_dict

def find_knn_label(k, sentence, token_dict):
    ctr = collections.defaultdict(int)
    for token in sentence.split():
        candidates=token_dict[token]
        for cand in candidates:
            ctr[cand]+=1
    s = sorted([k for k in ctr.keys()], key=lambda x:ctr[x], reverse=True)
    k_label = s[:k]
    lcount = collections.Counter([train_set[l][0] for l in k_label])
    ke = max(lcount.items(), key=lambda x:x[1])[1]
    idx_label = [i for i,v in lcount.items() if v==ke]
    if len(idx_label)==1:
        return idx_label[0]
    else:
        return 1

def get_label(s):
    y = []
    for i in range(len(s)):
        y.append(s[i][0])
    return y

def predict(k, testset, token_dict):
    _y = []
    for i in range(len(testset)):
        label = find_knn_label(k, testset[i][1], token_dict)
        _y.append(label)
    return _y

def get_matrix(y, _y):
    tp, fn, fp, tn = 0,0,0,0
    for i in range(len(y)):
        if  y[i]:
            if _y[i]:
                tp += 1
            else:
                fn += 1
        else:
            if _y[i]:
                fp += 1
            else:
                tn += 1
    return [[tp, fn], [fp, tn]]

def cal_tp_rate(m):
    return m[0][0]/(m[0][0]+m[0][1])

def cal_fp_rate(m):
    return m[1][0]/(m[1][0]+m[1][1])

def cal_corrects(y, _y):
    n = len(y)
    corr = 0
    for i in range(n):
        if y[i] == _y[i]:
            corr += 1
    return corr

def cal_acc(y, _y):
    return cal_corrects(y, _y)/len(y)

def find_k(X, kset):
    N = len(X)
    v_size = N//5
    corrects = collections.defaultdict()
    for k in kset:
        all_corr = 0
        for i in range(0, N, v_size):
            Xt = X[:i] + X[i+v_size:]
            Xv = X[i:i+v_size]
            yt = get_label(Xt)
            yv = get_label(Xv)
            temp_token_dict = get_token_dict(Xt)
            _y = predict(k, Xv, temp_token_dict)
            all_corr += cal_corrects(yv, _y)
        corrects[k] = all_corr
        print('k=', k, 'corrects=', all_corr, 'accuracy=', all_corr/N)
    return max(corrects.items(), key=lambda x:x[1])[0]

In [196]:
all_token_dict = get_token_dict(train_set)
len(all_token_dict)

5861

In [200]:
y_test = get_label(test_set)
_y_test1 = predict(1, test_set, all_token_dict)
_y_test5 = predict(5, test_set, all_token_dict)

In [201]:
print('Line 18 of test file is:', test_set[17])
print('Predicted label it is', _y_test1[17])

Line 18 of test file is: (1, 'it leaves little doubt that kidman has become one of our best actors .\t\n')
Predicted label it is 1


In [202]:
print('Line 18 of test file is:', test_set[17])
print('Predicted label it is', _y_test5[17])

Line 18 of test file is: (1, 'it leaves little doubt that kidman has become one of our best actors .\t\n')
Predicted label it is 1


In [203]:
get_matrix(y_test, _y_test1)
get_matrix(y_test, _y_test5)

[[189, 84], [118, 109]]

[[201, 72], [123, 104]]

In [204]:
print('The accruracy of test set when k=1 is:', cal_acc(y_test, _y_test1))
print('The true positive rate of test set when k=1 is:', cal_tp_rate(get_matrix(y_test, _y_test1)))
print('The false positive rate of test set when k=1 is:', cal_fp_rate(get_matrix(y_test, _y_test1)))

The accruracy of test set when k=1 is: 0.596
The true positive rate of test set when k=1 is: 0.6923076923076923
The false positive rate of test set when k=1 is: 0.5198237885462555


In [205]:
print('The accruracy of test set when k=5 is:', cal_acc(y_test, _y_test5))
print('The true positive rate of test set when k=5 is:', cal_tp_rate(get_matrix(y_test, _y_test5)))
print('The false positive rate of test set when k=5 is:', cal_fp_rate(get_matrix(y_test, _y_test5)))

The accruracy of test set when k=5 is: 0.61
The true positive rate of test set when k=5 is: 0.7362637362637363
The false positive rate of test set when k=5 is: 0.5418502202643172


In [134]:
y_train = get_label(train_set)
y_ZR = [1]*len(y_test) if sum(y_train)>(len(y_train)/2) else [0]*len(y_test)
get_matrix(y_test, y_ZR)

[[273, 0], [227, 0]]

In [175]:
best_k = find_k(train_set, [3,7,99])

k= 3 corrects= 875 accuracy= 0.5833333333333334
k= 7 corrects= 897 accuracy= 0.598
k= 99 corrects= 827 accuracy= 0.5513333333333333


In [177]:
_y_train = predict(best_k, train_set, all_token_dict)

In [179]:
cal_acc(y_train, _y_train)

0.776

In [186]:
c_train_set = []
for i in range(1500):
    c_train_set.append((_y_train[i], train_set[i][1]))
train_set = c_train_set
_y_test7 = predict(best_k, test_set, all_token_dict)
get_matrix(y_test, _y_test7)
cal_acc(y_test, _y_test7)

[[231, 42], [162, 65]]

0.592

In [227]:
def find_knn_label_d(k, sentence, token_dict):
    ctr = collections.defaultdict(int)
    for token in sentence.split():
        candidates=token_dict[token]
        for cand in candidates:
            if cand!='.' and cand!=',':
                ctr[cand]+=1
    if len(ctr)==0:
        return 1
    s = sorted([k for k in ctr.keys()], key=lambda x:ctr[x], reverse=True)
    k_label = s[:k]
    lcount = collections.Counter([train_set[l][0] for l in k_label])
    ke = max(lcount.items(), key=lambda x:x[1])[1]
    idx_label = [i for i,v in lcount.items() if v==ke]
    if len(idx_label)==1:
        return idx_label[0]
    else:
        return 1

def predict_d(k, testset, token_dict):
    _y = []
    for i in range(len(testset)):
#         print(i)
        label = find_knn_label_d(k, testset[i][1], token_dict)
        _y.append(label)
    return _y

In [238]:
c_token_dict = get_token_dict(train_set)
del c_token_dict[',']
del c_token_dict['.']

In [239]:
_y_d1 = predict_d(1, test_set, c_token_dict)
_y_d5 = predict_d(5, test_set, c_token_dict)

In [240]:
get_matrix(y_test, _y_d1)

[[181, 92], [117, 110]]

In [241]:
print('The accruracy of test set when k=1 is:', cal_acc(y_test, _y_d1))
print('The true positive rate of test set when k=1 is:', cal_tp_rate(get_matrix(y_test, _y_d1)))
print('The false positive rate of test set when k=1 is:', cal_fp_rate(get_matrix(y_test, _y_d1)))

The accruracy of test set when k=1 is: 0.582
The true positive rate of test set when k=1 is: 0.663003663003663
The false positive rate of test set when k=1 is: 0.5154185022026432


In [242]:
get_matrix(y_test, _y_d5)

[[197, 76], [111, 116]]

In [243]:
print('The accruracy of test set when k=5 is:', cal_acc(y_test, _y_d5))
print('The true positive rate of test set when k=5 is:', cal_tp_rate(get_matrix(y_test, _y_d5)))
print('The false positive rate of test set when k=5 is:', cal_fp_rate(get_matrix(y_test, _y_d5)))

The accruracy of test set when k=5 is: 0.626
The true positive rate of test set when k=5 is: 0.7216117216117216
The false positive rate of test set when k=5 is: 0.4889867841409692
