In [16]:
import os
from heapq import heappush, heappop

In [17]:
def preprocess(filename):
    from nltk import word_tokenize
    from nltk.stem.porter import PorterStemmer
    from nltk.corpus import stopwords
    
    tokens = []
    with open(filename, 'r') as f_in:
        for line in f_in:
            line = line.strip()
            tokens.extend(word_tokenize(line))
            tokens = [token.lower() for token in tokens if token.isalpha()]
#     print('Tokenize: {}'.format(tokens))
    stemmer = PorterStemmer()
    singles = [stemmer.stem(t) for t in tokens]
#     print('Porter\'s result: {}'.format(singles))
    stops = set(stopwords.words('english'))
    results = ([s for s in singles if s not in stops])
    return results

In [18]:
def get_vocabulary_dict(count_list):
    import pickle
    if not os.path.isfile('dictionary.txt'):
        df = {}
        for count in count_list:
            _list = list()
            for word in count:
                if word not in df and word not in _list:
                    df[word] = 1
                    _list.append(word)
                elif word in df and word not in _list:
                    df[word] += 1
                    _list.append(word)
        with open('dictionary.txt', 'w') as f_out:
            for i, key in enumerate(sorted(df.keys()), start=1):
                f_out.write(str(i) + ' ' + key + ' ' + str(df[key]) + '\n')
        with open('dictionary.pkl', 'wb') as pkl_out:
            pickle.dump(df, pkl_out)
    with open('dictionary.pkl', 'rb') as pkl_in:
        df = pickle.load(pkl_in)
        return df
    
def tf(word, count):
    return count[word] / sum(count.values())

def n_containing(word, count_list):
    return get_vocabulary_dict(count_list)[word]

def idf(word, count_list):
    return math.log(len(count_list) / n_containing(word, count_list))

def tfidf(word, count, count_list):
    return tf(word, count) * idf(word, count_list)

def cos_sim(docx, docy):
    from scipy import spatial
    import numpy as np
    key_union = list(set(docx.keys()) | set(docy.keys()))
    result = 1 - spatial.distance.cosine([docx[k] if k in docx else 0 for k in key_union], [docy[k] if k in docy else 0 for k in key_union])
    return result

def get_int(filename):
    start, end = '/', '.'
    return int(filename[filename.find(start)+1:filename.find(end)])

def save_vector_file(filename, sorted_document):
    if not os.path.isfile('vector_files/' + filename + 'txt'):
        with open('vector_files/' + filename + '.txt', 'w') as f:
            f.write(str(len(sorted_document)) + '\n')
            f.write('t_index tf-idf' + '\n')
            for t_index, tf_idf in sorted_document:
                f.write(str(t_index) + ' ' + str(tf_idf) + '\n')


In [19]:
import glob
import math
from collections import Counter

directory = glob.glob('IRTM/*.txt')
countList = []
for d in sorted(directory, key=get_int):
    tokens = preprocess(d)
    count = Counter(tokens)
    countList.append(count)
# print(countList)
tf_documents = []
sorted_documents = []
for i, count in enumerate(countList):
    word2id = {key: index for index, key in enumerate(sorted(get_vocabulary_dict(countList).keys()), start=1)}
    scores = {word2id[word]: tfidf(word, count, countList) for word in count}
    sorted_words = sorted(scores.items(), key=lambda x: x[0])
    save_vector_file(str(i+1), sorted_words)
    tf_documents.append(scores)

In [20]:
c = cos_sim(tf_documents[0], tf_documents[1])
print(c, cos_sim(tf_documents[1], tf_documents[2]))

0.17760931780045264 0.17907753358081258


In [75]:
def build_table_and_heap(tf_documents):
    doc_num = len(tf_documents)
    cos_heap = []
    for i in range(doc_num):
        for j in range(i+1, doc_num):
            heappush(cos_heap, (cos_sim(tf_documents[i], tf_documents[j]), i, j))
    return cos_heap

In [76]:
doc_num = len(tf_documents)
merge_list = [[y] for y in range(doc_num)]
cos_heap = build_table_and_heap(tf_documents)
tree_dict = {}


In [78]:
print(doc_num)
record_cluster = [20, 13, 8]

1095


In [79]:
def linking(cos_heap, record_cluster, doc_num):
    count = 0
    tree_dict = {}
    merge_list = [[y] for y in range(doc_num)]
    while cos_heap:
        e = heappop(cos_heap)
#         print(e)
        cos = e[0]
        node_x = e[1]
        node_y = e[2]
        if node_y in merge_list[node_x]: continue
        else:
            count += 1
            print(len(merge_list[node_x]), len(merge_list[node_y]))
            print('{} not in merge_list[{}]: {}'.format(node_y, node_x, merge_list[node_x]))
            print('merge merge_list[{}]: {} into merge_list[{}]: {}'.format(node_y, merge_list[node_y], node_x, merge_list[node_x]))
            docs = list(set(merge_list[node_x]) | set(merge_list[node_y]))
            for doc in sorted(docs):
                merge_list[doc] = docs
            print('get merge_list[{}]: {}'.format(node_x, merge_list[node_x]))
            for i in record_cluster:
                if i == doc_num - count:
                    tree_dict[i] = [list(item) for item in set(tuple(sorted(row)) for row in merge_list)]
#                     print(i, tree_dict[i])
        if doc_num - count <= 8: break
    return tree_dict

In [80]:
tree_dict = linking(list(cos_heap), list(record_cluster), doc_num)


# c_heap = list(cos_heap)
# len(cos_heap)

1 1
221 not in merge_list[0]: [0]
merge merge_list[221]: [221] into merge_list[0]: [0]
get merge_list[0]: [0, 221]
2 1
422 not in merge_list[0]: [0, 221]
merge merge_list[422]: [422] into merge_list[0]: [0, 221]
get merge_list[0]: [0, 221, 422]
3 1
844 not in merge_list[0]: [0, 221, 422]
merge merge_list[844]: [844] into merge_list[0]: [0, 221, 422]
get merge_list[0]: [0, 844, 221, 422]
4 1
877 not in merge_list[0]: [0, 844, 221, 422]
merge merge_list[877]: [877] into merge_list[0]: [0, 844, 221, 422]
get merge_list[0]: [0, 877, 422, 844, 221]
5 1
959 not in merge_list[0]: [0, 877, 422, 844, 221]
merge merge_list[959]: [959] into merge_list[0]: [0, 877, 422, 844, 221]
get merge_list[0]: [0, 422, 221, 844, 877, 959]
6 1
977 not in merge_list[0]: [0, 422, 221, 844, 877, 959]
merge merge_list[977]: [977] into merge_list[0]: [0, 422, 221, 844, 877, 959]
get merge_list[0]: [0, 977, 422, 221, 844, 877, 959]
7 1
1012 not in merge_list[0]: [0, 977, 422, 221, 844, 877, 959]
merge merge_list[101

767 not in merge_list[299]: [0, 1, 2, 3, 4, 5, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 71, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 122, 124, 125, 126, 127, 129, 130, 131, 132, 133, 134, 135, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 151, 152, 154, 155, 156, 157, 159, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 188, 189, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 227, 228, 229, 230, 231, 23

1 1049
1007 not in merge_list[473]: [473]
merge merge_list[1007]: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 71, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 124, 125, 126, 127, 129, 130, 131, 132, 133, 134, 135, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 151, 152, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 2

In [81]:
for docs in tree_dict[13]:
    print(len(docs))


1
1
1
1
1
1
1
1
1
1
1
1
1083


In [74]:
with open('20.txt', 'w') as f:
    for docs in tree_dict[20]:
        for doc in docs:
            f.write(str(doc+1) + '\n')
        f.write('\n')    

with open('13.txt', 'w') as f:
    for docs in tree_dict[13]:
        for doc in docs:
            f.write(str(doc+1) + '\n')
        f.write('\n')    

with open('8.txt', 'w') as f:
    for docs in tree_dict[8]:
        for doc in docs:
            f.write(str(doc+1) + '\n')
        f.write('\n')    
        
# print(len(tree_dict[20]))
# print(len(tree_dict[13]))
# print(len(tree_dict[8]))

20
13
8
