In [1]:
import re
import os, os.path
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import RegexpTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy
from scipy.sparse import csr_matrix, save_npz, load_npz
# import pandas as pd
# import warnings

# suppress warnings
# warnings.filterwarnings("ignore", category=DeprecationWarning, module="pandas", lineno=570)

# custom tokenizer with stemmer 
class PorterTokenizer(object):
    """Callable that splits a document into word tokens and Porter-stems each one.

    An instance is passed as the ``tokenizer`` argument of the TF-IDF
    vectorizers in this notebook and is called once per document.
    """

    def __init__(self):
        self.pt = PorterStemmer()
        # Built once instead of on every call. The pattern keeps runs of two
        # or more word characters, so single-character tokens are dropped.
        self.tokenizer = RegexpTokenizer(r'(?u)\b\w\w+\b')

    def __call__(self, doc):
        """Return the list of stemmed tokens found in ``doc``."""
        return [self.pt.stem(t) for t in self.tokenizer.tokenize(doc)]

In [7]:
# read each file into separate string

# Machine-specific absolute path of the directory holding the article files;
# presumably the value handed to read_docs -- adjust before running elsewhere.
art_path = r'/home/yurii/projects/mownit2/lab6/ex2/'

def read_docs(art_path):
    """Read every regular file in ``art_path`` into its own string.

    Parameters
    ----------
    art_path : str
        Directory containing the documents. A trailing path separator is
        optional because file paths are built with ``os.path.join``.

    Returns
    -------
    file_list : list of str
        Names of the files that were read, in sorted order.
    doc_list : list of str
        File contents; ``doc_list[i]`` is the text of ``file_list[i]``.
    """
    # os.listdir returns entries in arbitrary order, so sort them to keep the
    # document order reproducible between runs. Sub-directories are skipped
    # because open() cannot read them.
    file_list = [
        name for name in sorted(os.listdir(art_path))
        if os.path.isfile(os.path.join(art_path, name))
    ]
    doc_list = []
    for name in file_list:
        with open(os.path.join(art_path, name), 'r') as doc_file:
            doc_list.append(doc_file.read())
    return file_list, doc_list

In [None]:
# vectorize strings using tf-idf vectorizer

def tfidf_transform(doc_list, return_vectorizer=False):
    """Fit a TF-IDF model on ``doc_list`` and return the document-term matrix.

    Tokens are produced by ``PorterTokenizer`` and English stop words are
    removed by the vectorizer.

    Parameters
    ----------
    doc_list : iterable of str
        One string per document.
    return_vectorizer : bool, optional
        When true, also return the fitted ``TfidfVectorizer`` so that the
        caller can reach its ``vocabulary_`` (term -> column index), which is
        otherwise lost once this function returns. Defaults to ``False``,
        which keeps the original single-value return.

    Returns
    -------
    tfidf_trans_matrix
        Result of ``fit_transform`` on ``doc_list``.
    tfidf_vectorizer : TfidfVectorizer
        Only returned when ``return_vectorizer`` is true.
    """
    tfidf_vectorizer = TfidfVectorizer(min_df=1, stop_words='english', tokenizer=PorterTokenizer(), smooth_idf=True)
    tfidf_trans_matrix = tfidf_vectorizer.fit_transform(doc_list)
    if return_vectorizer:
        return tfidf_trans_matrix, tfidf_vectorizer
    return tfidf_trans_matrix

In [16]:
# save info

# Machine-specific output location for the saved index files. The save_*
# helpers append the file name to their `path` argument by plain string
# concatenation, so keep the trailing slash if this value is passed to them.
path = r'/home/yurii/projects/mownit2/lab6/'

def save_matrix(path, tfidf_trans_matrix):
    """Store the sparse TF-IDF matrix with ``save_npz`` under ``path + 'tfidf_matrix'``."""
    target = path + 'tfidf_matrix'
    save_npz(file=target, matrix=tfidf_trans_matrix)
    
def save_voc(path, voc):
    """Write the vocabulary to ``path + 'vocabulary'``, one term per line.

    Parameters
    ----------
    path : str
        Prefix the file name is appended to (include the trailing separator).
    voc : mapping or iterable of str
        Either a term -> column-index mapping (such as a fitted vectorizer's
        ``vocabulary_``) or an already ordered sequence of terms.

    Notes
    -----
    A mapping is written in ascending column-index order so that line ``k``
    of the file is the term of matrix column ``k``. Reloading the file as a
    list then reproduces the same term/column assignment.
    """
    if hasattr(voc, 'items'):
        terms = [term for term, _ in sorted(voc.items(), key=lambda pair: pair[1])]
    else:
        terms = list(voc)
    with open(path+'vocabulary', 'w+') as voc_file:
        for item in terms:
            voc_file.write("%s\n" % item)

def save_file_list(path, file_list):
    """Write the document names to ``path + 'file_list'``, one name per line."""
    with open(path+'file_list', 'w+') as out:
        out.writelines("%s\n" % name for name in file_list)
            
def save_info(path, tfidf_trans_matrix, file_list, voc):
    """Persist the matrix, the vocabulary and the file list under ``path``, in that order."""
    steps = (
        (save_matrix, tfidf_trans_matrix),
        (save_voc, voc),
        (save_file_list, file_list),
    )
    for write, payload in steps:
        write(path, payload)

In [18]:
# load saved info

def load_matrix(path, matrix_filename):
    """Load the sparse matrix stored at ``path + matrix_filename + '.npz'``."""
    npz_path = path + matrix_filename + '.npz'
    return load_npz(file=npz_path)

def load_file_list(path, file_list_filename):
    """Read the document names stored one per line in ``file_list_filename``.

    Parameters
    ----------
    path : str
        Directory containing the file; a trailing separator is optional.
    file_list_filename : str
        Name of the file inside ``path``.

    Returns
    -------
    list of str
        One entry per line, with the line terminator removed so that the
        names equal the ones that were written.
    """
    with open(os.path.join(path, file_list_filename), 'r') as list_file:
        # Strip only the newline: other whitespace may be part of a name.
        return [line.rstrip('\n') for line in list_file]

def load_voc(path, voc_filename):
    """Read the vocabulary terms stored one per line in ``voc_filename``.

    Parameters
    ----------
    path : str
        Directory containing the file; a trailing separator is optional.
    voc_filename : str
        Name of the vocabulary file inside ``path``. An absolute path is
        still honoured as-is, because ``os.path.join`` discards ``path`` when
        the second component is absolute.

    Returns
    -------
    list of str
        Terms in file order, with the line terminator removed. A term that
        still carried its newline could never equal a token.
    """
    with open(os.path.join(path, voc_filename), 'r') as voc_file:
        return [line.rstrip('\n') for line in voc_file]

def load_info(path, matrix_filename, file_list_filename, voc_filename):
    """Load the saved index from ``path`` and return ``(matrix, file_list, voc)``."""
    # Tuple elements are evaluated left to right: matrix, then file list,
    # then vocabulary.
    return (
        load_matrix(path, matrix_filename),
        load_file_list(path, file_list_filename),
        load_voc(path, voc_filename),
    )

In [20]:
# Example query. It is a one-element list rather than a bare string because
# search() hands it straight to the vectorizer's fit_transform.
search_str = ["duck"]

def search(search_str, n, tfidf_matrix, voc, doc_names=None):
    """Return the names of the ``n`` documents most similar to the query.

    Parameters
    ----------
    search_str : str or list of str
        The query. A bare string is wrapped into a one-element list; when a
        list holds several queries only the first one is scored.
    n : int
        Number of results to return.
    tfidf_matrix
        Document-term TF-IDF matrix, one row per document.
    voc
        Vocabulary passed to the query vectorizer. It must assign terms to
        the same columns as ``tfidf_matrix``.
    doc_names : sequence of str, optional
        Name of the document behind each matrix row. When omitted, the
        notebook-level ``file_list`` is used, as in the original code.

    Returns
    -------
    list of str
        Up to ``n`` document names, most similar first.
    """
    # The vectorizer expects an iterable of documents, not a single string.
    if isinstance(search_str, str):
        search_str = [search_str]
    if doc_names is None:
        # Backward-compatible fallback to the global defined elsewhere in the notebook.
        doc_names = file_list
    # NOTE(review): the vectorizer is fitted on the query, not on the corpus,
    # so corpus IDF weights are not applied to the query vector.
    tfidf_search_vectorizer = TfidfVectorizer(vocabulary=voc, stop_words='english', tokenizer=PorterTokenizer(), smooth_idf=True)
    tfidf_search_matrix = tfidf_search_vectorizer.fit_transform(search_str)
    # Column 0 holds the similarity of every document to the first query.
    scores = cosine_similarity(tfidf_matrix, tfidf_search_matrix)[:, 0]
    # sorted() is stable, so equally scored documents keep their row order.
    ranking = sorted(range(len(scores)), key=lambda row: scores[row], reverse=True)
    # Slice before the lookup so only the requested rows are mapped to names.
    return [doc_names[row] for row in ranking[:n]]

In [73]:
# process generated files
# Split each dump file in abs_path into one text file per <doc> element,
# written to cur_path and named after the article's first line.
abs_path = r'/home/yurii/projects/mownit2/lab6/outl/AB/'
cur_path = r'/home/yurii/projects/mownit2/lab6/a/'

# Compiled once, outside the loops. Raw strings avoid the invalid "\/" escape
# of the original literal; the patterns match exactly the same text.
doc_re = re.compile(r'<doc[^>]*>([^<]*)</doc>')
section_re = re.compile(r'Notes\n|Footnotes\n|Bibliography\n|References\n|External links\n|Further reading\n')
blank_run_re = re.compile(r'\n{3,}')

for filename in os.listdir(abs_path):
    with open(os.path.join(abs_path, filename), 'r') as dump_file:
        dump = dump_file.read()
    for article in doc_re.findall(dump):
        # Drop trailing-section headings, then collapse runs of three or
        # more newlines into a single one.
        article = section_re.sub(r'', article)
        article = blank_run_re.sub(r'\n', article)
        if article.startswith('\n'):
            article = article[1:]
        lines = article.splitlines()
        # The first line is used as the file name; '/' would otherwise be
        # read as a directory separator.
        title = lines[0].replace(r'/', ' ') if lines else ''
        if not title:
            # An empty body or blank first line gives no usable file name
            # (the original raised here), so the article is skipped.
            continue
        with open(os.path.join(cur_path, title), 'w+') as article_file:
            article_file.write(article)

In [15]:
# import nltk
# nltk.download()

In [16]:
# vector = CountVectorizer(min_df=1, stop_words='english', tokenizer=PorterTokenizer())
# trans_vect = vector.fit_transform(doc_list)
# dt = pd.DataFrame(trans_vect.toarray(), index=file_list, columns=vector.get_feature_names()).head(10)

In [14]:
# vector.get_feature_names()

In [29]:
# example = [
#     'In mathematics, 1 − 2 + 3 − 4 + ··· is the infinite series whose terms are the successive positive integers, given alternating signs. Using sigma summation notation the sum of the first "m" terms of the series can be expressed as',
#     'The infinite series diverges, meaning that its sequence of partial sums, , does not tend towards any finite limit. Nonetheless, in the mid-18th century, Leonhard Euler wrote what he admitted to be a paradoxical equation:',
#     "A rigorous explanation of this equation would not arrive until much later. Starting in 1890, Ernesto Cesàro, Émile Borel and others investigated well-defined methods to assign generalized sums to divergent series—including new interpretations of Euler's attempts."
# ]

# vect1 = CountVectorizer(min_df=1, stop_words='english', tokenizer=PorterTokenizer())
# # vect2 = CountVectorizer(min_df=1, stop_words='english')
# dtm1 = vect1.fit_transform(example)
# # dtm2 = vect2.fit_transform(example)

In [32]:
# pd.DataFrame(dtm1.toarray(), index=example, columns=vect1.get_feature_names()).head(10)
# vect1.get_feature_names()

In [33]:
# pd.DataFrame(dtm2.toarray(), index=example, columns=vect2.get_feature_names()).head(10)
# vect2.get_feature_names()

In [12]:
# dict_list = [col.OrderedDict([]) for i in range(0,len(file_list))]

# def read_words(words_file):
#     return [word for line in open(words_file, 'r') for word in line.split()]

# file_name = art_path + r'0'
# wl = read_words(file_name)