In [1]:
import os
import re
import pymorphy2
from stop_words import get_stop_words
from nltk.tokenize import TreebankWordTokenizer 

In [2]:
# Shared NLP resources, created once and used as default arguments by the
# functions defined below.
# Tokenizer used by normalization().
token = TreebankWordTokenizer()
# Morphological analyzer used by lemmatization().
morph = pymorphy2.MorphAnalyzer()
# Stop-word collection for Russian, as returned by the stop_words package.
stop_words = get_stop_words('russian')

In [3]:
def normalization(text, token = token):
    """Lowercase ``text``, delete noise characters and split it into tokens.

    Before tokenizing, the following characters are removed: the punctuation
    marks , . ! ( ) " ' ; : @ ? № — « », the digits 0-9 and the Latin letters
    a-z (matched after lowercasing).

    Args:
        text: Raw input string.
        token: Object exposing ``tokenize(str)``; defaults to the
            module-level tokenizer.

    Returns:
        list: Tokens in order of appearance; duplicates are kept.
    """
    text = text.lower().strip()
    text = re.sub('[,.!()"\';:@?№—0-9«»a-z]', '', text)
    # Raw string on purpose: '\s' inside a plain literal is an invalid escape
    # sequence (DeprecationWarning since Python 3.6, SyntaxWarning on 3.12+).
    return [re.sub(r'\s+', ' ', word) for word in token.tokenize(text)]

In [4]:
def uni_words(filename, stop_words = stop_words,
              texts_dir = '/Users/uliamiheeva/Desktop/hw1/texts/'):
    """Collect the unique non-stop-word tokens of one text file.

    Args:
        filename: Name of the file inside ``texts_dir``.
        stop_words: Collection of words to leave out of the result.
        texts_dir: Directory that holds the text files; defaults to the
            location the notebook was written against.

    Returns:
        set: Every distinct token from the whole file that is not a stop
            word. Empty for an empty file.
    """
    # Hashed copy so each membership test does not rescan the stop-word list.
    excluded = frozenset(stop_words)
    unique = set()
    with open(os.path.join(texts_dir, filename), encoding='utf-8') as f:
        for line in f:
            # The word loop must sit inside the line loop so that every line
            # contributes, not only the last one read.
            for word in normalization(line):
                if word not in excluded:
                    unique.add(word)
    return unique

In [5]:
def lemmatization(word, morph = morph):
    """Return the normal form of the first parse that ``morph`` gives for ``word``.

    Args:
        word: Word form to analyze.
        morph: Analyzer exposing ``parse(word)``; defaults to the module-level
            analyzer.

    Returns:
        The ``normal_form`` attribute of the first parse result.
    """
    first_parse = morph.parse(word)[0]
    return first_parse.normal_form

In [6]:
# Inverted index shared across cells: lemma -> list of document numbers
# (numbering starts at 1). inverted_index() fills this dict in place.
d = {}

In [7]:
def inverted_index(stop_words = stop_words):
    """Build the lemma -> document-number index over the texts directory.

    Files are numbered from 1 in sorted filename order. Every word returned
    by ``uni_words`` is lemmatized, and the file's number is added to that
    lemma's list at most once.

    Args:
        stop_words: Words to exclude; forwarded to ``uni_words``.

    Returns:
        dict: The module-level ``d``, updated in place, mapping each lemma to
            the list of document numbers in which it occurs.
    """
    # os.listdir() yields entries in arbitrary order; sorting keeps document
    # numbers stable between runs, so a re-run cannot mix numberings in ``d``.
    files = sorted(os.listdir('/Users/uliamiheeva/Desktop/hw1/texts/'))
    for i, filename in enumerate(files, start=1):
        for word in uni_words(filename, stop_words):
            lemma = lemmatization(word)
            if not lemma:
                continue
            postings = d.setdefault(lemma, [])
            # Different word forms from the same file can share one lemma.
            if i not in postings:
                postings.append(i)
    return d

In [8]:
def write_file(d, stop_words = stop_words):
    """Write the index to ``invindex.csv`` and the stop words to ``stop_words.txt``.

    Both files are opened for writing with UTF-8 encoding under the given
    relative names, so an existing file of the same name is overwritten.

    Args:
        d: Mapping of lemma -> iterable of document numbers. One line of the
            form ``lemma:n1, n2, ...`` is written per key.
        stop_words: Iterable of strings, written one per line.
    """
    with open('invindex.csv', 'w', encoding='utf-8') as fw:
        for lemma, doc_ids in d.items():
            fw.write(lemma + ':' + ', '.join(map(str, doc_ids)) + '\n')
    with open('stop_words.txt', 'w', encoding='utf-8') as ff:
        # Plain newline separator: joining on '\n ' would prefix every word
        # after the first with a stray space.
        ff.write('\n'.join(stop_words))

# Entry point: build the inverted index, then write it and the stop-word list
# to disk.
if __name__ == '__main__':
    d = inverted_index()
    write_file(d)