In [None]:
import os
import regex as re
import nltk
from nltk.corpus import stopwords

# Download the NLTK 'stopwords' corpus that stopwords.words('english') reads below.
nltk.download('stopwords')

def to_tokens(fileName):
    """Read the first column of a TSV file as a flat list of filtered tokens.

    The first tab-separated field of every row is lower-cased. A token is
    dropped when it is empty, when it starts with one of the excluded
    punctuation characters, or when it is an English stop word.

    Args:
        fileName: path of the tab-separated file to read.

    Returns:
        list[str]: the surviving tokens, in file order.
    """
    # Read inside a context manager so the file handle is always closed.
    with open(fileName) as tsv:
        items = [row.replace("\n", '').split('\t')[0].lower() for row in tsv]

    # re.match anchors only at the start and the class needs at least one
    # character, so a token passes when it is non-empty and its first
    # character is not one of the excluded ones.
    items = [x for x in items if re.match(r'[^\\<\\>\\:\\.\\,\\"\\\'\\$\\(\\)\\/\\\\-]+', x)]

    stop_words = set(stopwords.words('english'))
    # Tokens were lower-cased above, so they can be compared directly.
    return [w for w in items if w not in stop_words]

# Gather the tokens of every top-level .tsv file in dicts/ into one flat list.
tsv_names = [name for name in os.listdir("dicts/") if name.endswith('.tsv')]
all_tokens = []
for file in tsv_names:
    all_tokens.extend(to_tokens(f"dicts/{file}"))

# Peek at the first few tokens as a quick sanity check.
print(all_tokens[:5])


In [None]:
def to_sentences(fileName):
    """Read a TSV file into a list of sentences, each a list of filtered tokens.

    A blank line ends the current sentence. For every other row the first
    tab-separated field is lower-cased and kept unless it is empty, starts
    with one of the excluded punctuation characters, or is an English stop
    word.

    Args:
        fileName: path of the tab-separated file to read.

    Returns:
        list[list[str]]: the sentences in file order. The list always holds
        at least one sentence, and sentences may be empty (for example after
        a trailing blank line).
    """
    stop_words = set(stopwords.words('english'))
    sentences = [[]]

    # Read inside a context manager so the file handle is always closed.
    with open(fileName) as tsv:
        for row in tsv:
            if row == '\n':
                # Blank line: start a new sentence.
                sentences.append([])
                continue
            # Strip the line terminator first so that a row without any tab
            # does not keep "\n" inside its token (to_tokens strips it too).
            token = row.rstrip('\n').split('\t')[0].lower()
            # re.match anchors only at the start, so this checks that the token
            # is non-empty and does not begin with an excluded character.
            if not re.match(r'[^\\<\\>\\:\\.\\,\\"\\\'\\$\\(\\)\\/\\\\-]+', token):
                continue
            if token in stop_words:
                continue
            sentences[-1].append(token)
    return sentences

# Collect the sentences of every top-level .tsv file in dicts/ into one flat
# list. Extending in place avoids the repeated list copying that
# sum(list_of_lists, []) performs.
all_sentences = []

for file in [x for x in os.listdir("dicts/") if x.endswith('.tsv')]:
    all_sentences.extend(to_sentences(f"dicts/{file}"))

print(all_sentences[:5])

In [None]:
# Report how many sentences and tokens were collected.
for label, collection in (('all_sentences len: ', all_sentences), ('all_tokens len: ', all_tokens)):
    print(label, len(collection))

### Считаем частоты

In [None]:
# Count how many times each token occurs in all_tokens.
frequency_map = {}
for token in all_tokens:
    if token in frequency_map:
        frequency_map[token] += 1
    else:
        frequency_map[token] = 1

### Удаляем низкочастотные токены

In [None]:
# Drop low-frequency tokens. The comprehension condition is the KEEP condition,
# so only tokens that occur more than 6 times stay in the vocabulary.
frequency_map = {key: val for key, val in frequency_map.items() if val > 6}

### Строим матрицу «термин–документ» (term-document)

In [None]:
import pandas as pd
import os

# Build the term-document count matrix: one row per document, labelled
# "<folder>/<file>", and one column per token of the shortened vocabulary.
# The same label is used for creating and for filling a row, so the counts
# actually land in the matrix and equally named files from different folders
# do not overwrite each other.
vocabulary = list(frequency_map.keys())
column_of = {token: position for position, token in enumerate(vocabulary)}

document_ids = []
count_rows = []
for folder in [x for x in os.listdir("dicts/") if not x.endswith('.tsv')]:
    for file in os.listdir(f'dicts/{folder}/'):
        document_id = f"{folder}/{file}"
        print(document_id)
        counts = [0] * len(vocabulary)
        for token in to_tokens(f"dicts/{document_id}"):
            position = column_of.get(token)
            # Tokens missing from the shortened vocabulary are skipped.
            if position is not None:
                counts[position] += 1
        document_ids.append(document_id)
        count_rows.append(counts)

# Construct the frame once instead of growing it row by row.
termin_document = pd.DataFrame(count_rows, index=document_ids, columns=vocabulary)

In [None]:
# Display the term-document matrix.
termin_document

In [None]:
def update_frequency_map(tokens):
    """Add the occurrences of ``tokens`` to the module-level ``frequency_map``.

    The global dictionary is mutated in place; nothing is returned.

    Args:
        tokens: iterable of hashable tokens to count.
    """
    for token in tokens:
        if token in frequency_map:
            frequency_map[token] += 1
        else:
            frequency_map[token] = 1

In [None]:
# Start from an empty vocabulary; vectorize() below grows it with every document.
frequency_map = {}
def vectorize(sentences):
    """Turn one document into a relative-frequency vector over ``frequency_map``.

    The document's tokens are first added to the global ``frequency_map`` through
    ``update_frequency_map``. The result has one component per vocabulary
    token, in ``frequency_map`` order: the number of occurrences of that token
    in the document divided by the number of distinct tokens in the document,
    rounded to 3 decimals.

    Because the vocabulary grows on every call, a vector returned by an
    earlier call can be shorter than one returned later.

    Args:
        sentences: list of sentences, each a list of token strings.

    Returns:
        list[float]: one value per token currently in ``frequency_map``;
        all zeros when the document has no tokens.
    """
    # Flatten in linear time (sum(sentences, []) copies the list on every step).
    tokens = [token for sentence in sentences for token in sentence]
    update_frequency_map(tokens)

    # Count occurrences directly instead of filling the diagonal of a
    # tokens-by-vocabulary DataFrame and averaging its columns.
    occurrences = {}
    for token in tokens:
        occurrences[token] = occurrences.get(token, 0) + 1

    distinct_tokens = len(occurrences)
    if distinct_tokens == 0:
        # Nothing to average over: return zeros rather than NaN.
        return [0.0] * len(frequency_map)

    return [round(occurrences.get(token, 0) / distinct_tokens, 3) for token in frequency_map]

In [None]:
# Vectorize the first MAX_DOCUMENTS documents found under dicts/<folder>/.
MAX_DOCUMENTS = 100

vectors = []
count = 0
for folder in [x for x in os.listdir("dicts/") if not x.endswith('.tsv')]:
    # Stop the outer loop as well once enough documents were processed.
    if count >= MAX_DOCUMENTS:
        break
    for file in os.listdir(f'dicts/{folder}/'):
        if count >= MAX_DOCUMENTS:
            break
        count += 1
        print(f"{folder}/{file}")
        sentences = to_sentences(f'dicts/{folder}/{file}')
        vectors.append(vectorize(sentences))

# vectorize() appends new tokens to the shared vocabulary on every call, so
# earlier vectors are shorter than later ones. A token first seen in a later
# document occurs zero times in an earlier one, so padding with zeros gives
# exactly the vector the document has over the final vocabulary and makes all
# vectors the same length.
vocabulary_size = len(frequency_map)
vectors = [vector + [0.0] * (vocabulary_size - len(vector)) for vector in vectors]
print(len(vectors))

In [None]:
# Show how many document vectors were collected.
print(len(vectors))

In [None]:
from sklearn.decomposition import PCA
# Fit a 100-component PCA on the collected document vectors. fit() returns the
# estimator itself, so `pca` and `pca_model` name the same fitted object.
pca = PCA(n_components=100)
pca.fit(vectors)
pca_model = pca

In [None]:
import numpy as np
def vectorize(sentences):
    """Project one document's relative-frequency vector through the fitted PCA.

    The frequency vector has one component per token of the global
    ``frequency_map`` (in its order): the number of occurrences of the token
    in the document divided by the number of distinct tokens in the document,
    rounded to 3 decimals. Unlike the earlier definition, ``frequency_map`` is
    not modified. The vector is then transformed with ``pca_model``.

    Args:
        sentences: list of sentences, each a list of token strings. A path
            string is also accepted and is read with ``to_sentences`` first,
            so the function can be called like ``w2v_vectorize``.

    Returns:
        The PCA-transformed vector (first row of ``pca_model.transform``).
    """
    if isinstance(sentences, str):
        # Iterating over a string would treat every character of the path as
        # a token; read the file instead.
        sentences = to_sentences(sentences)

    occurrences = {}
    for sentence in sentences:
        for token in sentence:
            occurrences[token] = occurrences.get(token, 0) + 1

    # The denominator counts every distinct token of the document, including
    # tokens that are not part of frequency_map.
    distinct_tokens = len(occurrences)
    if distinct_tokens:
        vector = [round(occurrences.get(token, 0) / distinct_tokens, 3) for token in frequency_map]
    else:
        # Empty document: use zeros rather than NaN.
        vector = [0.0] * len(frequency_map)

    vector = pca_model.transform(np.array(vector).reshape(1, -1))[0]
    return vector

## Word2Vec (W2V)

In [None]:
from scipy import spatial
def cosinus(a, b):
    """Return the cosine similarity of vectors ``a`` and ``b``.

    Computed as one minus the cosine distance from ``scipy.spatial.distance``.
    """
    distance = spatial.distance.cosine(a, b)
    return 1 - distance

In [None]:
class Train:
    """Re-iterable wrapper around a collection of sentences.

    Every call to ``iter()`` starts a fresh pass over ``all_sentences``.
    """

    def __init__(self, all_sentences):
        self.all_sentences = all_sentences

    def __iter__(self):
        # Delegate to the wrapped collection, yielding one sentence at a time.
        yield from self.all_sentences

In [None]:
# Show the first five sentences that will be fed to word2vec.
print(all_sentences[:5])

In [None]:

import gensim.models
# Train word2vec on the collected sentences and persist the model.
sentence_iterator = Train(all_sentences)
model = gensim.models.Word2Vec(sentences=sentence_iterator)
print('word2vec trained')
# Make sure the target directory exists before saving into it.
os.makedirs('models', exist_ok=True)
model.save('models/word2vec_model')

# check word2vec
# Each probe holds a base word and three lists of comparison words:
# '1' similar words, '2' words from the same field, '3' different words.
atheism = {'word': 'atheism', '1': ['evolution', 'heretic'], '2': ['humanism', 'agnostic'], '3': ['communism', 'university']}
consistent = {'word': 'consistent', '1': ['compatible', 'agree'], '2': ['accordance', 'harmony'], '3': ['reconcile', 'control']}
book = {'word': 'book', '1': ['volume', 'words'], '2': ['christian', 'churches'], '3': ['drums', 'guitar']}
themes = [atheism, consistent, book]

for word_dict in themes:
    base_word = word_dict['word']
    # A word missing from the trained vocabulary has no vector; report it
    # instead of aborting the whole check with a KeyError.
    if base_word not in model.wv:
        print(base_word, 'is not in the word2vec vocabulary, skipping')
        continue
    vect_main_word = model.wv[base_word].tolist()

    result = []
    # Walk the lists in parallel: (similar, field, different) for each position.
    for candidates in zip(word_dict['1'], word_dict['2'], word_dict['3']):
        for candidate in candidates:
            if candidate not in model.wv:
                print(candidate, 'is not in the word2vec vocabulary, skipping')
                continue
            value = cosinus(vect_main_word, model.wv[candidate].tolist())
            result.append((candidate, value))

    # Most similar comparison words first.
    print(base_word, sorted(result, key=lambda a: a[1], reverse=True))

In [None]:
def w2v_vectorize(fileName, model=None):
    """Embed a document as the mean of the word2vec vectors of its tokens.

    Args:
        fileName: path of the TSV file; it is read with ``to_sentences``.
        model: optional, already loaded Word2Vec model. When omitted, the
            model is loaded from 'models/word2vec_model' on every call, so
            pass a loaded model when embedding many files.

    Returns:
        numpy.ndarray of length ``model.vector_size``: the element-wise mean
        of the vectors of all tokens known to the model, or a zero vector
        when the document contains no known token.
    """
    if model is None:
        model = gensim.models.Word2Vec.load('models/word2vec_model')

    # Tokens without a vector in the model are skipped.
    vectors = [
        model.wv[token]
        for sentence in to_sentences(fileName)
        for token in sentence
        if token in model.wv
    ]

    if not vectors:
        return np.zeros(model.vector_size)

    # Average over the number of token vectors (not over the vector dimension).
    return np.mean(vectors, axis=0)

In [None]:
# Compare both vectorizers on two texts from the same folder (alt.atheism)
# and on one text from another folder (rec.autos).
# vectorize() is documented to work on sentences, so the files are read with
# to_sentences() first; w2v_vectorize() takes the path itself.
vec_text1_freq = vectorize(to_sentences("test/alt.atheism/53068.tsv"))
vec_text1_w2v = w2v_vectorize("test/alt.atheism/53068.tsv")
vec_text2_freq = vectorize(to_sentences("dicts/alt.atheism/53313.tsv"))
vec_text2_w2v = w2v_vectorize("dicts/alt.atheism/53313.tsv")
vec_text3_freq = vectorize(to_sentences("test/rec.autos/103007.tsv"))
vec_text3_w2v = w2v_vectorize("test/rec.autos/103007.tsv")

# Text 1 vs. text 2: same folder.
freq_similarity = cosinus(vec_text1_freq, vec_text2_freq)
word2vec_similarity = cosinus(vec_text1_w2v, vec_text2_w2v)
# Text 1 vs. text 3: different folders.
freq_antisimilarity = cosinus(vec_text1_freq, vec_text3_freq)
word2vec_antisimilarity = cosinus(vec_text1_w2v, vec_text3_w2v)

print(f'Similarity for frequency vectorizer: {freq_similarity}')
print(f'Similarity for word2vec vectorizer: {word2vec_similarity}')
print(f'Antisimilarity for frequency vectorizer: {freq_antisimilarity}')
print(f'Antisimilarity for word2vec vectorizer: {word2vec_antisimilarity}')


In [None]:
# Embed every document under test/<folder>/ with word2vec and store one
# tab-separated row per document: "<folder>/<file>", then the vector
# components rounded to 5 decimals.
rows = []
count = 0
for folder in [x for x in os.listdir("test/") if not x.endswith('.tsv')]:
    for file in os.listdir(f'test/{folder}/'):
        print(file, count)
        count += 1
        # w2v_vectorize reads the file itself, so it is not opened here.
        vector = w2v_vectorize(f"test/{folder}/{file}")
        row = ''.join('\t' + str(round(embedding, 5)) for embedding in vector)
        rows.append(f"{folder}/{file}" + row + '\n')
result = ''.join(rows)

# Make sure the output directory exists before writing into it.
os.makedirs('assets/annotated-corpus', exist_ok=True)
with open('assets/annotated-corpus/test-embeddings.tsv', 'w') as result_file:
    result_file.write(result)

In [None]:
# Embed every document under train/<folder>/ with word2vec and store one
# tab-separated row per document: "<folder>/<file>", then the vector
# components rounded to 5 decimals.
rows = []
count = 0
for folder in [x for x in os.listdir("train/") if not x.endswith('.tsv')]:
    for file in os.listdir(f'train/{folder}/'):
        print(file, count)
        count += 1
        # w2v_vectorize reads the file itself, so it is not opened here.
        vector = w2v_vectorize(f"train/{folder}/{file}")
        row = ''.join('\t' + str(round(embedding, 5)) for embedding in vector)
        rows.append(f"{folder}/{file}" + row + '\n')
result = ''.join(rows)

# Make sure the output directory exists before writing into it.
os.makedirs('assets/annotated-corpus', exist_ok=True)
with open('assets/annotated-corpus/train-embeddings.tsv', 'w') as result_file:
    result_file.write(result)