In [5]:
import os
import regex as re
import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')

def to_tokens(fileName):
    tsv = open(fileName).readlines()

    items = [row.replace("\n", '').split('\t')[0].lower() for row in tsv]
    items = [x for x in items if re.match(r'[^\\<\\>\\:\\.\\,\\"\\\'\\$\\(\\)\\/\\\\-]+', x)]

    stop_words = set(stopwords.words('english'))
    items = [w for w in items if not w.lower() in stop_words]
    return items

all_tokens = []
for file in [x for x in os.listdir("dicts/") if x.endswith('.tsv')]:
    all_tokens += to_tokens(f"dicts/{file}")
    
print(all_tokens[:5])


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ander\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


['mathew', 'subject', 'alt', 'atheism', 'faq']


In [6]:
def to_sentences(fileName):
    tsv = open(fileName).readlines()

    items = [row.split('\t')[0].lower() for row in tsv]
    items = [x for x in items if re.match(r'[^\\<\\>\\:\\.\\,\\"\\\'\\$\\(\\)\\/\\\\-]+', x)]
    stop_words = set(stopwords.words('english'))
    
    items = [w for w in items if not w.lower() in stop_words]
    sentences = [[]]

    for item in items:
        if item == '\n':
            sentences.append([])
        else:
            sentences[-1].append(item)
    return sentences

all_sentences = []

for file in [x for x in os.listdir("dicts/") if x.endswith('.tsv')]:
    all_sentences.append(to_sentences(f"dicts/{file}"))

all_sentences = sum(all_sentences, [])

print(all_sentences[:5])

[['mathew', 'subject', 'alt'], ['atheism', 'faq', 'atheist', 'resources', 'summary', 'books', 'addresses', 'music', 'anything', 'related', 'atheism', 'keywords', 'faq', 'atheism', 'books', 'music', 'fiction', 'addresses', 'contacts', 'expires', 'thu', '29', 'apr', '1993', '11', '57', '19', 'gmt', 'distribution', 'world', 'organization', 'mantis', 'consultants', 'cambridge'], ['uk'], ['telephone', '512', 'archive-name', 'atheism', 'resources', 'alt-atheism-archive-name', 'resources', 'last-modified', '11', 'december', '1992', 'version', '1', '0', 'atheist', 'resources', 'addresses', 'atheist', 'organizations', 'usa', 'freedom', 'religion', 'foundation', 'darwin', 'fish', 'bumper', 'stickers', 'assorted', 'atheist', 'paraphernalia', 'available', 'freedom', 'religion', 'foundation', 'us'], ['write', 'ffrf', 'p', 'box', '750', 'madison', 'wi', '53701']]


In [7]:
print('all_sentences len: ', len(all_sentences))
print('all_tokens len: ', len(all_tokens))

all_sentences len:  248590
all_tokens len:  1969306


### Считаем частоты

In [33]:
frequency_map = {}
for token in all_tokens:
    frequency_map[token] = frequency_map.get(token, 0) + 1

### Удаляем низкочастотные токены

In [34]:
frequency_map = {key:val for key, val in frequency_map.items() if val <= 6}

### Строим матрицу Termin-Document

In [10]:
import pandas as pd
import os

termin_document = pd.DataFrame(columns=frequency_map.keys())

for folder in [x for x in os.listdir("dicts/") if not x.endswith('.tsv')]:
    for file in os.listdir(f'dicts/{folder}/'):
        print(f"{folder}/{file}")
        termin_document.loc[file] = 0
        for token in to_tokens(f"dicts/{folder}/" + file):
            try:
                termin_document.at[f"{folder}/{file}", token] += 1
            except:
                # токена нет в укороченном словаре
                pass

alt.atheism/53068.tsv
alt.atheism/53257.tsv
alt.atheism/53260.tsv
alt.atheism/53261.tsv
alt.atheism/53262.tsv
alt.atheism/53265.tsv
alt.atheism/53272.tsv
alt.atheism/53276.tsv
alt.atheism/53277.tsv
alt.atheism/53278.tsv
alt.atheism/53279.tsv
alt.atheism/53280.tsv
alt.atheism/53293.tsv
alt.atheism/53294.tsv
alt.atheism/53297.tsv
alt.atheism/53302.tsv
alt.atheism/53313.tsv
alt.atheism/53315.tsv
alt.atheism/53316.tsv
alt.atheism/53317.tsv
alt.atheism/53319.tsv
alt.atheism/53320.tsv
alt.atheism/53321.tsv
alt.atheism/53322.tsv
alt.atheism/53324.tsv
alt.atheism/53325.tsv
alt.atheism/53326.tsv
alt.atheism/53327.tsv
alt.atheism/53328.tsv
alt.atheism/53329.tsv
alt.atheism/53331.tsv
alt.atheism/53332.tsv
alt.atheism/53333.tsv
alt.atheism/53335.tsv
alt.atheism/53337.tsv
alt.atheism/53338.tsv
alt.atheism/53339.tsv
alt.atheism/53340.tsv
alt.atheism/53342.tsv
alt.atheism/53343.tsv
alt.atheism/53344.tsv
alt.atheism/53346.tsv
alt.atheism/53349.tsv
alt.atheism/53350.tsv
alt.atheism/53353.tsv
alt.atheis

KeyboardInterrupt: 

In [11]:
termin_document

Unnamed: 0,alt-atheism-archive-name,paraphernalia,ffrf,53701,256-8900,moulded,postpaid,7119,91605,figmo@netcom.com,...,"sexists],",neo-fruedian,dispensing,reponsible,naturalists,anthro,usouthdakota,amh,muddle,rebuttals
53068.tsv,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
53257.tsv,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
53260.tsv,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
53261.tsv,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
53262.tsv,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
68086.tsv,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
68087.tsv,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
68088.tsv,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
68089.tsv,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [43]:
def update_frequency_map(tokens):
    for token in tokens:
        frequency_map[token] = frequency_map.get(token, 0) + 1
    #frequency_map = {key:val for key, val in frequency_map.items() if val <= 6}
    #return frequency_map

In [64]:
frequency_map = {}
def vectorize(sentences):
    #print(sentences)
    #print(sum(sentences, []))
    update_frequency_map(sum(sentences, []))
    
    freq_matrix = pd.DataFrame(columns=frequency_map.keys())
    for sentence in sentences:
        for token in sentence:
            freq_matrix.loc[token] = 0
    for sentence in sentences:
        for token in sentence:
            try:
                freq_matrix.at[token, token] += 1
            except:
                pass
    vector = []
    for token in frequency_map:
        vector.append(round(freq_matrix[token].mean(), 3))

    return vector

In [66]:
vectors = []
count = 0
for folder in [x for x in os.listdir("dicts/") if not x.endswith('.tsv')]:
    for file in os.listdir(f'dicts/{folder}/'):
        count+=1
        if count >= 101: break
        print(f"{folder}/{file}")
        tokens = to_sentences(f'dicts/{folder}/{file}')
        vector = vectorize(tokens)
        vectors.append(vector)
print(len(vectors))

alt.atheism/53068.tsv
alt.atheism/53257.tsv
alt.atheism/53260.tsv
alt.atheism/53261.tsv
alt.atheism/53262.tsv
alt.atheism/53265.tsv
alt.atheism/53272.tsv
alt.atheism/53276.tsv
alt.atheism/53277.tsv
alt.atheism/53278.tsv
alt.atheism/53279.tsv
alt.atheism/53280.tsv
alt.atheism/53293.tsv
alt.atheism/53294.tsv
alt.atheism/53297.tsv
alt.atheism/53302.tsv
alt.atheism/53313.tsv
alt.atheism/53315.tsv
alt.atheism/53316.tsv
alt.atheism/53317.tsv
alt.atheism/53319.tsv
alt.atheism/53320.tsv
alt.atheism/53321.tsv
alt.atheism/53322.tsv
alt.atheism/53324.tsv
alt.atheism/53325.tsv
alt.atheism/53326.tsv
alt.atheism/53327.tsv
alt.atheism/53328.tsv
alt.atheism/53329.tsv
alt.atheism/53331.tsv
alt.atheism/53332.tsv
alt.atheism/53333.tsv
alt.atheism/53335.tsv
alt.atheism/53337.tsv
alt.atheism/53338.tsv
alt.atheism/53339.tsv
alt.atheism/53340.tsv
alt.atheism/53342.tsv
alt.atheism/53343.tsv
alt.atheism/53344.tsv
alt.atheism/53346.tsv
alt.atheism/53349.tsv
alt.atheism/53350.tsv
alt.atheism/53353.tsv
alt.atheis

In [67]:
print(len(vectors))

100


In [68]:
from sklearn.decomposition import PCA
pca = PCA(n_components=100)
pca_model = pca.fit(vectors)

In [76]:
import numpy as np
def vectorize(sentences):
    freq_matrix = pd.DataFrame(columns=frequency_map.keys())
    for sentence in sentences:
        for token in sentence:
            freq_matrix.loc[token] = 0
    for sentence in sentences:
        for token in sentence:
            try:
                freq_matrix.at[token, token] += 1
            except:
                pass
    vector = []
    for token in frequency_map:
        vector.append(round(freq_matrix[token].mean(), 3))
    vector = pca_model.transform(np.array(vector).reshape(1, -1))[0]
    return vector

## W2W

In [70]:
from scipy import spatial
def cosinus(a, b):
    return 1 - spatial.distance.cosine(a, b)

In [71]:
class Train:
    def __init__(self, all_sentences):
        self.all_sentences = all_sentences

    def __iter__(self):
        for sentence in self.all_sentences:
            yield sentence

In [55]:
print(all_sentences[:5])

[['mathew', 'subject', 'alt'], ['atheism', 'faq', 'atheist', 'resources', 'summary', 'books', 'addresses', 'music', 'anything', 'related', 'atheism', 'keywords', 'faq', 'atheism', 'books', 'music', 'fiction', 'addresses', 'contacts', 'expires', 'thu', '29', 'apr', '1993', '11', '57', '19', 'gmt', 'distribution', 'world', 'organization', 'mantis', 'consultants', 'cambridge'], ['uk'], ['telephone', '512', 'archive-name', 'atheism', 'resources', 'alt-atheism-archive-name', 'resources', 'last-modified', '11', 'december', '1992', 'version', '1', '0', 'atheist', 'resources', 'addresses', 'atheist', 'organizations', 'usa', 'freedom', 'religion', 'foundation', 'darwin', 'fish', 'bumper', 'stickers', 'assorted', 'atheist', 'paraphernalia', 'available', 'freedom', 'religion', 'foundation', 'us'], ['write', 'ffrf', 'p', 'box', '750', 'madison', 'wi', '53701']]


In [61]:

import gensim.models
sentence_iterator = Train(all_sentences)
model = gensim.models.Word2Vec(sentences=sentence_iterator)
print('word2vec trained')
model.save('models/word2vec_model')
#print(model.wv.key_to_index)
# check word2vec
atheism = {'word': 'atheism', '1': ['evolution', 'heretic'], '2': ['humanism', 'agnostic'], '3': ['communism', 'university']}
consistent = {'word': 'consistent', '1': ['compatible', 'agree'], '2': ['accordance', 'harmony'], '3': ['reconcile', 'control']}
book = {'word': 'book', '1': ['volume', 'words'], '2': ['christian', 'churches'], '3': ['drums', 'guitar']}
themes = [atheism, consistent, book]

for index, word_dict in enumerate(themes):
    vector = []
    vect_main_word = model.wv[word_dict['word']].tolist()
    vector.append(vect_main_word)
    base_word = word_dict['word']
    result = []
    length = 2
    for i in range(length):

        similar_word = word_dict['1'][i]
        field_word = word_dict['2'][i]
        different_word = word_dict['3'][i]

        vect_similar_word = model.wv[similar_word].tolist()
        vect_field_word = model.wv[field_word].tolist()
        vect_different_word = model.wv[different_word].tolist()
        vector.extend([vect_similar_word, vect_field_word, vect_different_word])

        value = cosinus(vect_main_word, vect_similar_word)
        result.append((similar_word, value))
        value = cosinus(vect_main_word, vect_field_word)
        result.append((field_word, value))
        value = cosinus(vect_main_word, vect_different_word)
        result.append((different_word, value))

    print(base_word, sorted(result, key=lambda a: a[1], reverse=True))

word2vec trained
atheism [('agnostic', 0.8093706858276998), ('communism', 0.669348926320984), ('heretic', 0.6527343241359057), ('evolution', 0.6341232807459849), ('humanism', 0.42647723746018484), ('university', 0.11553240689522437)]
consistent [('accordance', 0.8807823692949056), ('harmony', 0.8368991100808967), ('reconcile', 0.801103698647636), ('agree', 0.7327112887804403), ('compatible', 0.6743904660857369), ('control', 0.6662102302895054)]
book [('words', 0.5200076367867695), ('christian', 0.4558829184907274), ('churches', 0.4102790955611755), ('guitar', 0.34240039436476954), ('drums', 0.27926897557609853), ('volume', 0.11791856253972344)]


In [72]:
def w2v_vectorize(fileName):
    model = gensim.models.Word2Vec.load('models/word2vec_model')
    sentence_list = to_sentences(fileName)
    vectors = []
    for sentence in sentence_list:
        for token in sentence:
            try:
                vectors.append(model.wv[token.lower()])
            except Exception as e:
                pass

    v = np.zeros(model.vector_size)
    v = (np.array([sum(x) for x in zip(*vectors)])) / v.size

    return v

In [92]:
vec_text1_freq = vectorize("test/alt.atheism/53068.tsv")
vec_text1_w2v = w2v_vectorize("test/alt.atheism/53068.tsv")
vec_text2_freq = vectorize("dicts/alt.atheism/53313.tsv")
vec_text2_w2v = w2v_vectorize("dicts/alt.atheism/53313.tsv")
vec_text3_freq = vectorize("test/rec.autos/103007.tsv")
vec_text3_w2v = w2v_vectorize("test/rec.autos/103007.tsv")
freq_similarity = cosinus(vec_text1_freq, vec_text2_freq)
word2vec_similarity = cosinus(vec_text1_w2v, vec_text2_w2v)
freq_antisimilarity = cosinus(vec_text1_freq, vec_text3_freq)
word2vec_antisimilarity = cosinus(vec_text1_w2v, vec_text3_w2v)
print(f'Similarity for frequency vectorizer: {freq_similarity}')
print(f'Similarity for word2vec vectorizer: {word2vec_similarity}')
print(f'Antisimilarity for frequency vectorizer: {freq_antisimilarity}')
print(f'Antisimilarity for word2vec vectorizer: {word2vec_antisimilarity}')


Similarity for frequency vectorizer: 0.8928649804804247
Similarity for word2vec vectorizer: 0.9246327364965707
Antisimilarity for frequency vectorizer: 0.7874033638420531
Antisimilarity for word2vec vectorizer: 0.7497262814745476


In [98]:
result = ""
count = 0
for folder in [x for x in os.listdir("test/") if not x.endswith('.tsv')]:
    for file in os.listdir(f'test/{folder}/'):
        print(file, count)
        count += 1
        with open(f"test/{folder}/{file}") as fin:
            vector = w2v_vectorize(f"test/{folder}/{file}")
            row = ""
            for embedding in vector:
                row += '\t' + str(round(embedding, 5))
            result += (f"{folder}/{file}" + row + '\n')
with open('assets/annotated-corpus/test-embeddings.tsv', 'w') as result_file:
    result_file.write(result)

53068.tsv 0
53257.tsv 1
53260.tsv 2
53261.tsv 3
53262.tsv 4
53265.tsv 5
53272.tsv 6
53276.tsv 7
53277.tsv 8
53278.tsv 9
53279.tsv 10
53280.tsv 11
53293.tsv 12
53294.tsv 13
53297.tsv 14
53302.tsv 15
53313.tsv 16
53315.tsv 17
53316.tsv 18
53317.tsv 19
53319.tsv 20
53320.tsv 21
53321.tsv 22
53322.tsv 23
53324.tsv 24
53325.tsv 25
53326.tsv 26
53327.tsv 27
53328.tsv 28
53329.tsv 29
53331.tsv 30
53332.tsv 31
53333.tsv 32
53335.tsv 33
53337.tsv 34
53338.tsv 35
53339.tsv 36
53340.tsv 37
53342.tsv 38
53343.tsv 39
53344.tsv 40
53346.tsv 41
53349.tsv 42
53350.tsv 43
53353.tsv 44
53355.tsv 45
53356.tsv 46
53357.tsv 47
53358.tsv 48
53359.tsv 49
53361.tsv 50
53362.tsv 51
53363.tsv 52
53364.tsv 53
53365.tsv 54
53393.tsv 55
53394.tsv 56
53396.tsv 57
53397.tsv 58
53398.tsv 59
53399.tsv 60
53400.tsv 61
53401.tsv 62
53402.tsv 63
53403.tsv 64
53404.tsv 65
53405.tsv 66
53406.tsv 67
53408.tsv 68
53409.tsv 69
53410.tsv 70
53411.tsv 71
53412.tsv 72
53413.tsv 73
53414.tsv 74
53415.tsv 75
53416.tsv 76
53417.tsv

In [100]:
result = ""
count = 0
for folder in [x for x in os.listdir("train/") if not x.endswith('.tsv')]:
    for file in os.listdir(f'train/{folder}/'):
        print(file, count)
        count += 1
        with open(f"train/{folder}/{file}") as fin:
            vector = w2v_vectorize(f"train/{folder}/{file}")
            row = ""
            for embedding in vector:
                row += '\t' + str(round(embedding, 5))
            result += (f"{folder}/{file}" + row + '\n')
with open('assets/annotated-corpus/train-embeddings.tsv', 'w') as result_file:
    result_file.write(result)

49960.tsv 0
51060.tsv 1
51119.tsv 2
51120.tsv 3
51121.tsv 4
51122.tsv 5
51123.tsv 6
51124.tsv 7
51125.tsv 8
51126.tsv 9
51127.tsv 10
51128.tsv 11
51130.tsv 12
51131.tsv 13
51132.tsv 14
51133.tsv 15
51134.tsv 16
51135.tsv 17
51136.tsv 18
51139.tsv 19
51140.tsv 20
51141.tsv 21
51142.tsv 22
51143.tsv 23
51144.tsv 24
51145.tsv 25
51146.tsv 26
51147.tsv 27
51148.tsv 28
51149.tsv 29
51150.tsv 30
51151.tsv 31
51152.tsv 32
51153.tsv 33
51154.tsv 34
51155.tsv 35
51156.tsv 36
51157.tsv 37
51158.tsv 38
51159.tsv 39
51160.tsv 40
51161.tsv 41
51162.tsv 42
51163.tsv 43
51164.tsv 44
51165.tsv 45
51169.tsv 46
51170.tsv 47
51171.tsv 48
51172.tsv 49
51173.tsv 50
51174.tsv 51
51175.tsv 52
51176.tsv 53
51177.tsv 54
51178.tsv 55
51179.tsv 56
51180.tsv 57
51181.tsv 58
51182.tsv 59
51183.tsv 60
51184.tsv 61
51185.tsv 62
51186.tsv 63
51187.tsv 64
51188.tsv 65
51189.tsv 66
51190.tsv 67
51191.tsv 68
51192.tsv 69
51193.tsv 70
51194.tsv 71
51195.tsv 72
51196.tsv 73
51197.tsv 74
51198.tsv 75
51199.tsv 76
51200.tsv