In [1]:
import os
import re

import numpy as np
import pandas as pd

## 1. Собираем все токены

In [2]:
from nltk.corpus import stopwords


# Annotation lines whose FIRST character is in this list are skipped by the
# readers below. The check is made on a single character, so the
# three-character entry '...' can never match on its own.
filter_out_tokens = ['\n', '.', ',', '!', '?', '...', ':', ';']
# English stop words from NLTK; tokens found in this set are dropped.
stop_words = set(stopwords.words('english'))
# NOTE(review): not referenced anywhere in the visible part of the notebook.
broken_files = set()

def data_to_text(data, column='lemma'):
    """Join every value of ``data[column]`` into one space-separated string."""
    column_values = data[column].values
    separator = ' '
    return separator.join(column_values)

def read_data_with_filter(filename):
    """Read one annotated document and return its filtered tokens and sentences.

    Each non-blank line is expected to hold three tab-separated fields
    (token, stem, lemma); a blank line separates sentences.

    A record is dropped when:
      * the line starts with a character listed in ``filter_out_tokens``;
      * the line does not split into exactly three fields (the problem is
        printed and the line is skipped);
      * the lemma contains punctuation or digits;
      * the lemma is empty or is exactly ``ca`` / ``re`` (presumably tokenizer
        leftovers of contractions - TODO confirm);
      * the lower-cased token is in ``stop_words``.

    Underscores inside a lemma are replaced by spaces.

    Args:
        filename: path of the annotation file.

    Returns:
        ``(frame, sentences)``: ``frame`` is a DataFrame with the columns
        ``token``, ``stem`` and ``lemma`` (all lower-cased), one row per kept
        record; ``sentences`` is a list of non-empty lists of lemmas.
    """
    rows = []
    sentences = []
    sentence = []

    with open(filename, 'r') as f:
        for line in f:
            # A blank line closes the current sentence.
            if line == '\n':
                if sentence:
                    sentences.append(sentence)
                    sentence = []
                continue
            if line[0] in filter_out_tokens:
                continue
            try:
                token, stem, lemma = line.split('\t')
            except ValueError as e:
                # Malformed record: report it and skip it, so that values left
                # over from the previous line are never appended.
                print(filename, line, e)
                continue

            lemma = lemma.replace('_', ' ')
            # Lemmas with punctuation are rejected; digits are discarded as well.
            if re.search(r'[^\w\s]|\d', lemma):
                continue
            lemma = lemma.strip().lower()
            # 'ca' / 're' are matched as whole lemmas only; matching them as
            # substrings would also drop words such as "score" or "research".
            if not lemma or lemma in ('ca', 're'):
                continue
            # Stop words are lower-case, so compare the lower-cased token.
            if token.lower() in stop_words:
                continue

            rows.append((token.lower(), stem.lower(), lemma))
            sentence.append(lemma)

    # Keep the last sentence when the file does not end with a blank line.
    if sentence:
        sentences.append(sentence)

    return pd.DataFrame(rows, columns=['token', 'stem', 'lemma']), sentences

In [None]:
# Read every annotated file of every topic and collect all words.
topics = ['baseball', 'electronics', 'hockey', 'med', 'motorcycles', 'space']
# Assembled with os.path.join so the path works on Windows and POSIX alike
# (the previous literal mixed '/' and '\\' separators).
data_path = os.path.join('..', '..', 'assets', 'annotated-corpus')

corpuses = []          # one token DataFrame per document
corpus_sentences = []  # lemma sentences of all documents, in reading order

for topic in topics:
    cur_data_path = os.path.join(data_path, topic)
    # Sorted so that the document order (and the d0, d1, ... numbering later
    # derived from `corpuses`) does not depend on the file system.
    files = sorted(os.listdir(cur_data_path))
    filenames = [os.path.join(cur_data_path, fname) for fname in files]
    print('number of files: ', len(filenames))
    for f in filenames:
        corpus, sentences = read_data_with_filter(f)
        corpuses.append(corpus)
        corpus_sentences.extend(sentences)

# Vocabulary: every distinct lemma seen in any document.
unique_words = set()

for c in corpuses:
    unique_tokens = pd.unique(c['lemma'])
    unique_words.update(unique_tokens.tolist())
print('num of unique words: ', len(unique_words))

## Задание 1

In [None]:
# Display the collected vocabulary (the whole set is rendered as cell output).
unique_words

In [None]:
import tqdm
def get_term_document_matrix(unique_words: set, documents: list[pd.DataFrame]):
    """Build a term-document count matrix.

    Args:
        unique_words: vocabulary; one output row is produced per word, in
            iteration order.
        documents: per-document DataFrames with a ``lemma`` column.

    Returns:
        DataFrame with the columns ``word``, ``freq`` (total count over all
        documents) and ``d0`` ... ``d{n-1}`` (count inside each document).
        ``word`` is kept as a regular column and the index is the default
        integer range. An empty vocabulary yields an empty frame with the
        same columns.
    """
    print(len(documents))
    # Count the lemmas of each document once, instead of filtering every
    # document DataFrame again for every single word.
    doc_counts = [d['lemma'].value_counts().to_dict() for d in documents]

    result = []
    for word in tqdm.tqdm(unique_words):
        term_doc_vector = [counts.get(word, 0) for counts in doc_counts]
        result.append([word, sum(term_doc_vector), *term_doc_vector])

    columns = ['word', 'freq', *[f'd{i}' for i in range(len(documents))]]
    return pd.DataFrame(result, columns=columns)

# Only the first 100 documents are used to build the matrix.
term_document_matrix_df = get_term_document_matrix(unique_words, corpuses[:100])
term_document_matrix_df

In [None]:
# Show only the words whose total frequency is greater than 2.
term_document_matrix_df[term_document_matrix_df['freq'] > 2]

In [84]:
# Persist the matrix as CSV; with the default arguments the DataFrame index is
# written as an extra, unnamed first column.
term_document_matrix_df.to_csv('../../assets/term_document_matrix.csv')

## Задание 2

In [None]:
def get_document_term_vector(term_document_matrix_df, doc_name):
    """Return the ``word`` and ``doc_name`` columns for the words of one document.

    Only the rows whose count in column ``doc_name`` is greater than zero are
    kept; the input frame is not modified.
    """
    occurs_in_doc = term_document_matrix_df[doc_name] > 0
    return term_document_matrix_df.loc[occurs_in_doc, ['word', doc_name]]

# Example: the words (with their counts) that occur in document column 'd90'.
doc_term_vec = get_document_term_vector(term_document_matrix_df, 'd90')
doc_term_vec

In [None]:
from ..tokenizer.tokenizer import tokenize_text
def read_file(file_path):
    """Read a text file and return its content with quoting noise removed.

    The characters ``>``, ``|`` and ``-`` are deleted and every newline is
    replaced by a single space.
    """
    cleanup_table = str.maketrans({'>': None, '|': None, '-': None, '\n': ' '})
    with open(file_path, 'r') as source:
        raw_text = source.read()
    return raw_text.translate(cleanup_table)

def annotation_2_sentence_list(doc_annotaition):
    """Split an in-memory annotation into sentences of filtered lemmas.

    ``doc_annotaition`` is the text of an annotation: one record of three
    tab-separated fields (token, stem, lemma) per line, with an empty line
    between sentences.

    A record is dropped when:
      * the line starts with a character listed in ``filter_out_tokens``;
      * the line does not split into exactly three fields (the problem is
        printed and the line is skipped);
      * the lemma contains punctuation or digits;
      * the lemma is empty or is exactly ``ca`` / ``re`` (presumably tokenizer
        leftovers of contractions - TODO confirm);
      * the lower-cased token is in ``stop_words``.

    Underscores inside a lemma are replaced by spaces.

    Args:
        doc_annotaition: annotation text.

    Returns:
        List of sentences; each sentence is a non-empty list of lower-cased
        lemmas.
    """
    sentences = []
    sentence = []

    for line in doc_annotaition.split('\n'):
        # split('\n') removes the newline itself, so a sentence boundary shows
        # up as an empty string (indexing it with [0] would raise IndexError).
        if line == '':
            if sentence:
                sentences.append(sentence)
                sentence = []
            continue
        if line[0] in filter_out_tokens:
            continue
        try:
            token, _stem, lemma = line.split('\t')
        except ValueError as e:
            # Malformed record: report it and skip it, so that values left
            # over from the previous line are never appended.
            print(line, e)
            continue

        lemma = lemma.replace('_', ' ')
        # Lemmas with punctuation are rejected; digits are discarded as well.
        if re.search(r'[^\w\s]|\d', lemma):
            continue
        lemma = lemma.strip().lower()
        # 'ca' / 're' are matched as whole lemmas only; matching them as
        # substrings would also drop words such as "score" or "research".
        if not lemma or lemma in ('ca', 're'):
            continue
        # Stop words are lower-case, so compare the lower-cased token.
        if token.lower() in stop_words:
            continue

        sentence.append(lemma)

    # Keep the last sentence when the text does not end with an empty line.
    if sentence:
        sentences.append(sentence)

    return sentences




In [207]:
import numpy as np
def tf_idf(term_document_matrix_df, documents, min_freq=5):
    """Compute TF-IDF scores for the frequent words of a term-document matrix.

    Args:
        term_document_matrix_df: frame whose first column holds the word, whose
            ``freq`` column holds the total count, and whose columns from the
            third one on hold the per-document counts.
        documents: the documents the count columns refer to, in the same
            order; only ``len(d)`` of each one is used (as the document's
            total number of words).
        min_freq: a word is kept only when its ``freq`` is strictly greater
            than this value.

    Returns:
        DataFrame with the columns ``word``, ``tfidf_score{i}`` (one per
        document) and ``mean_tf_idf_score`` (row mean of the scores). When no
        word passes the filter an empty frame with these columns is returned.
    """
    filtered_term_document_matrix = term_document_matrix_df[term_document_matrix_df['freq'] > min_freq]

    total_documents = len(documents)
    total_words_in_documents = np.array([len(d) for d in documents], dtype=float)
    counts = filtered_term_document_matrix.iloc[:, 2:].to_numpy(dtype=float)

    # tf: an empty document contributes 0 instead of 0/0 = NaN, which would
    # otherwise turn the mean score of every word into NaN.
    tf_ = np.divide(
        counts,
        total_words_in_documents,
        out=np.zeros_like(counts),
        where=total_words_in_documents > 0,
    )

    # idf: log(N / number of documents containing the word); the denominator
    # is clamped to 1 so that a word found in no document cannot divide by zero.
    documents_with_word = (tf_ > 0).sum(axis=1)
    idf_ = np.log(total_documents / np.maximum(documents_with_word, 1))

    tf_idf_ = tf_ * idf_[:, np.newaxis]

    score_columns = [f'tfidf_score{i}' for i in range(len(documents))]
    tf_idf_matrix = pd.DataFrame(tf_idf_, columns=score_columns)
    tf_idf_matrix.insert(0, 'word', filtered_term_document_matrix.iloc[:, 0].to_numpy())

    # Open question left by the author: how should tf-idf be used as a
    # weighting coefficient? For now the per-word mean over documents is stored.
    tf_idf_matrix['mean_tf_idf_score'] = tf_idf_.mean(axis=1)

    return tf_idf_matrix

In [None]:
# TF-IDF over the first 100 documents, with the default min_freq.
tf_idf_matrix = tf_idf(term_document_matrix_df, corpuses[:100])
tf_idf_matrix

In [209]:
# Persist the TF-IDF table; this file is read back later in the notebook via pd.read_csv.
tf_idf_matrix.to_csv('../../assets/tf_idf_matrix.csv')

## Задание 3

In [6]:
from gensim.models import Word2Vec


# Word2Vec hyper-parameters.
EPOCHS = 40  # intended number of training epochs
LOWEST_WORD_FREQUENCY=3  # passed to Word2Vec as min_count
VECTOR_SIZE=128  # embedding size; also the length of the zero vectors built by the vectorize_* helpers
WINDOW_SIZE = 5  # passed to Word2Vec as window
WORKERS = 4  # passed to Word2Vec as workers


In [12]:
# Load a previously saved model from the current working directory.
# NOTE(review): this cell carries execution count 12 while the training cell
# right below carries 7, i.e. the cells were not run top to bottom; which
# `model` later cells see depends on the execution order.
model = Word2Vec.load('hockey_training')

In [7]:
# Train Word2Vec on the sentences of the whole corpus and persist the model.
model = Word2Vec(
    corpus_sentences,
    min_count=LOWEST_WORD_FREQUENCY,
    epochs=EPOCHS,  # use the shared constant instead of repeating the literal 40
    window=WINDOW_SIZE,
    vector_size=VECTOR_SIZE,
    workers=WORKERS,
)

model.save('../../assets/big_model')

In [None]:
# Vocabulary of the current model: its size followed by the full word list.
words = list(model.wv.key_to_index.keys())
len(words), words

In [None]:
def inspect_wordfreq_2_wordnumber(word_freqs=(1, 2, 4, 8, 16)):
    """Tabulate the Word2Vec vocabulary size for several ``min_count`` values.

    The vocabulary is built from the module-level ``corpus_sentences``.

    Args:
        word_freqs: the ``min_count`` thresholds to try.

    Returns:
        DataFrame indexed by ``word freq`` with a single ``unique words``
        column holding the number of words kept for that threshold.
    """
    result = []
    for word_freq in word_freqs:
        # Only the vocabulary is needed here, so the model is created without
        # a corpus and just scanned with build_vocab - no training epochs run.
        model_ = Word2Vec(min_count=word_freq)
        model_.build_vocab(corpus_sentences)
        result.append([word_freq, len(model_.wv.key_to_index)])
    table = pd.DataFrame(result, columns=['word freq', 'unique words'])
    return table.set_index('word freq')

# Vocabulary size for each min_count threshold.
inspect_wordfreq_2_wordnumber()

In [216]:
words = list(model.wv.key_to_index.keys())
# NOTE: this bare expression is not the last statement of the cell, so it
# displays nothing.
words
# Embeddings of three sample words, compared with cosine_distance further down.
player_vector = model.wv.get_vector('player')
hockey_vector = model.wv.get_vector('hockey')
article_vector = model.wv.get_vector('article')

In [226]:
def cosine_distance(a, b):
    """Return the cosine distance of ``a`` and ``b`` rescaled as ``(1 - cos) / 2``.

    The result is 0 for vectors pointing the same way, 0.5 for orthogonal
    vectors and 1 for opposite vectors.
    """
    cosine = np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))
    return (1.0 - cosine) / 2

In [None]:
# Distance between 'player' and 'hockey'.
dist = cosine_distance(player_vector, hockey_vector)
dist

In [None]:
# Distance between 'player' and 'article'.
dist = cosine_distance(player_vector, article_vector)
dist

In [229]:
# Hand-picked word groups, compared pairwise and plotted in the following cells.
# Every word is looked up in the model, so it has to be in the model vocabulary.
hockey_group = [
    "player", "goal", "playoff"
]

education_research_group = [
    "university", "research", "department", "computer", "science",
]

media_communication_group = [
   "news", "article", "post", "email", "comment"
]

In [None]:
def get_all_similariies(words, model: "Word2Vec"):
    """Build the pairwise ``cosine_distance`` table of the given words.

    Despite the function name, each cell holds the value returned by
    ``cosine_distance`` for the two word vectors.

    Args:
        words: words to compare; each one is looked up with
            ``model.wv.get_vector``.
        model: object exposing ``wv.get_vector(word)``.

    Returns:
        Square float16 DataFrame whose index and columns are ``words``.
        The shape of the matrix is printed as a side effect.
    """
    n_words = len(words)
    # Look every vector up once and build the matrix in one go instead of
    # re-stacking (and copying) a growing array for every row.
    vectors = [model.wv.get_vector(word) for word in words]
    distances = [
        [cosine_distance(first, second) for second in vectors]
        for first in vectors
    ]
    result_matrix = np.asarray(distances, dtype=np.float16).reshape(n_words, n_words)
    print(result_matrix.shape)
    return pd.DataFrame(result_matrix, index=words, columns=words)

# Pairwise table for the three hand-picked word groups.
get_all_similariies([*hockey_group, *education_research_group, *media_communication_group], model)


In [None]:
def plot_similar_groups(words: list[str], model: Word2Vec):
    """Project word vectors to two dimensions with PCA and plot them.

    Args:
        words: words to plot; each one is looked up as ``model.wv[word]``.
        model: trained Word2Vec model.

    Shows a scatter plot with one labelled point per word; returns nothing.
    """
    from sklearn.decomposition import PCA
    import matplotlib.pyplot as plt

    word_vectors = np.array([model.wv[word] for word in words])
    # Fit and project all vectors in one call instead of transforming them
    # one at a time after the fit.
    points = PCA(n_components=2).fit_transform(word_vectors)

    fig, ax = plt.subplots()
    ax.scatter(points[:, 0], points[:, 1])
    for word, (x, y) in zip(words, points):
        ax.annotate(word, (x, y))
    ax.set(
        title='Word vectors projected with PCA',
        xlabel='principal component 1',
        ylabel='principal component 2',
    )
    plt.show()
    
    
# 2-D PCA view of the three hand-picked word groups.
plot_similar_groups([*hockey_group, *education_research_group, *media_communication_group], model)


## Задание 4

## Задание 7-8

In [10]:
# Per-word mean TF-IDF scores saved earlier; only the two columns needed by
# get_word_weight are loaded.
tfidf_matrix = pd.read_csv('../../assets/tf_idf_matrix.csv', usecols=['word', 'mean_tf_idf_score'])

def vectorize_document(doc, model):
    """Return the mean of the sentence vectors of a document.

    Args:
        doc: list of sentences, each handed to ``vectorize_sentence``.
        model: forwarded unchanged to ``vectorize_sentence``.

    Returns:
        NumPy array of length ``VECTOR_SIZE``. A document without sentences
        yields the zero vector.
    """
    document_vector = np.zeros(VECTOR_SIZE)
    if len(doc) == 0:
        # Dividing the zero vector by zero would produce an all-NaN vector.
        return document_vector
    for sentence in doc:
        document_vector += vectorize_sentence(sentence, model)
    document_vector /= len(doc)
    return document_vector

def vectorize_sentence(sentence, model: "Word2Vec"):
    """Return the averaged word vector of a sentence.

    The vectors of the words known to the model are summed and the sum is
    divided by the total number of words in the sentence, known or not.

    Args:
        sentence: list of words.
        model: object exposing ``wv.has_index_for`` and ``wv.get_vector``.

    Returns:
        NumPy array of length ``VECTOR_SIZE``. An empty sentence yields the
        zero vector.
    """
    sentence_vector = np.zeros(VECTOR_SIZE)
    if len(sentence) == 0:
        # Dividing the zero vector by zero would produce an all-NaN vector.
        return sentence_vector
    for w in sentence:
        if model.wv.has_index_for(w):
            sentence_vector += model.wv.get_vector(w)
    sentence_vector /= len(sentence)
    return sentence_vector

def get_word_weight(word):
    """Return the weight of a word: the inverse of its mean TF-IDF score.

    The score is looked up in the module-level ``tfidf_matrix`` (columns
    ``word`` and ``mean_tf_idf_score``). When the word is listed more than
    once, the first entry is used.

    Args:
        word: the word to look up.

    Returns:
        ``1 / score`` as a float when the word has a positive score, otherwise
        the neutral weight ``1.0`` (unknown word, zero, negative, NaN or
        non-numeric score).
    """
    scores = tfidf_matrix.loc[tfidf_matrix['word'] == word, 'mean_tf_idf_score'].to_numpy()
    if scores.size == 0:
        return 1.0
    try:
        score = float(scores[0])
    except (TypeError, ValueError):
        # A non-numeric score cannot be inverted; fall back to the neutral weight.
        return 1.0
    # NaN compares False here and therefore also maps to the neutral weight.
    if score > 0:
        return 1.0 / score
    return 1.0

def vectorize_sentence_weighted(sentence, model: "Word2Vec"):
    """Return the weighted, averaged word vector of a sentence.

    Each known word vector is multiplied by ``get_word_weight(word)`` before
    being summed; the sum is divided by the total number of words in the
    sentence, known or not.

    Args:
        sentence: list of words.
        model: object exposing ``wv.has_index_for`` and ``wv.get_vector``.

    Returns:
        NumPy array of length ``VECTOR_SIZE``. An empty sentence yields the
        zero vector.
    """
    sentence_vector = np.zeros(VECTOR_SIZE)
    if len(sentence) == 0:
        # Dividing the zero vector by zero would produce an all-NaN vector.
        return sentence_vector
    for w in sentence:
        if model.wv.has_index_for(w):
            sentence_vector += model.wv.get_vector(w) * get_word_weight(w)
    sentence_vector /= len(sentence)
    return sentence_vector

def get_document_annotation(doc_vector):
    """Serialise a vector as text: its components joined by tab characters."""
    components = doc_vector.tolist()
    return '\t'.join(str(component) for component in components)

def save_documents_annotation(path, doc_annotation):
    """Write ``doc_annotation`` to ``path``, replacing any existing file.

    The value is handed to ``writelines``, so it may be a single string or
    any iterable of strings.
    """
    with open(path, 'w') as out_file:
        out_file.writelines(doc_annotation)

def vectorize_documents(filenames, model, annotation_path):
    """Vectorize every file and save all vectors into one annotation file.

    Each output line holds the 1-based, zero-padded (4 digits) position of the
    file in ``filenames``, a tab, and the tab-separated document vector.
    """
    lines = []
    for position, fname in enumerate(filenames, start=1):
        _, sentences = read_data_with_filter(fname)
        vector_text = get_document_annotation(vectorize_document(sentences, model))
        lines.append(f'{position:04}\t' + vector_text + '\n')

    save_documents_annotation(annotation_path, ''.join(lines))

In [213]:
import os
# Vectorize the hockey training documents into a single annotation file.
# Paths are assembled with os.path.join so the cell is not tied to Windows
# path separators.
data_path = os.path.join('..', '..', 'assets', 'annotated-corpus', 'hockey_train')
# NOTE(review): the file name says "weighted", but vectorize_documents goes
# through vectorize_document -> vectorize_sentence, i.e. the unweighted average.
annot_path = os.path.join('..', '..', 'assets', 'hockey_annotations_weighted_.txt')

# Sorted so that the 0001, 0002, ... row ids map to the files deterministically.
filenames = [os.path.join(data_path, fname) for fname in sorted(os.listdir(data_path))]
vectorize_documents(filenames, model, annot_path)

In [None]:
# Vectorize the documents of each topic into <topic>.tsv with the big model.
# Paths are assembled with os.path.join so the cell is not tied to Windows
# path separators.
data_path = os.path.join('..', '..', 'assets', 'annotated-corpus')
annot_path = os.path.join('..', '..', 'assets', 'vectorized')
# The output files are opened for writing, so their directory has to exist.
os.makedirs(annot_path, exist_ok=True)

model = Word2Vec.load(os.path.join('..', '..', 'assets', 'big_model'))

# NOTE(review): topics[1:] skips the first topic ('baseball') - confirm that
# this is intended.
for topic in topics[1:]:
    cur_data_path = os.path.join(data_path, topic)
    cur_annot_path = os.path.join(annot_path, topic) + '.tsv'
    # Sorted so that the 0001, 0002, ... row ids map to the files deterministically.
    files = sorted(os.listdir(cur_data_path))
    filenames = [os.path.join(cur_data_path, fname) for fname in files]
    print(f'number of files in topic `{topic}`: ', len(filenames))
    vectorize_documents(filenames, model, cur_annot_path)