In [137]:
# Directory holding the annotated training corpus; it is passed to
# process_directory() further down to build the Word2Vec training tokens.
# NOTE: this name is rebound by the later vectorisation cells.
dataset_path = "../../nechkasova-tokenizer/assets/annotated-corpus/train"

In [None]:
import nltk
from nltk.corpus import stopwords

# Fetch the NLTK "stopwords" resource that stopwords.words() reads below.
nltk.download('stopwords')
# English stop words stored as a set; process_file() tests membership against it.
stop_words = set(stopwords.words('english'))

In [139]:
import re
import string

def read_data(file_path):
    """Return every line of ``file_path`` as a list of strings.

    The file is decoded as ISO-8859-1 and line terminators are kept.

    Args:
        file_path: Path of the text file to read.

    Returns:
        list[str]: The file's lines in order.
    """
    with open(file_path, encoding="ISO-8859-1") as handle:
        return list(handle)

def clean_text(text):
    """Remove ASCII punctuation from ``text`` and lowercase the remainder.

    Args:
        text: Input string.

    Returns:
        str: ``text`` without any character from ``string.punctuation``,
        converted to lower case.
    """
    punctuation_class = f"[{re.escape(string.punctuation)}]"
    without_punctuation = re.sub(punctuation_class, '', text)
    return without_punctuation.lower()

In [140]:
def process_file(file_path):
    """Extract cleaned, non-stop-word tokens from one annotated corpus file.

    Every non-blank line is expected to contain exactly three tab-separated
    fields (token, stem, lemma). Only the token is used; lines with any other
    number of fields are skipped.

    Args:
        file_path: Path of the annotated file to read.

    Returns:
        list[str]: Tokens after ``clean_text``, in file order, excluding
        tokens that become empty or that appear in ``stop_words``.
    """
    tokens = []

    for line in read_data(file_path):
        fields = line.strip().split('\t')
        # Explicit field-count check instead of a bare ``except`` around tuple
        # unpacking: blank and malformed rows are skipped, while real errors
        # (and KeyboardInterrupt) are no longer swallowed.
        if len(fields) != 3:
            continue
        cleaned_token = clean_text(fields[0])
        if cleaned_token and cleaned_token not in stop_words:
            tokens.append(cleaned_token)

    return tokens

In [141]:
import os

def process_directory(directory_path):
    """Run ``process_file`` on every file below ``directory_path``.

    The directory tree is walked recursively and a progress line is printed
    for each file.

    Args:
        directory_path: Root directory to walk.

    Returns:
        list[list[str]]: One token list per file, in walk order.
    """
    all_tokens = []

    for root, _subdirs, names in os.walk(directory_path):
        for name in names:
            file_path = os.path.join(root, name)
            print(f"Process file {file_path}")
            all_tokens.append(process_file(file_path))

    return all_tokens

In [None]:
# Build the training corpus: one token list per file found under dataset_path.
tokens = process_directory(dataset_path)
# print(len(tokens))

# Sanity check: show the token list of the first processed file.
print(tokens[0])

In [143]:
from gensim.models import Word2Vec

# Train Word2Vec on the per-file token lists (each file's tokens act as one
# "sentence"). vectorize_text() relies on the 100-dimensional vector size.
# min_count=1 should keep even tokens that occur only once — see gensim docs.
model = Word2Vec(sentences=tokens, vector_size=100, window=5, min_count=1, workers=4)

In [144]:
def get_sentences(content):
    """Split ``content`` into sentence strings.

    The text is first split on sentence boundaries: whitespace after ".",
    "?" or "!" (with lookarounds that exclude several abbreviation-like
    contexts), or a newline that is not preceded by a comma and not followed
    by an alphanumeric character. Pieces are stripped and empty ones dropped.
    A piece that starts like a "Key: value" header is additionally split on
    every newline that is not preceded by a comma; those sub-pieces are kept
    as they are (not stripped).

    Args:
        content: Text to split.

    Returns:
        list[str]: The resulting sentences in document order.
    """
    boundary_re = re.compile(r'(?<!\w\.\w.)(?<!\w\. \w.)(?<![A-Z][a-z]\.)(?<!\s\.\s)(?<=\.|\?|\!)\s(?![A-Z][A-Za-z]\.)(?!\w\. \w.)|(?<![,])\n(?![a-zA-Z0-9])')
    key_value_re = re.compile(r'[A-Z](\w+-)*\w+(\s\w+)*:[ ]*([\W\w]+[ ,])+(?!,\n)')

    collected = []
    for piece in boundary_re.split(content):
        piece = piece.strip()
        if not piece:
            continue
        if key_value_re.match(piece):
            # Header-like block: break it into its individual lines.
            collected.extend(re.split(r'(?<!,)\n', piece))
        else:
            collected.append(piece)
    return collected

def tokenize_sentence(sentence):
    """Split ``sentence`` into lowercase tokens.

    Alternatives are tried in this order at each position: phone-number-like
    digit runs, e-mail addresses, a title followed by a capitalised name
    (e.g. "Dr. Smith"), clock times with an AM/PM marker, runs of word
    characters, and finally any single non-word, non-space character.

    Args:
        sentence: Text to tokenize.

    Returns:
        list[str]: Matched tokens in order, each lowercased.
    """
    # The e-mail TLD class is ``[A-Za-z]``: inside a character class ``|`` is a
    # literal pipe, so the previous ``[A-Z|a-z]`` wrongly accepted "|" in a TLD.
    pattern = r'\+?\d[\d\-\(\)\s]{7,}\d' \
              r'|\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b' \
              r'|\b(?:Mr|Ms|Mrs|Dr|Prof|St)\.\s[A-Z][a-z]+' \
              r'|\b\d{1,2}:\d{2}\s?(?:[AaPp]\.?[Mm]\.?)\b' \
              r'|\w+|[^\w\s]'
    return [token.lower() for token in re.findall(pattern, sentence)]

In [145]:
import numpy as np

def vectorize_text(text, model):
    """Embed ``text`` as the mean of its sentence vectors.

    Each sentence from ``get_sentences`` is cleaned with ``clean_text`` and
    tokenized with ``tokenize_sentence``. A sentence vector is the mean of its
    token vectors, with out-of-vocabulary tokens contributing a zero vector.
    The text vector is the mean of the sentence vectors.

    Args:
        text: Raw text to embed.
        model: Trained model exposing ``model.wv`` (keyed vectors supporting
            ``in``, indexing and ``vector_size``).

    Returns:
        numpy.ndarray: Vector of length ``model.wv.vector_size``. A zero vector
        is returned when the text yields no tokens at all.
    """
    keyed_vectors = model.wv
    # Take the dimension from the model instead of hard-coding 100, so the
    # out-of-vocabulary placeholder always matches the trained vectors.
    dimension = keyed_vectors.vector_size
    sentence_vectors = []

    for sentence in get_sentences(text):
        tokens = tokenize_sentence(clean_text(sentence))
        if not tokens:
            continue
        word_vectors = [
            keyed_vectors[word] if word in keyed_vectors else np.zeros(dimension)
            for word in tokens
        ]
        sentence_vectors.append(np.mean(word_vectors, axis=0))

    # np.mean over an empty list gives a scalar NaN (plus a warning), which
    # callers iterating over the components cannot handle.
    if not sentence_vectors:
        return np.zeros(dimension)

    return np.mean(sentence_vectors, axis=0)

In [None]:
# Smoke test: embed a short two-sentence sample with the trained model.
text = "programs applications developing. Sentence"
# NOTE(review): file_tsv_content is not referenced anywhere else in the
# visible notebook — confirm whether it can be removed.
file_tsv_content = {}

vector = vectorize_text(text, model)
# Print the vector's length, then its components.
print(len(vector))
print(vector)

programs = applications, developing (programming)
interface, buggy, functions, UNIX, code, signals, Internet
people, Canada, sport, tatoo


email = phone, FAX
address, message, contact
team, games, baseball

lecture = seminar, courses
education, laboratories, conference
situation, comments, season

In [147]:
from scipy.spatial.distance import cosine

def cosine_similarity(vector_a, vector_b):
    """Return the cosine similarity of two vectors.

    Computed as one minus SciPy's cosine distance.

    Args:
        vector_a: First 1-D vector.
        vector_b: Second 1-D vector.

    Returns:
        float: ``1 - cosine(vector_a, vector_b)``.
    """
    cosine_distance = cosine(vector_a, vector_b)
    return 1 - cosine_distance

def calculate_distances(word, similar_words, domain_words, different_words, model, cosine_similarity_function):
    """Score ``word`` against three groups of probe words.

    Each probe word is looked up in ``model.wv``, first exactly as given and,
    if that fails, in lower case. The fallback matters because a vocabulary
    built from lowercased tokens would otherwise silently drop probes such as
    "UNIX". Probes that are still not found are skipped.

    Args:
        word: Query word; must be present in ``model.wv`` (exactly or lowercased).
        similar_words: Probe words expected to be close to ``word``.
        domain_words: Probe words from the same subject area.
        different_words: Probe words expected to be unrelated.
        model: Object exposing ``model.wv`` with ``in`` and indexing support.
        cosine_similarity_function: Callable ``(vector_a, vector_b) -> number``.

    Returns:
        list[tuple[str, str, number]]: ``(group_name, probe_word, similarity)``
        tuples sorted by similarity in descending order. ``probe_word`` is
        reported exactly as it was passed in.

    Raises:
        KeyError: If ``word`` is not in the vocabulary.
    """
    vectors = model.wv

    def lookup(key):
        # Exact match first so case-sensitive vocabularies behave as before.
        if key in vectors:
            return vectors[key]
        lowered = key.lower()
        if lowered in vectors:
            return vectors[lowered]
        return None

    word_vector = lookup(word)
    if word_vector is None:
        raise KeyError(f"Key '{word}' not present in the model vocabulary")

    groups = [('Похожие слова', similar_words),
              ('Слова из той же области', domain_words),
              ('Совершенно другие слова', different_words)]

    distances = []
    for group_name, words in groups:
        for w in words:
            probe_vector = lookup(w)
            if probe_vector is None:
                continue
            distances.append((group_name, w, cosine_similarity_function(word_vector, probe_vector)))

    return sorted(distances, key=lambda x: x[2], reverse=True)

In [None]:
# Probe words for the query "programs", grouped by expected closeness.
# NOTE(review): the training tokens are lowercased by clean_text(), so the
# capitalised probes ("UNIX", "Internet", "Canada") only match if the lookup
# ignores case — confirm they show up in the printed ranking.
similar_words = ["applications", "developing"]
domain_words = ["interface", "buggy", "functions", "UNIX", "code", "signals", "Internet"]
different_words = ["people", "Canada", "sport", "tatoo"]

distances = calculate_distances('programs', similar_words, domain_words, different_words, model, cosine_similarity)

# Print the ranking, most similar first.
for group, word, distance in distances:
    print(f"{group}: {word} - Косинусное сходство: {distance:.4f}")

In [None]:
# Probe words for the query "email", grouped by expected closeness.
# NOTE(review): the notes cell lists "phone, FAX" as the similar pair, while
# "address" takes that place here — confirm which grouping is intended.
# NOTE(review): "FAX" is capitalised but the vocabulary comes from lowercased
# tokens; it only matches if the lookup ignores case.
similar_words = ["address", "FAX"]
domain_words = ["phone", "message", "contact"]
different_words = ["team", "games", "baseball"]

distances = calculate_distances('email', similar_words, domain_words, different_words, model, cosine_similarity)

# Print the ranking, most similar first.
for group, word, distance in distances:
    print(f"{group}: {word} - Косинусное сходство: {distance:.4f}")

In [None]:
# Probe words for the query "lecture", grouped by expected closeness and
# scored with the SciPy-based cosine_similarity().
similar_words = ["seminar", "courses"]
domain_words = ["education", "laboratories", "conference"]
different_words = ["situation", "comments", "season"]

distances = calculate_distances('lecture', similar_words, domain_words, different_words, model, cosine_similarity)

# Print the ranking, most similar first.
for group, word, distance in distances:
    print(f"{group}: {word} - Косинусное сходство: {distance:.4f}")

In [153]:
import numpy as np

def cosine_similarity_manual(vector_a, vector_b):
    """Compute cosine similarity directly with NumPy.

    Args:
        vector_a: First vector.
        vector_b: Second vector.

    Returns:
        The dot product of the two vectors divided by the product of their
        Euclidean norms.
    """
    norm_product = np.linalg.norm(vector_a) * np.linalg.norm(vector_b)
    return np.dot(vector_a, vector_b) / norm_product

In [None]:
# Same "lecture" probe groups as the earlier cell, this time scored with
# cosine_similarity_manual() so the two implementations can be compared.
similar_words = ["seminar", "courses"]
domain_words = ["education", "laboratories", "conference"]
different_words = ["situation", "comments", "season"]

distances = calculate_distances('lecture', similar_words, domain_words, different_words, model, cosine_similarity_manual)

# Print the ranking, most similar first.
for group, word, distance in distances:
    print(f"{group}: {word} - Косинусное сходство: {distance:.4f}")

In [165]:
def read_content(file_path):
    """Read a file as ISO-8859-1 text.

    Args:
        file_path: Path of the file to read.

    Returns:
        tuple[str, str] | None: ``(content, filename)`` where ``filename`` is
        the final path component, or ``None`` when the file could not be read
        (a message is printed in that case).
    """
    # os.path.basename honours the platform's path separator, unlike
    # splitting on a hard-coded '/'.
    filename = os.path.basename(file_path)
    try:
        with open(file_path, encoding="ISO-8859-1") as file:
            return file.read(), filename
    except Exception as e:
        print(f"Couldn't read file {filename}: {e}")
        return None


def vectorize_file(file_path):
    """Embed one file and format the result as a TSV row.

    Uses the notebook-level ``model`` together with ``vectorize_text``.

    Args:
        file_path: Path of the file to embed.

    Returns:
        str | None: The file name followed by the vector components, all
        tab-separated, or ``None`` when the file could not be read.
    """
    read_result = read_content(file_path)
    # read_content signals failure with None; unpacking it directly would
    # raise an unrelated TypeError.
    if read_result is None:
        return None
    content, filename = read_result

    vector = vectorize_text(content, model)
    vector_str = "\t".join(str(component) for component in vector)
    return f"{filename}\t{vector_str}"


def vectorize_directory(directory_path, file_name):
    """Embed every file below ``directory_path`` and write the rows to a TSV.

    The output goes to ``../assets/annotated-corpus/<file_name>.tsv`` relative
    to the current working directory; the target directory is created if
    needed. Files that cannot be read are skipped.

    Args:
        directory_path: Root directory to walk recursively.
        file_name: Base name (without extension) of the TSV file to write.
    """
    all_lines = []
    for root, _dirs, files in os.walk(directory_path):
        for file in files:
            line = vectorize_file(os.path.join(root, file))
            # Skip unreadable files so one bad file cannot break the join below.
            if line is not None:
                all_lines.append(line)

    tsv_filepath = os.path.join('..', 'assets', 'annotated-corpus', file_name + '.tsv')

    os.makedirs(os.path.dirname(tsv_filepath), exist_ok=True)

    with open(tsv_filepath, 'w') as f:
        f.write("\n".join(all_lines))

In [166]:
# Embed the test split and write it to ../assets/annotated-corpus/test.tsv.
# NOTE: this rebinds dataset_path, which earlier pointed at the annotated
# training corpus used to build the model.
dataset_path = "../../dataset/20news-bydate-test"

vectorize_directory(dataset_path, 'test')

In [167]:
# Embed the train split and write it to ../assets/annotated-corpus/train.tsv.
# NOTE: this rebinds dataset_path again; re-running earlier cells afterwards
# would pick up this value.
dataset_path = "../../dataset/20news-bydate-train"

vectorize_directory(dataset_path, 'train')