In [53]:
dataset_path_file = "../../nechkasova-tokenizer/assets/annotated-corpus/train/alt.atheism/49960.tsv"
dataset_path = "../../nechkasova-tokenizer/assets/annotated-corpus/train"

In [54]:
import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\lesya\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [55]:
import re
import string

def read_data(file_path):
    with open(file_path, 'r') as file:
        lines = file.readlines()
    return lines

def clean_text(text):
    text = re.sub(f"[{re.escape(string.punctuation)}]", '', text)
    return text.lower()

In [56]:
def process_file(file_path):
    lines = read_data(file_path)
    tokens = []

    for line in lines:
        if line.strip():
            try:
                token, stem, lemma = line.strip().split('\t')
            except:
                continue
            cleaned_token = clean_text(token)
            if cleaned_token and cleaned_token not in stop_words:
                tokens.append(cleaned_token)
    
    return tokens

In [57]:
import os

def process_directory(directory_path):
    all_tokens = []

    for root, dirs, files in os.walk(directory_path):
        for file in files:
            file_path = os.path.join(root, file)
            # print(f"Process file {file_path}")
            tokens = process_file(file_path)
            all_tokens.append(tokens)

    return all_tokens

In [58]:
# tokens = process_file(dataset_path_file)
tokens = process_directory(dataset_path)
# print(len(tokens))

In [59]:
from gensim.models import Word2Vec

model = Word2Vec(sentences=tokens, vector_size=100, window=5, min_count=1, workers=4)

In [60]:
def get_sentences(content):
    sentence_endings_pattern = re.compile(r'(?<!\w\.\w.)(?<!\w\. \w.)(?<![A-Z][a-z]\.)(?<!\s\.\s)(?<=\.|\?|\!)\s(?![A-Z][A-Za-z]\.)(?!\w\. \w.)|(?<![,])\n(?![a-zA-Z0-9])')
    sentences = sentence_endings_pattern.split(content)

    sentences = [sentence.strip() for sentence in sentences if sentence.strip()]

    sentences_included_key_value = []

    for sentence in sentences:
        key_value_pattern = re.compile(r'[A-Z](\w+-)*\w+(\s\w+)*:[ ]*([\W\w]+[ ,])+(?!,\n)')
        if key_value_pattern.match(sentence):
            add_sentence = re.split(r'(?<!,)\n', sentence)
            sentences_included_key_value.extend(add_sentence)
        else:
            sentences_included_key_value.append(sentence)
    # sentences = [sentence.replace('\n', ' ') for sentence in sentences]
    return sentences_included_key_value

def tokenize_sentence(sentence):
    pattern = r'\+?\d[\d\-\(\)\s]{7,}\d' \
              r'|\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b' \
              r'|\b(?:Mr|Ms|Mrs|Dr|Prof|St)\.\s[A-Z][a-z]+' \
              r'|\b\d{1,2}:\d{2}\s?(?:[AaPp]\.?[Mm]\.?)\b' \
              r'|\w+|[^\w\s]'
    tokens = re.findall(pattern, sentence)
    tokens = [token.lower() for token in tokens]
    return tokens

In [102]:
import numpy as np

def vectorize_text(text, model):
    sentences = get_sentences(text)
    sentence_vectors = []
    
    for sentence in sentences:
        cleaned_text = clean_text(sentence)
        tokens = tokenize_sentence(cleaned_text)
        if len(tokens) == 0:
            continue
        word_vectors = [model.wv[word] if word in model.wv else np.zeros(100) for word in tokens]
        sentence_vector = np.mean(word_vectors, axis=0)
        sentence_vectors.append(sentence_vector)
        
    text_vector = np.mean(sentence_vectors, axis=0)
    
    return text_vector

In [90]:
text = "programs applications developing. Sentence"
file_tsv_content = {}

vector = vectorize_text(text, model)
print(len(vector))
print(vector)

100
[ 0.06603803  0.04041414  0.2965158   0.54403996  0.40739977 -0.44551393
  0.1606691   0.5753212  -0.1143528  -0.21388742  0.04817864  0.01289042
 -0.13808554  0.02562388 -0.15186119 -0.23477179 -0.20436378 -0.24522659
  0.05454374 -0.85502017  1.1733248   0.04736743  0.897901    0.12461732
 -0.5895696  -0.4276418   0.34620756 -0.32519758 -0.5270321   0.41284737
  0.21516019 -0.01142962  0.41720778 -0.62410635  0.29935935  0.3833707
 -0.23130068 -0.6496782  -0.87836826 -0.44881403  0.4692001  -0.6003808
 -0.74882674 -0.21054474  0.51244307  0.11857323 -0.12278455  0.2909257
  0.28487986 -0.4391346   0.02587885 -0.22700348 -0.2843613   0.29164603
 -1.0175967  -0.18673402 -0.26023012 -0.48501605 -0.3140038  -0.22755022
  0.6441806   0.21569479 -0.06850782 -0.23569383 -0.15789339  0.40922463
 -0.06177444  0.6769148  -0.7453899   0.5271795  -0.11539032  0.7397048
  0.03392206 -0.30380297  0.5485276  -0.07419986  0.32635325  0.02989756
 -1.1561422   0.06447767 -0.41148335 -0.25974905 -0

programs = applications, developing (programming)
interface, buggy, functions, UNIX, code, signals, Internet
people, Canada, sport, tatoo


email = phone, FAX
address, message, contact
team, games, baseball

lecture = seminar, courses, 
education, laboratories, conference
situation, comments, season

In [106]:
from scipy.spatial.distance import cosine

def cosine_similarity(vector_a, vector_b):
    return 1 - cosine(vector_a, vector_b)

def calculate_distances(word, similar_words, domain_words, different_words, model, cosine_similarity_function):
    word_vector = model.wv[word]
    distances = []
    
    for group_name, words in [('Похожие слова', similar_words), 
                              ('Слова из той же области', domain_words), 
                              ('Совершенно другие слова', different_words)]:
        for w in words:
            if w in model.wv:
                distance = cosine_similarity_function(word_vector, model.wv[w])
                distances.append((group_name, w, distance))
    
    return sorted(distances, key=lambda x: x[2], reverse=True)

In [107]:
similar_words = ["applications", "developing"]
domain_words = ["interface", "buggy", "functions", "UNIX", "code", "signals", "Internet"]
different_words = ["people", "Canada", "sport", "tatoo"]

distances = calculate_distances('programs', similar_words, domain_words, different_words, model, cosine_similarity)

for group, word, distance in distances:
    print(f"{group}: {word} - Косинусное сходство: {distance:.4f}")

Похожие слова: applications - Косинусное сходство: 0.9429
Слова из той же области: interface - Косинусное сходство: 0.8686
Слова из той же области: functions - Косинусное сходство: 0.8412
Слова из той же области: code - Косинусное сходство: 0.7927
Похожие слова: developing - Косинусное сходство: 0.7788
Слова из той же области: signals - Косинусное сходство: 0.7311
Слова из той же области: buggy - Косинусное сходство: 0.7004
Совершенно другие слова: sport - Косинусное сходство: 0.6844
Совершенно другие слова: tatoo - Косинусное сходство: 0.5245
Совершенно другие слова: people - Косинусное сходство: 0.4397


In [69]:
similar_words = ["address", "FAX"]
domain_words = ["phone", "message", "contact"]
different_words = ["team", "games", "baseball"]

distances = calculate_distances('email', similar_words, domain_words, different_words, model)

for group, word, distance in distances:
    print(f"{group}: {word} - Косинусное сходство: {distance:.4f}")

Похожие слова: address - Косинусное сходство: 0.8983
Слова из той же области: contact - Косинусное сходство: 0.8750
Слова из той же области: phone - Косинусное сходство: 0.7528
Слова из той же области: message - Косинусное сходство: 0.7058
Совершенно другие слова: baseball - Косинусное сходство: 0.2184
Совершенно другие слова: team - Косинусное сходство: 0.0141
Совершенно другие слова: games - Косинусное сходство: 0.0112


In [70]:
similar_words = ["seminar", "courses"]
domain_words = ["education", "laboratories", "conference"]
different_words = ["situation", "comments", "season"]

distances = calculate_distances('lecture', similar_words, domain_words, different_words, model)

for group, word, distance in distances:
    print(f"{group}: {word} - Косинусное сходство: {distance:.4f}")

Похожие слова: seminar - Косинусное сходство: 0.9147
Похожие слова: courses - Косинусное сходство: 0.8138
Слова из той же области: education - Косинусное сходство: 0.6976
Совершенно другие слова: situation - Косинусное сходство: 0.6668
Слова из той же области: laboratories - Косинусное сходство: 0.6318
Совершенно другие слова: comments - Косинусное сходство: 0.6094
Слова из той же области: conference - Косинусное сходство: 0.5666
Совершенно другие слова: season - Косинусное сходство: 0.4208


In [109]:
import numpy as np

def cosine_similarity_manual(vector_a, vector_b):
    dot_product = np.dot(vector_a, vector_b)
    norm_a = np.linalg.norm(vector_a)
    norm_b = np.linalg.norm(vector_b)
    return dot_product / (norm_a * norm_b)

In [110]:
similar_words = ["seminar", "courses"]
domain_words = ["education", "laboratories", "conference"]
different_words = ["situation", "comments", "season"]

distances = calculate_distances('lecture', similar_words, domain_words, different_words, model, cosine_similarity_manual)

for group, word, distance in distances:
    print(f"{group}: {word} - Косинусное сходство: {distance:.4f}")

Похожие слова: seminar - Косинусное сходство: 0.9147
Похожие слова: courses - Косинусное сходство: 0.8138
Слова из той же области: education - Косинусное сходство: 0.6976
Совершенно другие слова: situation - Косинусное сходство: 0.6668
Слова из той же области: laboratories - Косинусное сходство: 0.6318
Совершенно другие слова: comments - Косинусное сходство: 0.6094
Слова из той же области: conference - Косинусное сходство: 0.5666
Совершенно другие слова: season - Косинусное сходство: 0.4208


In [104]:
def read_content(file_path):
    filename = file_path.split('\\')[-1]
    try:
        with open(file_path, 'r') as file:
            content = file.read()
            return content, filename
    except Exception as e:
        print(f"Couldn't read file {filename}: {e}")
        return None

def vectorize_file(file_path):
    content, filename = read_content(file_path)
    vector = vectorize_text(content, model)
    
    vector_str = "\t".join([str(component) for component in vector])
    result_line = f"{filename}\t{vector_str}"
    
    return result_line

def vectorize_directory(directory_path):
    all_lines = []
    for root, dirs, files in os.walk(directory_path):
        for file in files:
            file_path = os.path.join(root, file)
            line = vectorize_file(file_path)
            all_lines.append(line)
            
    tsv_filepath = os.path.join('..', 'assets', 'annotated-corpus', 'test.tsv')

    os.makedirs(os.path.dirname(tsv_filepath), exist_ok=True)

    with open(tsv_filepath, 'w') as f:
        f.write("\n".join(all_lines))

In [105]:
dataset_path = "../../../../probe/dataset/20news-bydate-test"

vectorize_directory(dataset_path)