In [None]:
import pandas as pd
import regex as re
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

# Read the annotated corpus: one token per line in tab-separated columns (the
# token text is taken from the second column); a blank line ends a sentence.
sentences = list()
sentence = list()
with open('annotations_hsy2.tsv') as file:
    for line in file:
        if line.strip():
            columns = line.split('\t')
            # Rows without a second column are skipped instead of raising IndexError.
            if len(columns) > 1:
                # Strip so a token in the last column does not keep its newline.
                sentence.append(columns[1].strip())
        elif sentence:
            sentences.append(sentence)
            sentence = list()

# Flush the last sentence when the file does not end with a blank line;
# previously that sentence was silently dropped.
if sentence:
    sentences.append(sentence)
    sentence = list()

stop_words = set(stopwords.words('english'))
clear_data = []

In [None]:
def clean_sentence(sentence):
    """
    Normalise the lemmas of one sentence and keep only the useful ones.

    Each lemma is lower-cased and has punctuation and digits removed. It is
    kept only if that normalisation left its length unchanged, the result is
    at least three characters long, and it is not in the module-level
    ``stop_words`` set.

    :param sentence: Iterable of lemma strings
    :return: List of normalised lemmas that passed all filters
    """
    kept = []
    for lemma in sentence:
        normalised = re.sub(r"[^\w\s]|[\d]", "", lemma.lower())
        same_length = len(normalised) == len(lemma)
        # Lemmas that contained punctuation/digits, or are too short, are dropped.
        if same_length and len(normalised) >= 3:
            if normalised and normalised not in stop_words:
                kept.append(normalised)
    return kept

# Rebuild the list from scratch instead of appending to the list created in an
# earlier cell, so that re-running this cell does not duplicate the corpus.
clear_data = [clean_sentence(tokens) for tokens in sentences]
print(clear_data)

In [4]:
from gensim.models import Word2Vec
# Train the Word2Vec model on the cleaned sentences.
w2v = Word2Vec(sentences=clear_data, epochs=70, window=4, min_count=3)

# Only a cell's last expression is displayed, so the neighbours of "cpu" are
# printed explicitly (the bare call used to discard its result).
print("Most similar to 'cpu':", w2v.wv.most_similar("cpu"))

words = list(w2v.wv.key_to_index.keys())
print("All words in model:", words)

word_vector = w2v.wv['astronomers']
print("Vector of word 'astronomers':", word_vector)

4. С использованием библиотечной реализации метода подсчета косинусного расстояния между векторными представлениями текста

In [None]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

def custom_cosine_distance(vec_a, vec_b):
    """
    Return the cosine distance between two vectors, rescaled to the range 0..1.

    The value is (1 - cos) / 2, where
    cos = dot_product(vec_a, vec_b) / (norm(vec_a) * norm(vec_b)).
    """
    norm_product = np.linalg.norm(vec_a) * np.linalg.norm(vec_b)
    cosine = np.dot(vec_a, vec_b) / norm_product
    return (1 - cosine) / 2

# The Word2Vec model `w2v` trained in an earlier cell is reused here. It must
# not be reset to None: every `w2v.wv[...]` lookup below would then fail with
# AttributeError.

target_word = "cpu"
similar_terms = ["memory", "hardware", "disk"]
related_terms = ["software", "coding", "keyboard"]
unrelated_terms = ["philosophy", "hypothesis", "president"]

def print_distances(target, terms, description):
    """
    Print the cosine distances between the target word and a list of terms.

    Words that are not in the vocabulary of the module-level ``w2v`` model are
    reported instead of raising KeyError.

    :param target: The target word
    :param terms: A list of terms to compare with the target word
    :param description: A description of the type of terms being compared
    """
    print(f"Cosine distances ({description}):")
    vocabulary = w2v.wv.key_to_index

    if target not in vocabulary:
        print(f"{target}: not in model vocabulary")
        print()
        return

    # The target vector does not change between terms, so look it up once.
    target_vector = w2v.wv[target]
    for term in terms:
        if term not in vocabulary:
            print(f"{target} - {term}: not in model vocabulary")
            continue
        distance = custom_cosine_distance(target_vector, w2v.wv[term])
        print(f"{target} - {term}: {distance:.4f}")
    print()

# Report the cosine distances for each category of comparison terms
for terms, description in (
    (similar_terms, "Similar Terms"),
    (related_terms, "Related Terms"),
    (unrelated_terms, "Unrelated Terms"),
):
    print_distances(target_word, terms, description)

5. Применить какой-либо метод сокращения размерности векторов, полученных одним из базовых способов векторизации

In [None]:
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
import seaborn as sns
import matplotlib.pyplot as plt

def plot_word_vectors(words, vector_data):
    """
    Plot the 2D representation of word vectors using PCA.

    Words that have no vector in ``vector_data`` are skipped (and listed in a
    printed note) instead of aborting the whole plot with KeyError.

    :param words: List of words to plot
    :param vector_data: Dictionary or Word2Vec keyed vectors containing word vectors
    :raises ValueError: If fewer than two of the requested words have a vector
    """
    known_words = [word for word in words if word in vector_data]
    missing_words = [word for word in words if word not in vector_data]
    if missing_words:
        print("Skipping words without a vector:", missing_words)
    # A 2-component PCA needs at least two samples.
    if len(known_words) < 2:
        raise ValueError("Need at least two words with vectors for a 2D PCA projection")

    # Perform PCA to reduce the dimensionality to 2D
    pca_model = PCA(n_components=2)
    reduced_vectors = pca_model.fit_transform([vector_data[word] for word in known_words])

    # Convert the reduced vectors to a DataFrame
    reduced_df = pd.DataFrame(reduced_vectors, index=known_words, columns=["x_coord", "y_coord"])

    # Create a scatter plot using Seaborn
    scatter_plot = sns.scatterplot(data=reduced_df, x="x_coord", y="y_coord")

    # Add labels for each point
    for word, coordinates in reduced_df.iterrows():
        scatter_plot.text(coordinates["x_coord"], coordinates["y_coord"], word, fontsize=9, ha='right')

    # Set the title and axis labels on the axes that were actually drawn on
    scatter_plot.set_title("2D Representation of Word Vectors using PCA")
    scatter_plot.set_xlabel("Principal Component 1")
    scatter_plot.set_ylabel("Principal Component 2")

    # Display the plot
    plt.show()

# Example list of words to plot
words_to_plot = ["black", "white", "color", "philosophy", "religion", "computer", "pig", "dog"]

# Plot with the Word2Vec model `w2v` trained earlier in the notebook. It must
# not be reset to None here: `None.wv` raises AttributeError.
plot_word_vectors(words_to_plot, w2v.wv)

6. Implement a method that vectorizes arbitrary text using the following algorithm

In [None]:
import numpy as np
def calculate_vectors(sentences, w2v):
    """
    Build one vector for a text as the mean of its per-sentence vectors.

    A sentence vector is the sum of the vectors of its in-vocabulary words
    divided by the total number of words in the sentence, so out-of-vocabulary
    words still count towards the divisor. An empty sentence contributes a
    zero vector.

    :param sentences: Sequence of sentences, each a sequence of word strings
    :param w2v: Model exposing ``vector_size`` and ``wv`` (``key_to_index`` plus lookup by word)
    :return: NumPy array of length ``w2v.vector_size``
    """
    dim = w2v.vector_size
    text_vector = np.zeros(dim)

    for words in sentences:
        sentence_vector = np.zeros(dim)
        for word in words:
            if word not in w2v.wv.key_to_index:
                continue
            sentence_vector += w2v.wv[word]

        word_count = len(words)
        if word_count:
            sentence_vector /= word_count
        text_vector += sentence_vector

    sentence_count = len(sentences)
    if sentence_count:
        text_vector /= sentence_count

    return text_vector

# Vectorise the whole cleaned corpus; as the cell's last expression the result is displayed
calculate_vectors(sentences=clear_data, w2v=w2v)

In [39]:
import os
import chardet
from scipy.sparse import csr_matrix, save_npz

def detect_file_encoding(file_path, sample_size=10_000_000):
    """
    Detect the encoding of a given file.

    :param file_path: Path to the file
    :param sample_size: Maximum number of leading bytes handed to chardet
    :return: Detected encoding name, or 'utf-8' when chardet reports no encoding
    """
    with open(file_path, 'rb') as f:
        result = chardet.detect(f.read(sample_size))
    # chardet reports None when it cannot decide (e.g. for an empty file);
    # fall back to UTF-8 so callers always receive a usable encoding name.
    return result['encoding'] or 'utf-8'

def process_text_file(file_path):
    """
    Process a text file to extract sentences.

    The file is read as tab-separated rows, one token per row, with the token
    taken from the second column; an empty line ends a sentence. Rows without
    a second column are skipped.

    :param file_path: Path to the file
    :return: List of processed sentences (each passed through clean_sentence)
    """
    sentences = []
    current_sentence = []

    with open(file_path, 'r', encoding=detect_file_encoding(file_path)) as file:
        for line in file:
            if line.strip():  # Non-empty row: one token
                columns = line.split('\t')
                # Skip malformed rows instead of raising IndexError
                if len(columns) > 1:
                    current_sentence.append(columns[1].strip())
            elif current_sentence:  # Empty line closes the current sentence
                sentences.append(current_sentence)
                current_sentence = []

    # Keep the last sentence when the file does not end with an empty line;
    # previously it was silently dropped.
    if current_sentence:
        sentences.append(current_sentence)

    clean_sentences = [clean_sentence(sent) for sent in sentences]
    return clean_sentences

def clean_sentence(sentence):
    """
    Clean a single sentence by removing unnecessary characters or tokens.

    :param sentence: List of tokens in a sentence
    :return: Cleaned sentence
    """
    # Placeholder for actual cleaning logic
    return sentence

def save_vectors_to_tsv(vectors, output_path):
    """
    Write document vectors to a TSV file, one document per line.

    Each line is the document ID followed by the vector components, all
    separated by tabs.

    :param vectors: Dictionary of document IDs and their corresponding vectors
    :param output_path: Path to the output TSV file
    """
    with open(output_path, 'w', encoding='utf-8') as out:
        for doc_id, vector in vectors.items():
            components = [str(component) for component in vector]
            out.write(f"{doc_id}\t" + '\t'.join(components) + "\n")

def calculate_document_vectors(sentences, word_vectors):
    """
    Calculate the document vector based on the average of word vectors.

    :param sentences: List of processed sentences (each a list of words)
    :param word_vectors: Word2Vec model (its ``wv`` attribute is used) or any
        mapping-like object supporting ``in`` and lookup by word
    :return: Mean of the vectors of all in-vocabulary words; a zero vector
        when none of the document's words is in the vocabulary
    """
    # A gensim 4 model is not subscriptable itself; its vectors live on `.wv`.
    vectors = getattr(word_vectors, "wv", word_vectors)

    known = [vectors[word] for sentence in sentences for word in sentence if word in vectors]
    if known:
        return np.mean(known, axis=0)

    # np.mean of an empty list is a NaN scalar, which cannot be written out as
    # a vector row; return zeros of the vector size instead.
    size = getattr(vectors, "vector_size", None)
    if size is None:
        try:
            size = len(next(iter(vectors.values())))
        except (AttributeError, StopIteration, TypeError):
            size = 0
    return np.zeros(size)

def process_directory(directory_path, output_path):
    """
    Process all TSV files in a directory and save the document vectors to a TSV file.

    Uses the module-level Word2Vec model ``w2v`` for the word vectors.

    :param directory_path: Path to the directory containing TSV files
    :param output_path: Path to the output TSV file
    :return: Dictionary of document IDs and their corresponding vectors
    """
    document_vectors = {}

    # os.listdir returns names in arbitrary order; sort for a reproducible row order.
    for filename in sorted(os.listdir(directory_path)):
        if not filename.endswith(".tsv"):
            continue
        file_path = os.path.join(directory_path, filename)
        doc_id = os.path.splitext(filename)[0]
        processed_sentences = process_text_file(file_path)
        # Pass the keyed vectors: a gensim 4 Word2Vec model itself supports
        # neither `word in model` nor `model[word]`.
        document_vectors[doc_id] = calculate_document_vectors(processed_sentences, w2v.wv)

    save_vectors_to_tsv(document_vectors, output_path)
    return document_vectors

# Example usage
directory_path = 'corpus'
output_tsv_path = 'space_vec.tsv'
# Uses the Word2Vec model `w2v` trained earlier in the notebook; resetting it to
# None here would make the word-vector lookups in process_directory fail.

document_vectors = process_directory(directory_path, output_tsv_path)

In [11]:
# Flatten the cleaned sentences into one token list, then deduplicate
# (a set is used, so the order of unique_words is not defined).
flat_list = [token for tokens in clear_data for token in tokens]
unique_words = list(set(flat_list))

Задание 1 — сохранение матрицы «термин-документ»

In [12]:
import os
import numpy as np
from collections import defaultdict, Counter
from scipy.sparse import csr_matrix

def process_directory(directory_path):
    """
    Process all TSV files in a directory and collect tokenized documents.

    :param directory_path: Path to the directory containing TSV files
    :return: Dictionary of document IDs and their corresponding token lists
    """
    document_texts = {}
    # os.listdir returns names in arbitrary order; sorting keeps the document
    # order (and hence the matrix row order) reproducible across runs.
    for filename in sorted(os.listdir(directory_path)):
        if not filename.endswith(".tsv"):
            continue
        file_path = os.path.join(directory_path, filename)
        doc_id = os.path.splitext(filename)[0]  # Remove file extension
        # Flatten the per-sentence token lists into one list per document
        document_texts[doc_id] = [token for tokens in process_file(file_path) for token in tokens]
    return document_texts

def process_file(file_path):
    """
    Process a single TSV file to extract and tokenize sentences.

    Each non-empty row holds one token in its second tab-separated column; an
    empty line ends a sentence. Rows without a second column are skipped.

    :param file_path: Path to the TSV file
    :return: List of token lists
    """
    sentences = []
    current_sentence = []

    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            if line.strip():  # Non-empty row: one token
                columns = line.split('\t')
                # Skip malformed rows instead of raising IndexError
                if len(columns) > 1:
                    current_sentence.append(columns[1].strip())
            elif current_sentence:  # Empty line closes the current sentence
                sentences.append(current_sentence)
                current_sentence = []

    # Keep the last sentence when the file does not end with an empty line;
    # previously it was silently dropped.
    if current_sentence:
        sentences.append(current_sentence)

    return sentences

def create_term_document_matrix(doc_texts):
    """
    Build a sparse document-by-token count matrix from tokenized documents.

    Rows follow the iteration order of ``doc_texts``; columns follow the order
    in which tokens are first encountered.

    :param doc_texts: Dictionary of document IDs and their corresponding token lists
    :return: Term-document matrix, document index, token index, and token frequency
    """
    # Corpus-wide frequency of every token; insertion order fixes the column order
    token_frequency = defaultdict(int)
    for tokens in doc_texts.values():
        for token in tokens:
            token_frequency[token] += 1

    vocabulary = list(token_frequency)
    token_index = dict(zip(vocabulary, range(len(vocabulary))))
    doc_index = dict(zip(doc_texts, range(len(doc_texts))))

    # Coordinate-format triplets: (row, column) -> count
    row_ids, col_ids, counts = [], [], []
    for doc_id, tokens in doc_texts.items():
        row = doc_index[doc_id]
        for token, count in Counter(tokens).items():
            if token not in token_index:
                continue
            row_ids.append(row)
            col_ids.append(token_index[token])
            counts.append(count)

    term_doc_matrix = csr_matrix(
        (counts, (row_ids, col_ids)),
        shape=(len(doc_texts), len(vocabulary)),
    )

    return term_doc_matrix, doc_index, token_index, token_frequency

def save_term_document_matrix_to_tsv(term_doc_matrix, doc_index, token_index, output_tsv_path, unique_tokens):
    """
    Write the term-document matrix to a TSV file.

    The file has a header row ('doc_id' followed by the tokens), one row per
    document, and a final 'Total' row with the per-token column sums.

    :param term_doc_matrix: Term-document matrix
    :param doc_index: Dictionary mapping document IDs to indices
    :param token_index: Dictionary mapping tokens to indices (not read here)
    :param output_tsv_path: Path to the output TSV file
    :param unique_tokens: List of unique tokens
    """
    dense = term_doc_matrix.toarray()
    column_totals = dense.sum(axis=0)

    with open(output_tsv_path, 'w', encoding='utf-8') as tsv_file:

        def write_row(cells):
            tsv_file.write('\t'.join(cells) + '\n')

        # Header
        write_row(['doc_id'] + unique_tokens)

        # One row per document
        for doc_id, row_number in doc_index.items():
            write_row([doc_id] + [str(value) for value in dense[row_number]])

        # Column sums
        write_row(['Total'] + [str(total) for total in column_totals])

# Example usage: build the term-document matrix for the corpus and save it
directory_path = 'corpus'
output_tsv_path = 'term_doc_matrix.tsv'

document_texts = process_directory(directory_path)
term_doc_matrix, doc_index, token_index, token_frequency = create_term_document_matrix(document_texts)
# Iterating the frequency mapping yields the tokens in column order
unique_tokens = [*token_frequency]
save_term_document_matrix_to_tsv(
    term_doc_matrix=term_doc_matrix,
    doc_index=doc_index,
    token_index=token_index,
    output_tsv_path=output_tsv_path,
    unique_tokens=unique_tokens,
)