In [1]:
# https://github.com/Ko4eBHuK/nlp-23-spring/tree/main/tasks/task-03
!python --version

Python 3.9.16


In [45]:
import regex as re
import os
import nltk
from nltk.corpus import stopwords
from google.colab import drive
import pandas as pd


drive.mount('/content/gdrive')

nltk.download('stopwords')

assets_url = 'gdrive/My Drive/Colab Notebooks/nlp-2023/assets/train/'
assets_generated_url = 'gdrive/My Drive/Colab Notebooks/nlp-2023/assets/task-3/'
stops = set(stopwords.words('english'))

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [68]:
token_freq_dict = dict()
file_names_contents_dict = dict()
S = 0

for address, dirs, files in os.walk(assets_url):
  for file_name in files:
    file_content = []
    with open(os.path.join(address, file_name), mode='r') as annotated_document_file:
      for sentence in annotated_document_file.read().split('\n\n'):
        for annotation in sentence.split('\n'):
          word_stem_lem = annotation.split('\t')
          if len(word_stem_lem) == 3:
            token = word_stem_lem[0]
            # Очистить полученные данные от знаков пунктуации. Можно использовать регулярное выражение: [^\P{P}-]+;
            if not re.match('[^\P{P}-]+', token):
              token = re.sub('[^\P{P}-]+', '', token) # Привести полученные данные к нижнему регистру;
              # Очистить полученные данные от стоп слов. Можно использовать nltk.corpus.stopwords;
              if not token in stops:
                # Словарь токенов с их частотами по всем данным
                token_freq_dict[token] = token_freq_dict.get(token, 0) + 1
                file_content.append(token)
                S += 1
    file_names_contents_dict[file_name] = file_content

In [40]:
# save files-contents dict
with open(assets_generated_url + 'file-content-dict.tsv', 'w') as tsvfile:
        for key, value in file_names_contents_dict.items():
            tsvfile.write(f"{key}\t{value}\n")

In [52]:
# read file-content-dict, token_freq and term-doc-matrix from files
file_names_contents_df = pd.read_table(assets_generated_url + 'file-content-dict.tsv', )
tokens_freq_df = pd.read_csv(assets_generated_url + 'token-freq.csv')

In [None]:
term_doc_matrix_df = pd.read_csv(assets_generated_url + 'term-doc-matrix.csv', index_col=0)

In [53]:
file_names_contents_df

Unnamed: 0,file_name,content
0,115829.tsv,"['Alberta', 'says', 'public', 'inquiry', 'Calg..."
1,115832.tsv,"['Abbas', 'Calls', 'Palestinians', 'Drop', 'Ar..."
2,115843.tsv,"['Worlds', 'Tallest', 'Bridge', 'Inaugurated',..."
3,115845.tsv,"['Iran', 'May', 'Negotiate', 'With', 'US', 'Ov..."
4,115846.tsv,"['Syria', 'Bomb', 'shows', 'Israel', 'want', '..."
...,...,...
119995,3012.tsv,"['Juniper', 'engineers', 'depart', 'start', '-..."
119996,3013.tsv,"['Real', 'gets', 'flamed', 'iPod', 'campaign',..."
119997,3014.tsv,"['Hynix', 'build', 'chip', 'plant', 'China', '..."
119998,3015.tsv,"['Researchers', 'find', 'holes', 'XP', 'SP2', ..."


In [None]:
tokens_freq_df

In [None]:
term_doc_matrix_df

In [None]:
# 1. Результаты необходимо сохранить во внешние файлы в произвольном формате
  # Сохранение словаря частот токенов
import csv

token_freq_csv_file_url = assets_generated_url + 'token-freq.csv'

with open(token_freq_csv_file_url, 'w') as csvfile:
        csvfile.write('token,frequency\n')
        for key, value in token_freq_dict.items():
            csvfile.write(f"{key},{value}\n")

In [22]:
top_2k_tokens_by_freq = dict(sorted(dict(zip(tokens_freq_df.token, tokens_freq_df.frequency)).items(), key=lambda x: x[1], reverse=True)[:2000])

In [55]:
first_50k_docs_with_content = dict(list(dict(zip(file_names_contents_df.file_name, file_names_contents_df.content)).items())[:50000])

In [None]:
first_50k_docs_with_content

In [None]:
# generate term-doc matrix
term_doc_matrix = pd.DataFrame(data=0, index=first_50k_docs_with_content.keys(), columns=top_2k_tokens_by_freq.keys())

for doc_name, tokens in first_50k_docs_with_content.items():
  all_tokens = len(tokens)
  for token in tokens:
    if token in top_2k_tokens_by_freq.keys():
      term_doc_matrix[token][doc_name] += 1

# 1. Результаты необходимо сохранить во внешние файлы в произвольном формате
  # Сохранение term-doc-matrix
term_doc_matrix.to_csv(assets_generated_url + 'term-doc-matrix.csv')

In [57]:
# 2. Реализовать один из базовых методов векторизации произвольного текста
# Разработать метод, позволяющий преобразовать произвольный текст в вектор значений tf-idf,
  # с использованием словаря наиболее частых слов и матрицы "термин-документ", полученных ранее (на шаге 1);

import math

def get_TF_vec_of_doc(doc_text, available_tokens):
  doc_tokens = re.findall(r'[^\s.!?\-;:]+', doc_text)
  TF_vec = pd.DataFrame(data=0, index=[0], columns=available_tokens)
  for doc_token in doc_tokens:
    if doc_token in available_tokens:
      TF_vec[doc_token] += 1
  for token in available_tokens:
    TF_vec[token] = TF_vec[token]/len(available_tokens)
  return TF_vec

def getIDF_vec(term_doc_matrix, available_tokens):
  IDF_vec = pd.DataFrame(data=0, index=[0], columns=available_tokens.keys())
  for token in available_tokens:
    for index, row in term_doc_matrix.iterrows():
      if row[token] != 0:
        IDF_vec[token] += 1
  for token in available_tokens:
    if IDF_vec[token][0] != 0:
      IDF_vec[token] = math.log(term_doc_matrix.shape[0] / IDF_vec[token])
  return IDF_vec

In [61]:
get_TF_vec_of_doc('Bebkiovan shmebra wants new anime. Such  two two tow W W W W W W W W W W W W W W W W Wunicly, only it! Reks shemks gres.', top_2k_tokens_by_freq.keys())

Unnamed: 0,-,The,US,said,new,A,year,first,company,two,...,century,traffic,W,seeded,semi,rain,vendors,hour,stadium,death
0,0.0,0.0,0.0,0.0,0.0005,0.0,0.0,0.0,0.0,0.001,...,0.0,0.0,0.008,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [63]:
term_doc_matrix.head()

Unnamed: 0,-,The,US,said,new,A,year,first,company,two,...,century,traffic,W,seeded,semi,rain,vendors,hour,stadium,death
116024.tsv,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
116028.tsv,2,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
116031.tsv,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
116032.tsv,1,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
116033.tsv,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [65]:
getIDF_vec(term_doc_matrix.head(150), top_2k_tokens_by_freq)

Unnamed: 0,-,The,US,said,new,A,year,first,company,two,...,century,traffic,W,seeded,semi,rain,vendors,hour,stadium,death
0,0.415515,1.544899,4.317488,3.401197,2.525729,3.064725,2.931194,2.445686,5.010635,3.401197,...,0,0,0,0,0,0,0,4.317488,3.401197,0


In [66]:
# 3. Реализовать метод, позволяющий векторизовать произвольный текст с использованием нейронных сетей (предлагается использовать стандартную реализацию модели w2v или glove). 
# Выбранную модель необходимо обучить на обучающей выборке.
from gensim.models import Word2Vec
from gensim.test.utils import common_texts

In [74]:
list_of_lists_in_string_representation = list(first_50k_docs_with_content.values())

In [81]:
data = []
for list_in_string_representation in list_of_lists_in_string_representation:
  list_in_normal_form = list_in_string_representation.strip('][').split(', ')
  clear_list = []
  for word in list_in_normal_form:
    word_clear = word[1:]
    word_clear = word_clear[:-1]
    clear_list.append(word_clear)
  data.append(clear_list)

In [None]:
data

In [83]:
w2v_model = Word2Vec(
    min_count=10,
    window=2,
    vector_size=300,
    negative=10,
    alpha=0.03,
    min_alpha=0.0007,
    sample=6e-5,
    sg=1)

In [84]:
w2v_model.build_vocab(data)

In [85]:
w2v_model.train(data, total_examples=w2v_model.corpus_count, epochs=5, report_delay=1)

(4181736, 6883220)

In [87]:
w2v_model.save(assets_generated_url + 'word2vec.model')

In [88]:
w2v_model = Word2Vec.load(assets_generated_url + 'word2vec.model')

In [None]:
w2v_model.wv.get_vector('walk')

In [131]:
# 4. С использованием библиотечной реализации метода подсчета косинусного расстояния между векторными представлениями текста, 
# продемонстрировать на примерах, что для семантически близких слов модель генерирует вектора, для которых косинусное расстояние меньше, чем для семантически далеких токенов

# изменяется от 0 до 1 и чем больше - тем лсова более похожи
from scipy import spatial
def cosine_lib(a, b):
    return 1 - spatial.distance.cosine(a, b)

def cosinus_similarity_of_vectors(vec_1, vec_2):
  min_vec_size = len(vec_1)
  if min_vec_size > len(vec_2):
    min_vec_size = len(vec_2)
  dot_product_of_vectors = 0
  vec_1_magnitude = 0
  vec_2_magnitude = 0
  for vec_param_index in range(0,min_vec_size):
    dot_product_of_vectors += vec_1[vec_param_index] * vec_2[vec_param_index]
    vec_1_magnitude += vec_1[vec_param_index] * vec_1[vec_param_index]
    vec_2_magnitude += vec_2[vec_param_index] * vec_2[vec_param_index]
  vec_1_magnitude = math.sqrt(vec_1_magnitude)
  vec_2_magnitude = math.sqrt(vec_2_magnitude)

  return dot_product_of_vectors / (vec_1_magnitude * vec_2_magnitude)

In [132]:
set_of_examples = {
	'step': [['trip', 'walk'], ['count', 'destination'], ['injured', 'center']],
	'coach': [['father', 'player'], ['hattrick', 'postgame', 'scoreboard'], ['engagement', 'refund']],
	'photo': [['picture', 'image', 'painting'], ['file', 'art'], ['postgame', 'wet']],
	'ocean': [['lake', 'water', 'river'], ['fish', 'boat'], ['snap', 'Design']]
}

for example_word, tests in set_of_examples.items():
  print(f"analyzing word - {example_word}")
  exmaple_word_vec = w2v_model.wv.get_vector(example_word)
  print("\tsame meaning words scores:")
  for same_meaning_word in tests[0]:
    same_meaning_word_vec = w2v_model.wv.get_vector(same_meaning_word)
    my_cos_dist = cosinus_similarity_of_vectors(exmaple_word_vec, same_meaning_word_vec)
    lib_cos_dist = cosine_lib(exmaple_word_vec, same_meaning_word_vec)
    print(f"\t\tdistance for {same_meaning_word}")
    print(f"\t\t\t\tmy  = {my_cos_dist}")
    print(f"\t\t\t\tlib = {lib_cos_dist}")
  print("\tsame scope words scores:")
  for same_scope_word in tests[1]:
    same_scope_word_vec = w2v_model.wv.get_vector(same_meaning_word)
    my_cos_dist = cosinus_similarity_of_vectors(exmaple_word_vec, same_scope_word_vec)
    lib_cos_dist = cosine_lib(exmaple_word_vec, same_scope_word_vec)
    print(f"\t\tdistance for {same_scope_word}")
    print(f"\t\t\t\tmy  = {my_cos_dist}")
    print(f"\t\t\t\tlib = {lib_cos_dist}")
  print("\tdifferent scope words scores:")
  for diff_scope_word in tests[2]:
    diff_scope_word_vec = w2v_model.wv.get_vector(same_meaning_word)
    my_cos_dist = cosinus_similarity_of_vectors(exmaple_word_vec, diff_scope_word_vec)
    lib_cos_dist = cosine_lib(exmaple_word_vec, diff_scope_word_vec)
    print(f"\t\tdistance for {diff_scope_word}")
    print(f"\t\t\t\tmy  = {my_cos_dist}")
    print(f"\t\t\t\tlib = {lib_cos_dist}")

analyzing word - step
	same meaning words scores:
		distance for trip
				my  = 0.6647100597435188
				lib = 0.6647101044654846
		distance for walk
				my  = 0.7004155229419728
				lib = 0.7004154920578003
	same scope words scores:
		distance for count
				my  = 0.7004155229419728
				lib = 0.7004154920578003
		distance for destination
				my  = 0.7004155229419728
				lib = 0.7004154920578003
	different scope words scores:
		distance for injured
				my  = 0.7004155229419728
				lib = 0.7004154920578003
		distance for center
				my  = 0.7004155229419728
				lib = 0.7004154920578003
analyzing word - coach
	same meaning words scores:
		distance for father
				my  = 0.5633526901537409
				lib = 0.5633527040481567
		distance for player
				my  = 0.6117776179980862
				lib = 0.611777663230896
	same scope words scores:
		distance for hattrick
				my  = 0.6117776179980862
				lib = 0.611777663230896
		distance for postgame
				my  = 0.6117776179980862
				lib = 0.611777663230896
		distance for

In [None]:
set_of_examples

In [None]:
#step - frequency, stride, trip - count, destination - injured, center
#coach - father, player - hattrick, postgame, scoreboard - engagement, refund
#photo - picture, image, painting - file, art - postgame, wet
#ocean - lake, water, river - fish, boat - snap, Design
#heritage - right, tradition - refund, ancestor, engagement - cuff, relocation, county 

In [None]:
# 5. Применить какой-либо метод сокращения размерностей полученных одним из базовых способов векторизации, выбранным ранее (см. пункт 2), векторов

In [None]:
# 6. С использованием разработанного метода подсчета косинусного расстояния сравнить эффективность метода векторизации с использованием 
#  нейронных сетей и эффективность базовых методов векторизации с последующим сокращением размерности.
# Сформулировать вывод о том, применение какого способа позволяет получить лучшие результаты на выбранном датасете.

In [None]:
# 7. Реализовать метод, осуществляющий векторизацию произвольного текста по следующему алгоритму: https://github.com/Ko4eBHuK/nlp-23-spring/tree/main/tasks/task-03

In [None]:
# 8. Выполнить векторизацию тестовой выборки с использованием метода, реализованного на предыдущем шаге.