# Laboratory work #3 (text vectorization)

In [None]:
import re
import os
from collections import defaultdict, Counter
import string

from pathlib import Path
import pandas as pd
import nltk
from nltk.corpus import stopwords
nltk.download('punkt')
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))


import matplotlib.pyplot as plt
from pandas.errors import EmptyDataError
import numpy as np
from math import log1p
import gensim

In [None]:
def read_files(root_dir, n=None):
    """Load the annotated ``.tsv`` documents found under *root_dir*.

    Every ``.tsv`` file is parsed as a header-less, tab-separated table whose
    columns are named ``Token``, ``Stem`` and ``Lemma``; rows with missing
    values are dropped.

    Args:
        root_dir: Directory that is walked recursively with ``os.walk``.
        n: Optional cap on how many of the discovered files are considered
            (``None`` means all of them).

    Returns:
        A pair ``(ids, data)`` of equal length: ``ids[i]`` is the file name
        (without extension) of the document stored in ``data[i]``. Empty
        files are reported and skipped, so they appear in neither list.
    """
    file_paths = [
        os.path.join(subdir, file_name)
        for subdir, _dirs, files in os.walk(root_dir)
        for file_name in files
        if file_name.endswith('.tsv')
    ]
    if n is not None:
        # max() keeps a negative n meaning "no files", as the index check did.
        file_paths = file_paths[:max(n, 0)]

    ids = []
    data = []
    for i, file_path in enumerate(file_paths):
        try:
            d = pd.read_csv(file_path, sep='\t', header=None)
        except EmptyDataError as e:
            # An empty file contributes no document. Skipping it avoids
            # re-appending the frame of the previous file (or a NameError
            # when the very first file is empty).
            print(i, file_path, e)
            continue
        d.columns = ['Token', 'Stem', 'Lemma']
        # ids are collected in lockstep with data so both stay aligned.
        ids.append(os.path.splitext(os.path.basename(file_path))[0])
        data.append(d.dropna())

    return ids, data

In [None]:
# Load the annotated corpus splits. A second argument (e.g. 1000) can be
# passed to read_files to cap the number of files while experimenting.
train_ids, train = read_files('../assets/annotated-corpus/train')
# The validation split is not loaded in this notebook:
# val_ids, val = read_files('../assets/annotated-corpus/val')
test_ids, test = read_files('../assets/annotated-corpus/test')

In [None]:
# Preview the first rows of the first loaded training document.
train[0].head()

In [None]:
# Show the ASCII punctuation characters that is_valid_token tests against.
string.punctuation

In [None]:
def is_valid_token(token, token_frequencies, min_frequency=2):
    """Decide whether *token* should be kept in the vocabulary.

    A token is rejected when it is empty or consists solely of ASCII
    punctuation, when its lower-cased form is in the module-level
    ``stop_words`` set, or when its count in *token_frequencies* is below
    *min_frequency*.

    Args:
        token: The token string to check.
        token_frequencies: Mapping from token to its count. Both ``Counter``
            and plain ``dict`` are accepted; unknown tokens count as 0.
        min_frequency: Smallest count a token needs in order to be kept.

    Returns:
        ``True`` if the token passes every filter, ``False`` otherwise.
    """
    # Check every character: a plain ``token in string.punctuation`` is a
    # substring test, which lets multi-character tokens such as "..." or "--"
    # slip through. all() over an empty token is True, so "" is rejected too.
    if all(char in string.punctuation for char in token):
        return False
    if token.lower() in stop_words:
        return False
    # .get() keeps this usable with a plain dict, where indexing an unknown
    # token would raise KeyError.
    return token_frequencies.get(token, 0) >= min_frequency


def get_freqs(dfs):
    """Build corpus token frequencies and a sparse term-document matrix.

    Args:
        dfs: Iterable of DataFrames, one per document, each with a ``Token``
            column.

    Returns:
        A pair ``(token_frequencies, term_document_matrix)``:

        * ``token_frequencies`` is a ``Counter`` with the corpus-wide count of
          every token accepted by ``is_valid_token``, in first-seen order.
        * ``term_document_matrix`` maps a document index (position in *dfs*)
          to a dict ``{token: count in that document}`` restricted to the
          same accepted tokens. Documents without any accepted token have no
          entry.
    """
    # Pass 1: count tokens per document and over the whole corpus. Validity
    # depends on the corpus-wide frequency, so it can only be decided once
    # every document has been counted; filtering against a running count
    # drops the early occurrences of tokens that become frequent later.
    doc_counts = [Counter(df['Token'].tolist()) for df in dfs]
    raw_frequencies = Counter()
    for counts in doc_counts:
        raw_frequencies.update(counts)

    # A dict comprehension keeps first-seen order; going through a set of
    # tuples would make the vocabulary order arbitrary.
    token_frequencies = Counter({
        token: freq
        for token, freq in raw_frequencies.items()
        if is_valid_token(token, raw_frequencies)
    })

    # Pass 2: keep only accepted tokens in each document's counts.
    term_document_matrix = defaultdict(lambda: defaultdict(int))
    for doc_id, counts in enumerate(doc_counts):
        terms = {
            token: freq
            for token, freq in counts.items()
            if token in token_frequencies
        }
        if terms:
            term_document_matrix[doc_id] = terms

    return token_frequencies, term_document_matrix

In [None]:
# Build the filtered token frequencies and the per-document term counts from
# the training split, then show the 20 most frequent tokens.
token_frequencies, term_document_matrix = get_freqs(train)
token_frequencies.most_common(20)

In [None]:
# Term counts recorded for the document with id 0.
term_document_matrix[0]

In [None]:
data_dir = Path('../assets/data/')
data_dir.mkdir(parents=True, exist_ok=True)

# Vocabulary file: one "<token>\t<corpus frequency>" line per valid token.
with (data_dir / 'token_frequencies.tsv').open('w', encoding='utf-8') as freq_file:
    freq_file.writelines(
        f'{token}\t{freq}\n'
        for token, freq in token_frequencies.items()
        if is_valid_token(token, token_frequencies)
    )

# Matrix file: one "<doc_id>\t<token>\t<count>" line per stored cell.
with (data_dir / 'term_document_matrix.tsv').open('w', encoding='utf-8') as matrix_file:
    for doc_id, terms in term_document_matrix.items():
        matrix_file.writelines(
            f'{doc_id}\t{token}\t{freq}\n' for token, freq in terms.items()
        )

In [None]:
data_dir = Path('../assets/data/')

# Reload the vocabulary written as "<token>\t<frequency>" lines. A Counter is
# used so that looking up a word that is not in the vocabulary yields 0
# instead of raising KeyError.
token_frequencies = Counter()
with open(data_dir / 'token_frequencies.tsv', 'r', encoding='utf-8') as file:
    for line in file:
        # Remove only the line terminator: a bare strip() would also eat a
        # token that is itself whitespace (e.g. a non-breaking space) or has
        # leading spaces, changing the token or breaking the unpacking.
        token, freq = line.rstrip('\n').split('\t')
        token_frequencies[token] = int(freq)

# Reload the matrix written as "<doc_id>\t<token>\t<count>" lines into
# {doc_id: {token: count}}.
term_document_matrix = {}
with open(data_dir / 'term_document_matrix.tsv', 'r', encoding='utf-8') as file:
    for line in file:
        doc_id, token, freq = line.rstrip('\n').split('\t')
        doc_id = int(doc_id)
        freq = int(freq)
        term_document_matrix.setdefault(doc_id, {})[token] = freq

In [None]:
# Term counts of document 0 as read back from the TSV file.
term_document_matrix[0]

In [None]:
def get_term_document_vector(token, term_document_matrix):
    """Return the count of *token* in every document of the matrix.

    The result is a list with one entry per document, in the iteration order
    of *term_document_matrix*; documents that do not contain the token
    contribute 0.
    """
    return [terms.get(token, 0) for terms in term_document_matrix.values()]

In [None]:
# First five per-document counts for the token 'Reuters'.
get_term_document_vector('Reuters', term_document_matrix)[:5]

In [None]:
# First five per-document counts for the token 'cat'.
get_term_document_vector('cat', term_document_matrix)[:5]

In [None]:
def preprocess_text(text):
    """Split raw *text* into sentences and every sentence into tokens.

    Returns a list with one entry per non-empty sentence; each entry is the
    list of tokens found in that sentence (possibly empty when the sentence
    contains nothing the token pattern matches).
    """
    # A sentence boundary is the zero-width position right after ".", "?" or
    # "!" that is followed by whitespace or "#". The two negative lookbehinds
    # veto a boundary after dotted abbreviations such as "U.S." and after
    # capitalised two-letter abbreviations such as "Mr.".
    boundary = re.compile(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?|!)(?=\s|[#])')
    # Alternatives are tried in order: pic.twitter.com links, http(s) URLs,
    # www. URLs, hashtags, @handles, then plain words with an optional inner
    # apostrophe (optionally preceded by a hyphen).
    token_re = re.compile(r'pic.twitter.com/\S+|https?://\S+|www\.\S+|\#\S+|\@\w+|\b\w+\'?\w*|-?\w+\'?\w*')

    stripped_chunks = (chunk.strip() for chunk in boundary.split(text))
    return [token_re.findall(sentence) for sentence in stripped_chunks if sentence]

In [None]:
def compute_tf(sentence_tokens, token):
    """Return the relative frequency of *token* within *sentence_tokens*.

    Args:
        sentence_tokens: List of the tokens of one sentence.
        token: The token whose share of the sentence is wanted.

    Returns:
        Occurrences of *token* divided by the sentence length, or ``0.0`` for
        an empty sentence (instead of raising ``ZeroDivisionError``).
    """
    if not sentence_tokens:
        return 0.0
    return sentence_tokens.count(token) / len(sentence_tokens)


def compute_idf(token, term_document_matrix, total_documents):
    """Return a smoothed inverse document frequency for *token*.

    The value is ``log1p(total_documents / (1 + df))``, where ``df`` is the
    number of documents in *term_document_matrix* that contain the token.
    """
    containing_docs = 0
    for terms in term_document_matrix.values():
        if token in terms:
            containing_docs += 1
    return log1p(total_documents / (1 + containing_docs))


def process_text_and_create_matrices(text, token_frequencies, term_document_matrix):
    """Vectorize *text* with term-frequency and TF-IDF weights.

    The text is split into sentences and tokens with ``preprocess_text``.
    For each sentence a vector as long as the longest sentence is built: the
    entry at position ``i`` holds the weight of the sentence's ``i``-th token
    if that token is in the vocabulary, and 0 otherwise. The sentence vectors
    are then averaged into one document vector.

    NOTE(review): entries are indexed by token position in the sentence, not
    by vocabulary index, so vectors of different texts are not aligned by
    term. This layout is kept unchanged to preserve the return shape; confirm
    whether a vocabulary-indexed vector was intended.

    Args:
        text: Raw text to vectorize.
        token_frequencies: Mapping whose keys form the vocabulary.
        term_document_matrix: Mapping ``{doc_id: {token: count}}`` used for
            the document frequencies.

    Returns:
        A pair ``(document_vector_freq, document_vector_tfidf)`` of 1-D NumPy
        arrays whose length is the longest sentence length. Both are empty
        when the text contains no sentence.
    """
    tokenized_sentences = preprocess_text(text)
    if not tokenized_sentences:
        # max() below would raise ValueError on an empty sequence.
        return np.zeros(0), np.zeros(0)

    total_documents = len(term_document_matrix)
    max_sentence_length = max(len(sentence) for sentence in tokenized_sentences)

    frequency_matrix = np.zeros((len(tokenized_sentences), max_sentence_length))
    tfidf_matrix = np.zeros_like(frequency_matrix)

    # IDF depends only on the token, and computing it scans every document,
    # so it is computed once per distinct token.
    idf_cache = {}

    for row, sentence in enumerate(tokenized_sentences):
        for col, token in enumerate(sentence):
            # Mapping membership is O(1); a sorted key list would be scanned
            # linearly for every token.
            if token not in token_frequencies:
                continue
            if token not in idf_cache:
                idf_cache[token] = compute_idf(token, term_document_matrix, total_documents)

            tf = compute_tf(sentence, token)
            frequency_matrix[row, col] = tf
            tfidf_matrix[row, col] = tf * idf_cache[token]

    document_vector_freq = frequency_matrix.mean(axis=0)
    document_vector_tfidf = tfidf_matrix.mean(axis=0)

    return document_vector_freq, document_vector_tfidf

In [None]:
# Sample raw text (with a hashtag, handles and links) used to try out the
# vectorization in the next cell.
text = 'Boos and chants of  Lock her up!  were heard in the crowd assembled at the West Front of the U.S. Capitol Friday morning when defeated Democratic Party presidential nominee Hillary Clinton was introduced at the inaugural ceremony for President-elect Donald Trump.#InaugurationDay Lock her up pic.twitter.com/APVtyyYote  Bill Simms (@Mittens1245) January 20, 2017The crowd on the mall booed when the jumbotron showed a close-up shot of Hillary Clinton at #Inauguration https://t.co/1dvY5lxdKo  gpbnews (@gpbnews) January 20, 2017Some in crowd chanting LOCK HER UP as Hillary Clinton arrives  Jamie Dupree (@jamiedupree) January 20, 2017Via: Gateway Pundit '
print(text)

In [None]:
# Vectorize the sample text and show the shapes of the two returned vectors.
document_vector_freq, document_vector_tfidf = process_text_and_create_matrices(text, token_frequencies, term_document_matrix)
document_vector_freq.shape, document_vector_tfidf.shape

In [None]:
# Averaged term-frequency vector of the sample text.
document_vector_freq

In [None]:
# Averaged TF-IDF vector of the sample text.
document_vector_tfidf

In [None]:
# Token list of every training document, restricted to tokens that are in
# the vocabulary and pass is_valid_token.
train_texts = [
    [
        tok
        for tok in doc['Token'].to_list()
        if tok in token_frequencies and is_valid_token(tok, token_frequencies)
    ]
    for doc in train
]

In [None]:
# First ten kept tokens of the first training document.
train_texts[0][:10]

In [None]:
# Train Word2Vec on the filtered training token lists: 30-dimensional
# vectors, window of 5, min_count of 2, 4 worker threads.
# NOTE(review): no seed is passed and several workers are used, so the
# vectors presumably differ between runs - confirm if reproducibility matters.
model = gensim.models.Word2Vec(sentences=train_texts, vector_size=30, window=5, min_count=2, workers=4)

In [None]:
# Make sure the models directory exists, then save the trained model there.
Path('../models/').mkdir(parents=True, exist_ok=True)
model_path = '../models/word2vec.model'
model.save(model_path)

In [None]:
# Corpus frequencies of the probe word 'Monday' and its comparison groups.
# .get(..., 0) reports 0 for a word missing from the vocabulary instead of
# raising KeyError, which plain-dict indexing does after the TSV reload.
print('Word:', token_frequencies.get('Monday', 0))
print('Close:', token_frequencies.get('Tuesday', 0), token_frequencies.get('Wednesday', 0), token_frequencies.get('Thursday', 0))
print('Same area', token_frequencies.get('weekend', 0), token_frequencies.get('day', 0), token_frequencies.get('week', 0))
print('Other semantic', token_frequencies.get('funds', 0), token_frequencies.get('town', 0), token_frequencies.get('territory', 0))

In [None]:
# Corpus frequencies of the probe word 'north' and its comparison groups.
# .get(..., 0) reports 0 for a word missing from the vocabulary instead of
# raising KeyError, which plain-dict indexing does after the TSV reload.
print('Word:', token_frequencies.get('north', 0))
print('Close:', token_frequencies.get('south', 0), token_frequencies.get('west', 0), token_frequencies.get('east', 0))
print('Same area', token_frequencies.get('world', 0), token_frequencies.get('side', 0), token_frequencies.get('direction', 0))
print('Other semantic', token_frequencies.get('party', 0), token_frequencies.get('senator', 0), token_frequencies.get('husband', 0))

In [None]:
# Corpus frequencies of the probe word 'Spain' and its comparison groups.
# .get(..., 0) reports 0 for a word missing from the vocabulary instead of
# raising KeyError, which plain-dict indexing does after the TSV reload.
print('Word:', token_frequencies.get('Spain', 0))
print('Close:', token_frequencies.get('Madrid', 0), token_frequencies.get('Catalonia', 0), token_frequencies.get('Europe', 0))
print('Same area', token_frequencies.get('Brexit', 0), token_frequencies.get('kingdom', 0), token_frequencies.get('EU', 0))
print('Other semantic', token_frequencies.get('Trump', 0), token_frequencies.get('Twitter', 0), token_frequencies.get('Korea', 0))

In [None]:
def cosine_similarity(vec_a, vec_b):
    """Return the cosine of the angle between *vec_a* and *vec_b*.

    Args:
        vec_a: First vector (any 1-D array-like of numbers).
        vec_b: Second vector, same length as *vec_a*.

    Returns:
        ``dot(a, b) / (|a| * |b|)``, or ``0.0`` when either vector has zero
        norm. An all-zero vector (e.g. the term-document vector of a token
        that occurs in no document) would otherwise produce nan/inf and a
        NumPy runtime warning.
    """
    norm_product = np.linalg.norm(vec_a) * np.linalg.norm(vec_b)
    if norm_product == 0:
        return 0.0
    return np.dot(vec_a, vec_b) / norm_product


# Probe words and, for each of them, three hand-picked comparison groups.
words_to_analyze = ['Monday', 'north', 'Spain']

similar_words = dict(
    Monday=['Tuesday', 'Wednesday', 'Thursday'],
    north=['south', 'west', 'east'],
    Spain=['Madrid', 'Catalonia', 'Europe'],
)

related_words = dict(
    Monday=['weekend', 'day', 'week'],
    north=['world', 'side', 'direction'],
    Spain=['Brexit', 'kingdom', 'EU'],
)

unrelated_words = dict(
    Monday=['funds', 'town', 'territory'],
    north=['party', 'senator', 'husband'],
    Spain=['Trump', 'Twitter', 'Korea'],
)

# For every probe word, print its cosine similarity (computed on the
# Word2Vec vectors) to each word of each group.
for word in words_to_analyze:
    word_vec = model.wv[word]
    print(f'Cosine distances for "{word}":')
    for group, words in (
        ('Similar', similar_words),
        ('Related', related_words),
        ('Unrelated', unrelated_words),
    ):
        distances = {}
        for target_word in words[word]:
            distances[target_word] = cosine_similarity(word_vec, model.wv[target_word])
        print(f'\t{group}: {distances}')

In [None]:
# Same comparison as for Word2Vec, but on the raw per-document count vectors
# taken from the term-document matrix.
for word in words_to_analyze:
    word_vec = get_term_document_vector(word, term_document_matrix)
    print(f'Cosine distances for "{word}":')
    for group, words in (
        ('Similar', similar_words),
        ('Related', related_words),
        ('Unrelated', unrelated_words),
    ):
        distances = {}
        for target_word in words[word]:
            target_vec = get_term_document_vector(target_word, term_document_matrix)
            distances[target_word] = cosine_similarity(word_vec, target_vec)
        print(f'\t{group}: {distances}')

In [None]:
# Dense zero-initialized matrix with one row per vocabulary token and one
# column per document of the term-document matrix.
term_document_df = np.zeros((len(token_frequencies), len(term_document_matrix)))

In [None]:
# (number of tokens, number of documents)
term_document_df.shape

In [None]:
# Fill the dense matrix: row i belongs to the i-th key of token_frequencies,
# column j to the j-th document of term_document_matrix (iteration order).
# Only the stored non-zero counts are visited, instead of building a full
# per-document vector for every token, and counts are written directly
# rather than through a lossy float16 cast.
row_of = {term: row for row, term in enumerate(token_frequencies.keys())}
term_document_df[:] = 0  # every row is overwritten, as before
for col, terms in enumerate(term_document_matrix.values()):
    for term, freq in terms.items():
        row = row_of.get(term)
        # Tokens outside the vocabulary have no row and are ignored.
        if row is not None:
            term_document_df[row, col] = freq

In [None]:
# term_document_df_ = term_document_df[:5000, :]

In [None]:
from sklearn.decomposition import PCA


# Target dimensionality; the Word2Vec model above also uses 30.
n_components = 30

# Project every row (token) of term_document_df onto 30 principal components
# and show the resulting shape.
# NOTE(review): despite the variable name, the rows filled above are raw
# per-document counts; no tf-idf weighting is visible here - confirm.
pca = PCA(n_components=n_components)
reduced_tfidf_vectors = pca.fit_transform(term_document_df)
reduced_tfidf_vectors.shape

In [None]:
# Persist the reduced vectors as a NumPy .npy file.
np.save('../assets/reduced_tfidf_vectors.npy', reduced_tfidf_vectors)

In [None]:
# Wrap the reduced array in a DataFrame so rows can be looked up by label.
reduced_tfidf_vectors = pd.DataFrame.from_records(reduced_tfidf_vectors)

In [None]:
# Label each row with its token; this relies on the matrix rows having been
# filled in token_frequencies key order.
reduced_tfidf_vectors.index = list(token_frequencies.keys())

In [None]:
# Preview the first rows of the labelled reduced vectors.
reduced_tfidf_vectors.head()

In [None]:
# Repeat the similarity comparison on the PCA-reduced term vectors.
for word in words_to_analyze:
    try:
        word_vec = reduced_tfidf_vectors.loc[word]
        print(f'Cosine distances for "{word}":')
        for group, words in [('Similar', similar_words), ('Related', related_words), ('Unrelated', unrelated_words)]:
            distances = {target_word: cosine_similarity(word_vec, reduced_tfidf_vectors.loc[target_word]) for target_word in words[word]}
            print(f'\t{group}: {distances}')
    except KeyError:
        # .loc raises KeyError for a label missing from the index, i.e. a
        # word outside the vocabulary. Only that case is reported; a bare
        # except would also hide real errors and KeyboardInterrupt.
        print('no words')

In [None]:
def vectorize_with_w2v(text, model):
    """Embed *text* as the mean of its sentence vectors.

    The text is tokenized with ``preprocess_text``. Each sentence vector is
    the mean of the vectors of its words that are in ``model.wv``; sentences
    without any known word are skipped. When no sentence yields a vector, a
    zero vector of length ``model.vector_size`` is returned.
    """
    known_words = model.wv.key_to_index
    sentence_vectors = []

    for sentence in preprocess_text(text):
        word_vectors = [model.wv[word] for word in sentence if word in known_words]
        if not word_vectors:
            continue
        sentence_vectors.append(np.mean(word_vectors, axis=0))

    if not sentence_vectors:
        return np.zeros(model.vector_size)
    return np.mean(sentence_vectors, axis=0)

In [None]:
# Rebuild a text from the kept tokens of the first training document, print
# it, and print the shape of its Word2Vec document vector.
text = ' '.join(train_texts[0])
print(text)
print(vectorize_with_w2v(text, model).shape)

In [None]:
# Token list of every test document, restricted to tokens that are in the
# (training) vocabulary and pass is_valid_token.
test_texts = [
    [
        tok
        for tok in doc['Token'].to_list()
        if tok in token_frequencies and is_valid_token(tok, token_frequencies)
    ]
    for doc in test
]

In [None]:
# First ten kept tokens of the first test document.
test_texts[0][:10]

In [None]:
# One document vector per test document: the kept tokens are joined with
# spaces and passed to vectorize_with_w2v, which tokenizes the string again.
test_vectors = [vectorize_with_w2v(' '.join(text), model) for text in test_texts]

In [None]:
# Shape of the first test document vector.
test_vectors[0].shape

In [None]:
# Write one line per test document: "<doc id>\t<v1>\t<v2>...". The encoding
# is set explicitly, like the other writers in this notebook, so the output
# does not depend on the platform default.
with open('../assets/annotated-corpus/test-embeddings.tsv', 'w', encoding='utf-8') as file:
    for doc_id, vector in zip(test_ids, test_vectors):
        vector_str = '\t'.join(map(str, vector))
        file.write(f'{doc_id}\t{vector_str}\n')