In [None]:
import os
import re
import json
import pandas as pd
import numpy as np
import seaborn as sns
from nltk.corpus import stopwords
from math import log10
from gensim.models import Word2Vec
from sklearn.decomposition import PCA

In [None]:
# Root of the annotated training corpus; its entries are listed as `topics` below.
train_dir = os.path.realpath("../assets/annotated-corpus/train")
# Output directory for the dictionary and the term-document matrix.
result_dir = os.path.realpath("../assets/wordcount/train")

In [None]:
# Every entry of the training directory is used as a topic folder by the loops below.
topics = os.listdir(train_dir)
topics

# Task 1

In [None]:
def get_stems_processed(filepath):
    """Read an annotated corpus file and return its stems grouped by sentence.

    The file is read as blocks separated by an empty line (one block per
    sentence), each block holding one token per line as tab-separated fields.
    Field 1 is taken as the stem and field 2 as the lemma; field 0 is not used.

    A stem is kept when its lemma is not an English stopword and the lemma
    matches ``pattern`` from its first character (i.e. it starts with ASCII
    letters).

    Lines with fewer than three tab-separated fields (for example a blank line
    left by a trailing newline) are ignored instead of raising ``IndexError``,
    and a block is skipped only when it contains no usable line at all.

    Args:
        filepath: Path of the annotated file to read.

    Returns:
        list[list[str]]: one list of stems per sentence. A sentence whose
        tokens were all filtered out is kept as an empty list.
    """
    pattern = r"([A-Za-z]+[-.@]?)+\w*\.?"
    # Fetch the stopword list once per call (not once per token) and use a set
    # so the membership test is O(1).
    stop_words = set(stopwords.words("english"))
    with open(filepath) as f:
        text = f.read()

    sentences = []
    for block in text.split("\n\n"):
        rows = [line.split("\t") for line in block.split("\n")]
        # Keep only well-formed token lines: index, stem, lemma (at least).
        rows = [row for row in rows if len(row) >= 3]
        if not rows:
            continue
        stems = [
            row[1]
            for row in rows
            if row[2] not in stop_words and re.match(pattern, row[2]) is not None
        ]
        sentences.append(stems)
    return sentences

In [None]:
def count_words_in_sentences(sentences):
    """Count how often each word occurs across all given sentences.

    Args:
        sentences: Iterable of sentences, each an iterable of hashable tokens.

    Returns:
        dict: token -> total number of occurrences, with keys in order of
        first appearance. Empty input gives an empty dict.
    """
    word_count = {}
    for sentence in sentences:
        for w in sentence:
            word_count[w] = word_count.get(w, 0) + 1
    return word_count

In [None]:
import datetime

# Parse every training file once, keeping the per-document stems in
# `all_documents` and accumulating corpus-wide stem counts in `word_dict_raw`.
all_documents = []
word_dict_raw = {}
cntr = 0
total_len = sum(len(os.listdir(os.path.join(train_dir, t))) for t in topics)
print(f"Total files for process: {total_len}")

# The timer is started once and reset only after each report, so the printed
# duration always covers the last 1000 files even when they span several topics.
start_time = datetime.datetime.now()
for t in topics:
    workdir = os.path.join(train_dir, t)
    for filename in os.listdir(workdir):
        stems = get_stems_processed(os.path.join(workdir, filename))
        all_documents.append(stems)
        counts = count_words_in_sentences(stems)
        for w, n_occurrences in counts.items():
            word_dict_raw[w] = word_dict_raw.get(w, 0) + n_occurrences
        cntr += 1
        if cntr % 1000 == 0:
            print(f"Processed {cntr} files. 1000 files per: {(datetime.datetime.now() - start_time).total_seconds()}s")
            start_time = datetime.datetime.now()

In [None]:
# Keep only the stems seen at least 3 times in the whole corpus
# (original key order is preserved).
word_dict = {
    stem: total
    for stem, total in word_dict_raw.items()
    if not total < 3
}

In [None]:
# Make sure the output directory exists, then persist the filtered dictionary.
os.makedirs(result_dir, exist_ok=True)
dictionary_path = os.path.join(result_dir, "dictionary.json")
with open(dictionary_path, "w") as dictionary_file:
    json.dump(word_dict, dictionary_file, indent=1)

In [None]:
# Build the term-document matrix row by row: one row per training file, one
# column per dictionary entry (in `word_dict` key order).
doc_names = []
matrix_arr = []
cnt = 0
# Start the timer once; it is reset only after each report so the printed
# duration covers the last 1000 files even across topic boundaries.
start_time = datetime.datetime.now()
for t in topics:
    workdir = os.path.join(train_dir, t)
    for filename in os.listdir(workdir):
        doc_names.append(t + "/" + filename)
        stems = get_stems_processed(os.path.join(workdir, filename))
        counts = count_words_in_sentences(stems)
        vec = [counts.get(w, 0) for w in word_dict]
        matrix_arr.append(vec)
        # A document with no dictionary words at all is worth flagging.
        if sum(vec) == 0:
            print("Zero vector for document", filename)
        cnt += 1
        if cnt % 1000 == 0:
            print(f"Processed {cnt} files. 1000 Files Per: {(datetime.datetime.now() - start_time).total_seconds()}s")
            start_time = datetime.datetime.now()


In [None]:
# Frame holding the first quarter of the matrix rows, labelled with the
# dictionary words (columns) and the matching document names (index).
m_len = len(matrix_arr)
first_quarter_end = int(m_len / 4)
m_1 = pd.DataFrame(matrix_arr[:first_quarter_end])
m_1.columns = list(word_dict)
m_1.index = doc_names[:first_quarter_end]

In [None]:
# Write the current `m_1` slice to term-document.csv (default write mode, header included).
m_1.to_csv(os.path.join(result_dir, "term-document.csv"))

In [None]:
# Append every remaining quarter of the matrix to term-document.csv.
# Starting at 3/4 (as before) left the rows between 1/4 and 3/4 out of the
# chunked file, so the quarters after the first are written one at a time here.
m_len = len(matrix_arr)
csv_path = os.path.join(result_dir, "term-document.csv")
bounds = [int(k * m_len / 4) for k in (1, 2, 3)] + [m_len]
for lo, hi in zip(bounds, bounds[1:]):
    if lo == hi:
        # Nothing to write for an empty slice.
        continue
    m_1 = pd.DataFrame(matrix_arr[lo:hi])
    m_1.columns = word_dict.keys()
    m_1.index = doc_names[lo:hi]
    # The header was already written together with the first quarter.
    m_1.to_csv(csv_path, mode="a", header=False)

In [None]:
# Full term-document matrix: rows are documents, columns are dictionary words.
vocabulary = list(word_dict)
matrix = pd.DataFrame(matrix_arr)
matrix.columns = vocabulary
matrix.index = doc_names

In [None]:
# Write the full matrix in one go; this replaces any term-document.csv written by the cells above.
matrix.to_csv(os.path.join(result_dir, "term-document.csv"))

# Task 2

In [None]:
def tf_idf(words, matrix):
    """Compute a TF-IDF vector for one text against a term-document matrix.

    For every column ``w`` of ``matrix`` that also occurs in ``words``:

        tf  = words[w] / sum(words.values())
        df  = number of rows of ``matrix`` with a positive count in column w
        val = tf * (log10(N + 1) - log10(df + 1)),  N = number of rows

    Columns that do not occur in ``words`` get 0.0. Note that the term
    frequency denominator counts every word in ``words``, including words that
    are not columns of ``matrix``.

    Args:
        words: dict mapping a term to its count in the text being vectorised.
        matrix: pandas DataFrame of term counts (rows: documents, columns: terms).

    Returns:
        list[float]: one value per column of ``matrix``, in column order.
    """
    total_words = sum(words.values())
    total_documents = len(matrix.index)
    result = []
    for w in matrix.columns:
        if w not in words:
            result.append(0.0)
            continue
        t_f = words[w] / total_words
        # Vectorised count of documents containing the term (the builtin sum()
        # would iterate the column element by element in Python).
        d_f = int((matrix[w] > 0).sum())
        result.append(t_f * (log10(total_documents + 1) - log10(d_f + 1)))
    return result

In [None]:
import os
import re
from nltk import WordNetLemmatizer, SnowballStemmer
from nltk.corpus import stopwords
from pathlib import Path

In [None]:
def split_to_words(sentence):
    """Tokenise a sentence into a list of strings.

    Tokens are tried in this order: an ``x@y.z``-shaped address, a phone
    number shaped like ``+N-NNN-NNN-NN-NN``, or a plain run of word characters.
    """
    token_pattern = r"\w+@\w+\.\w+|\+\d{1,3}-\d{3}-\d{3}-\d{2}-\d{2}|\w+"
    return [found.group(0) for found in re.finditer(token_pattern, sentence)]

In [None]:
def split_to_sent(text):
    """Split raw text into a list of sentence strings.

    First, whatever sits between a literal ``&lt;`` and ``&gt;`` is replaced by
    a space, and so are the two entities themselves. The text is then cut:

    * at whitespace that follows ``.``, ``?`` or ``!`` and precedes a capital
      letter, unless one of the negative look-behinds applies (shapes like
      ``a.b.``, a single character plus dot after whitespace, or ``Xx.``);
    * at a newline that is followed by a capital letter or by ``" `` and is
      not preceded by ``,``, ``-`` or ``:``.
    """
    for entity_pattern in (r"(?<=&lt;).*?(?=&gt;)", r"&gt;", r"&lt;"):
        text = re.sub(entity_pattern, " ", text)

    boundary = r"(((?<!\w\.\w.)(?<!\s\w\.)(?<![A-Z][a-z]\.)(?<=\.|\?|\!)\s(?=[A-Z]))|((?<![\,\-\:])\n(?=[A-Z]|\" )))"
    pieces = re.split(boundary, text)
    # The pattern has three capture groups, so re.split interleaves three group
    # values after each text piece; every 4th element is the actual sentence.
    return pieces[::4]

In [None]:
def preprocess_text(text, by_sentences=False):
    """Turn raw text into lower-cased stems with English stopwords removed.

    The text is split with ``split_to_sent`` and ``split_to_words``. Each token
    loses one trailing ``.``, ``!``, ``?`` or ``,`` (if any) and is lower-cased.
    A token is dropped when its WordNet lemma is an English stopword; otherwise
    its Snowball stem is kept.

    Args:
        text: Raw input text.
        by_sentences: When True, return one list of stems per sentence;
            when False (default), return a single flat list of stems.

    Returns:
        list[list[str]] if ``by_sentences`` else list[str].
    """
    stemmer = SnowballStemmer("english")
    lemmatizer = WordNetLemmatizer()
    # Fetch the stopword list once per call rather than once per token, and
    # keep it as a set for O(1) membership tests.
    stop_words = set(stopwords.words("english"))
    result = []
    for s in split_to_sent(text):
        sentence = []
        for w in split_to_words(s):
            w_processed = re.sub(r"[.!?,]$", "", w).lower()
            if lemmatizer.lemmatize(w_processed) not in stop_words:
                sentence.append(stemmer.stem(w_processed))
        if by_sentences:
            result.append(sentence)
        else:
            result.extend(sentence)
    return result

In [None]:
def vectorize_tf_idf(text, matrix):
    """Return the TF-IDF vector of raw ``text`` over the columns of ``matrix``.

    The text is preprocessed into a flat token list, wrapped as a single
    "sentence" for counting, and the counts are passed on to ``tf_idf``.
    """
    return tf_idf(count_words_in_sentences([preprocess_text(text)]), matrix)

In [None]:
# Reload the term-document matrix from disk, using the first CSV column as the row index.
matrix = pd.read_csv(os.path.join(result_dir, "term-document.csv"), index_col=0)

In [None]:
# First sample text used to try out the vectorisers below.
t1 = """Arab forces carried out a terrorist attack on a US military base in Iraq"""

In [None]:
# Second sample text used to try out the vectorisers below.
t2 = """Oil prices rose due to tensions in the Middle East region"""

In [None]:
# Show only the first 100 components of the TF-IDF vector for t1.
vectorize_tf_idf(t1, matrix)[:100]

In [None]:
# Show only the first 100 components of the TF-IDF vector for t2.
vectorize_tf_idf(t2, matrix)[:100]

# Task 3

In [None]:
# Train Word2Vec on every sentence of every parsed document (documents are
# flattened into one list of token lists), then store the model next to the
# training directory.
w2v = Word2Vec(
    sentences=[tokens for doc in all_documents for tokens in doc],
    epochs=40,
)
w2v.save(os.path.join(train_dir, "..", "w2v_weights"))

In [None]:
# Sanity check of the trained vectors: list the terms the model ranks as most similar to "Iraq".
w2v.wv.most_similar("Iraq")

# Task 4

In [None]:
def cosine_similarity(a, b):
    """Return the cosine of the angle between vectors ``a`` and ``b``.

    Args:
        a, b: 1-D array-likes of equal length.

    Returns:
        dot(a, b) / (||a|| * ||b||). When either vector has zero norm the
        cosine is undefined; 0.0 is returned instead of dividing by zero
        (which would give nan and a RuntimeWarning).
    """
    denominator = np.linalg.norm(a) * np.linalg.norm(b)
    if denominator == 0:
        return 0.0
    return np.dot(a, b) / denominator

In [None]:
# Example: cosine similarity between the Word2Vec vectors of two terms.
cosine_similarity(w2v.wv["politician"], w2v.wv["Iraq"])

In [None]:
def draw_words(terms, vectors_source):
    """Scatter-plot the given terms after projecting their vectors to 2-D with PCA.

    Args:
        terms: Sequence of terms to plot; each must be a valid key of
            ``vectors_source``.
        vectors_source: Mapping-like object giving a vector for ``vectors_source[term]``.

    Returns:
        The object returned by ``sns.scatterplot``, with one text label per term.
    """
    term_vectors = [vectors_source[term] for term in terms]
    projected = PCA(n_components=2).fit_transform(term_vectors)

    vectors_2d = pd.DataFrame(projected)
    vectors_2d.index = terms
    vectors_2d.columns = ["x", "y"]

    p = sns.scatterplot(data=vectors_2d, x="x", y="y")
    # Annotate every point with the term it stands for.
    for label in vectors_2d.index:
        point = vectors_2d.loc[label]
        p.text(point.x, point.y, label)
    return p

In [None]:
# Terms to visualise; each one is looked up in the vector source passed to draw_words.
terms_to_check = ["Iraq", "time", "world", "true", "money", "oil",
                  "wrong", "human", "person", "tell", "see", "opinion", "think", "view", "religion", "muslim", "islam",]
# Plot the Word2Vec vectors of these terms in 2-D.
draw_words(terms_to_check, w2v.wv)

# Task 5

In [None]:
def transform_to_compare(vectors):
    """Reduce ``vectors`` with PCA to the dimensionality of the global ``w2v`` vectors.

    The target size is the length of the vector stored at index 0 of ``w2v.wv``.
    """
    target_dim = len(w2v.wv[0])
    reducer = PCA(n_components=target_dim)
    return reducer.fit_transform(vectors)

In [None]:
# One TF-IDF vector per dictionary term: each column name is vectorised as if it were a text.
# NOTE(review): every call walks all matrix columns, so this cell scales with the square of the vocabulary size.
terms_vectorized = [vectorize_tf_idf(i, matrix) for i in matrix.columns]

# Task 6

In [None]:
# Reduce the per-term TF-IDF vectors to the Word2Vec vector length via PCA.
to_cmp = transform_to_compare(terms_vectorized)

In [None]:
# Label each reduced TF-IDF row with its term so it can be looked up via .loc.
terms_to_compare = pd.DataFrame(to_cmp, index=matrix.columns)

In [None]:
def compare_methods(w1, w2):
    """Print the cosine similarity of two terms under Word2Vec and under reduced TF-IDF.

    Both terms must exist in the global ``w2v`` model and in the index of the
    global ``terms_to_compare`` frame.
    """
    print("Results for words", w1, "and", w2)
    w2v_score = cosine_similarity(w2v.wv[w1], w2v.wv[w2])
    print("W2V:", w2v_score)
    tfidf_score = cosine_similarity(terms_to_compare.loc[w1], terms_to_compare.loc[w2])
    print("Tf-Idf:", tfidf_score)
    print()

In [None]:
# Compare both similarity measures on a few hand-picked term pairs.
for first_term, second_term in (("time", "muslim"), ("say", "tell"), ("person", "time")):
    compare_methods(first_term, second_term)

In [None]:
# Map each dictionary term to its (unreduced) TF-IDF vector, pairing by position.
tfidf_data = dict(zip(matrix.columns, terms_vectorized))

In [None]:
# Same plot as for Word2Vec, but using the TF-IDF term vectors as the source.
draw_words(terms_to_check, tfidf_data)

# Task 7

In [None]:
def vectorize(sentences, w2v):
    """Embed a document as the mean of its sentence vectors.

    A sentence vector is the sum of the vectors of its tokens known to
    ``w2v.wv``, divided by the total number of tokens in the sentence (unknown
    tokens included). An empty sentence contributes a zero vector, and an
    empty document gives a zero vector.

    Args:
        sentences: List of sentences, each a list of tokens.
        w2v: Model exposing ``vector_size`` and ``wv`` (``has_index_for`` and
            item lookup).

    Returns:
        numpy array of length ``w2v.vector_size``.
    """
    doc_total = np.zeros(w2v.vector_size)
    for tokens in sentences:
        token_sum = np.zeros(w2v.vector_size)
        for token in tokens:
            if w2v.wv.has_index_for(token):
                token_sum += w2v.wv[token]
        if len(tokens) > 0:
            token_sum = token_sum / len(tokens)
        else:
            token_sum = np.zeros(w2v.vector_size)
        doc_total += token_sum
    if len(sentences) > 0:
        return doc_total / len(sentences)
    return np.zeros(w2v.vector_size)

In [None]:
# Show t1 preprocessed sentence by sentence (list of stem lists).
preprocess_text(t1, True)

In [None]:
# Embed t1 with the Word2Vec-based document vectoriser.
vectorize(preprocess_text(t1, True), w2v)

# Task 8

In [None]:
# Reload the saved model and embed every training document,
# keyed by "<topic>/<filename>" (joined with the OS path separator).
w2v = Word2Vec.load(os.path.join(train_dir, "..", "w2v_weights"))
vectorized_documents = {}
for t in topics:
    workdir = os.path.join(train_dir, t)
    for filename in os.listdir(workdir):
        doc_key = os.path.join(t, filename)
        doc_sentences = get_stems_processed(os.path.join(workdir, filename))
        vectorized_documents[doc_key] = vectorize(doc_sentences, w2v)

In [None]:
# One line per document: the key with ".tsv" removed, then the vector
# components, all tab-separated.
train_embeddings_path = os.path.join(train_dir, "..", "train_embeddings.tsv")
with open(train_embeddings_path, "w") as out:
    for doc_key, doc_vector in vectorized_documents.items():
        print(doc_key.replace(".tsv", ""), *doc_vector, sep="\t", file=out)

In [None]:
# Embed every test document with the same model; the topic list comes from
# the training directory, so each topic must also exist under test_dir.
test_dir = os.path.realpath("../assets/annotated-corpus/test")
vectorized_documents = {}
for t in topics:
    workdir = os.path.join(test_dir, t)
    for filename in os.listdir(workdir):
        doc_key = os.path.join(t, filename)
        doc_sentences = get_stems_processed(os.path.join(workdir, filename))
        vectorized_documents[doc_key] = vectorize(doc_sentences, w2v)

In [None]:
# One line per test document: the key with ".tsv" removed, then the vector
# components, all tab-separated.
test_embeddings_path = os.path.join(train_dir, "..", "test_embeddings.tsv")
with open(test_embeddings_path, "w") as out:
    for doc_key, doc_vector in vectorized_documents.items():
        print(doc_key.replace(".tsv", ""), *doc_vector, sep="\t", file=out)