### Here we train Word2Vec on NYT text data

In [1]:
# Standard library
import warnings

# Third-party
import nltk
import numpy as np
import pandas as pd
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split
from tqdm import tqdm

# Download the NLTK "punkt" resource (no-op when it is already up to date).
nltk.download("punkt")

# Silence every warning category for the remainder of the notebook.
warnings.filterwarnings("ignore")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\86183\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# Load the NYT dataset; the path is resolved relative to the working directory.
nyt_csv_path = "../HomeWork1/nyt.csv"
df = pd.read_csv(nyt_csv_path)

# Report (rows, columns), then show the first rows as the cell's output.
print(df.shape)
df.head()

(11519, 2)


Unnamed: 0,text,label
0,(reuters) - carlos tevez sealed his move to ju...,sports
1,if professional pride and strong defiance can ...,sports
2,"palermo, sicily — roberta vinci beat top-seede...",sports
3,spain's big two soccer teams face a pair of it...,sports
4,the argentine soccer club san lorenzo complete...,sports


In [3]:
def split_df(df, train_size=0.8, val_size=0.1, test_size=0.1, random_state=42):
    """Split ``df`` into train, validation and test partitions.

    The split is done with two calls to ``train_test_split``: the first holds
    out ``1 - train_size`` of the rows, the second divides that held-out part
    between validation and test in the ratio ``val_size : test_size``.

    Args:
        df: Data to split (anything ``train_test_split`` accepts).
        train_size: Fraction of rows kept for training.
        val_size: Relative share of the held-out rows used for validation.
        test_size: Relative share of the held-out rows used for testing.
        random_state: Seed forwarded to both ``train_test_split`` calls.

    Returns:
        Tuple ``(train_df, val_df, test_df)``.
    """
    holdout_fraction = 1 - train_size
    test_share_of_holdout = test_size / (val_size + test_size)

    train_part, holdout_part = train_test_split(
        df, test_size=holdout_fraction, random_state=random_state
    )
    val_part, test_part = train_test_split(
        holdout_part, test_size=test_share_of_holdout, random_state=random_state
    )
    return train_part, val_part, test_part

In [4]:
# Apply the split with its default fractions and seed, then report each partition's shape.
splits = split_df(df)
train_df, val_df, test_df = splits
print(*(part.shape for part in splits))

(9215, 2) (1152, 2) (1152, 2)


In [5]:
# Tokenize every article in lowercase so the training vocabulary uses the same
# casing as the lookups in get_text_vector, which lowercases its input.
# NOTE(review): the corpus is the full ``df``, so validation/test articles also
# shape the embeddings; switch to ``train_df`` if a strict hold-out is required.
sentences = [word_tokenize(text.lower()) for text in df["text"].to_list()]

# With workers=4 training runs in several threads, so the learned vectors can
# differ slightly from run to run (see the gensim notes on reproducibility).
word2vec_model = Word2Vec(sentences=sentences, vector_size=100, workers=4)
print(word2vec_model)
print(word2vec_model.wv["the"])

Word2Vec<vocab=35508, vector_size=100, alpha=0.025>
[-0.98873585  0.49433428  0.40309918 -1.2855661  -0.34429535 -1.2613112
 -0.59216666  0.71640193  1.2102641   0.07631939 -1.0763141   1.4029491
  0.08356473  0.41038522 -0.2248113  -0.15321697 -1.6010284   1.478717
  0.43298462  0.8611422  -0.5972456  -1.0075772   0.7550502  -1.7212527
 -0.56475    -0.713476    0.11836907  0.28333202 -0.19581261 -1.1341335
 -2.315132   -0.4860049  -0.1890143  -0.5286839  -0.84020776 -1.7614639
 -0.4680763  -0.46225917 -1.0978154   0.45884764 -1.4882401   2.3127851
 -1.6026542  -1.3228048  -0.7439097  -1.3139895  -2.1649127   0.7282601
 -1.779592    0.82029796 -1.7768297   0.2617958  -1.6438801  -0.2783948
  0.13875754  0.555435   -0.10405042  0.10079765  1.122897    1.0319557
 -2.394965    1.2388946  -0.4605088   0.8228644  -1.0701007  -0.1431481
  0.589052    0.9460849   0.6271815  -0.4243667   0.20020293  0.39818084
 -1.0054387  -1.2314249   0.78806126  0.18573251  0.48054144 -0.6443031
  0.636108  

In [6]:
# Compare NLTK tokenization with plain whitespace splitting on one sentence:
# word_tokenize separates punctuation, str.split leaves it attached to the word.
sample_sentence = "My name is Harold, his name is Eric.".lower()
print(word_tokenize(sample_sentence))
print(sample_sentence.split())

['my', 'name', 'is', 'harold', ',', 'his', 'name', 'is', 'eric', '.']
['my', 'name', 'is', 'harold,', 'his', 'name', 'is', 'eric.']


In [7]:
def get_text_vector(text, model=None):
    """Embed ``text`` as the mean of its in-vocabulary word vectors.

    Args:
        text: Raw document string; it is lowercased and tokenized with
            ``word_tokenize``.
        model: Trained Word2Vec-style model exposing ``wv``. When omitted, the
            notebook-level ``word2vec_model`` is looked up at call time, so a
            retrained model is picked up without redefining this function.

    Returns:
        1-D array of length ``model.wv.vector_size``. If no token of ``text``
        is in the vocabulary, a zero vector is returned instead of NaN.
    """
    if model is None:
        model = word2vec_model
    keyed_vectors = model.wv

    # The text is lowercased once here; tokens need no further case folding.
    tokens = word_tokenize(text.lower())
    word_vectors = [
        keyed_vectors[token] for token in tokens if token in keyed_vectors.key_to_index
    ]

    if not word_vectors:
        # np.mean([]) would give a scalar NaN, which cannot be stacked with the
        # other document vectors; fall back to a zero vector of matching length.
        return np.zeros(keyed_vectors.vector_size, dtype=np.float32)
    return np.mean(word_vectors, axis=0)

# Try the embedding helper on a short sentence, using its default model.
pangram = "The quick brown fox jumps over the lazy dog"
example_output = get_text_vector(text=pangram)
print(example_output)

[-0.31371582  0.68195236 -0.05648704 -0.45202738 -0.38125566 -0.09965655
 -0.58942425  0.3481      0.18919852 -0.14432994 -0.18112245  0.48153818
  0.1226997   0.3949361  -0.13666695  0.2023481  -0.52558947  0.29105726
  0.73829573 -0.07363337  0.00397359 -0.22169575  0.1694851  -0.9146741
 -0.10271254  0.10780057 -0.34557164  0.1663874  -0.08909501 -0.48319256
 -0.28389338 -0.07607589 -0.04169292  0.20866835  0.05320869 -0.2786562
 -0.37283635 -0.21385774 -0.59960103 -0.02219326 -0.36592335  0.78262734
 -0.7231324  -0.33232576 -0.0500223  -0.02275046 -0.2770918   0.03521188
 -0.88551724 -0.00911321 -0.10779153 -0.06084993 -0.5034465  -0.39745152
  0.2779511   0.01099059 -0.38512647 -0.14703695  0.3232696   0.34821028
 -0.75181365  0.7619026  -0.2039565   0.6900088  -0.3080208  -0.10369025
  0.64018387  0.4010274  -0.26243737  0.40674508  0.01597267  0.03060122
 -0.4100045  -0.27370414  0.04224857  0.5678412   0.34098825 -0.37036216
  0.02192982 -0.40252602  0.29577386  0.00398679  0.3

In [8]:
def _embed_split(frame):
    """Return an array holding the document vector of every text in ``frame``."""
    texts = frame["text"].to_list()
    return np.array([get_text_vector(text) for text in tqdm(texts, total=len(frame))])


# One embedding per article, in row order, for each of the three partitions.
X_train, X_val, X_test = (_embed_split(part) for part in (train_df, val_df, test_df))
print(X_train.shape, X_val.shape, X_test.shape)

100%|█████████████████████████████████████████████████████████████████████████████| 9215/9215 [00:43<00:00, 210.11it/s]
100%|█████████████████████████████████████████████████████████████████████████████| 1152/1152 [00:05<00:00, 213.17it/s]
100%|█████████████████████████████████████████████████████████████████████████████| 1152/1152 [00:05<00:00, 206.58it/s]

(9215, 100) (1152, 100) (1152, 100)





In [9]:
# Fit a logistic-regression classifier on the averaged-embedding features,
# allowing the solver up to 2000 iterations.
model = LogisticRegression(max_iter=2000)
y_train = train_df["label"]
model.fit(X_train, y_train)

In [10]:
# Score the classifier on the validation split.
y_val = val_df["label"]
val_preds = model.predict(X_val)
accuracy = accuracy_score(y_val, val_preds)
macro_f1, micro_f1 = (
    f1_score(y_val, val_preds, average=averaging) for averaging in ("macro", "micro")
)

print(
    "The results on the validation set are:",
    f"Accuracy Score: {accuracy}",
    f"Macro F1-Score: {macro_f1}",
    f"Micro F1-Score: {micro_f1}",
    sep="\n",
)

The results on the validation set are:
Accuracy Score: 0.9774305555555556
Macro F1-Score: 0.9441482885197297
Micro F1-Score: 0.9774305555555556


In [11]:
# Score the classifier on the held-out test split.
y_test = test_df["label"]
test_preds = model.predict(X_test)
accuracy = accuracy_score(y_test, test_preds)
macro_f1, micro_f1 = (
    f1_score(y_test, test_preds, average=averaging) for averaging in ("macro", "micro")
)

print(
    "The results on the test set are:",
    f"Accuracy Score: {accuracy}",
    f"Macro F1-Score: {macro_f1}",
    f"Micro F1-Score: {micro_f1}",
    sep="\n",
)

The results on the test set are:
Accuracy Score: 0.9765625
Macro F1-Score: 0.938796461403518
Micro F1-Score: 0.9765625


### Comparing the three groups of results obtained with the trained embeddings against the naive vectorizer results, we find that the naive vectorizers perform better here.

**What are the disadvantages of averaging word vectors for the document representation? Describe an idea to overcome this. The document vectors should be formed using word vectors.**

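(i) Averaging word vectors does not consider the importance of different words in a single text. For example, the word 'the' is usually less important than the word 'cat', yet both contribute equally to the average. We can assign a weight to each word vector and take a weighted average instead, for example using TF-IDF weights, so that words that are frequent in the text but rare across the corpus count more than common function words.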

(ii) Averaging word vectors does not consider the position of different words in the text. For example, the sentences 'The cat is on the mat' and 'The mat is on the cat' contain exactly the same words, so they receive the same averaged vector even though their meanings differ. We can add a positional embedding to each word vector, based on the word's position, before combining the vectors.