### Here we train Word2Vec on AG News text data, then use the averaged word vectors to classify the labelled articles in nyt.csv

In [1]:
import pandas as pd
import numpy as np
import nltk
nltk.download("punkt")
import warnings
from tqdm import tqdm
from nltk.tokenize import word_tokenize
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score
warnings.filterwarnings("ignore")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\86183\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# Load the AG News texts; the "text" column is the corpus Word2Vec is trained on.
ag = pd.read_csv("../HomeWork1/ag.csv")
print(ag.shape)
# Bare trailing expression: the notebook renders the first rows as the cell output.
ag.head()

(90000, 1)


Unnamed: 0,text
0,wall st. bears claw back into the black (reute...
1,carlyle looks toward commercial aerospace (reu...
2,oil and economy cloud stocks' outlook (reuters...
3,iraq halts oil exports from main southern pipe...
4,"oil prices soar to all-time record, posing new..."


In [3]:
# Lower-case each document before tokenizing so the training vocabulary agrees
# with get_text_vector, which lower-cases its input before looking words up.
# Without this, any cased token in the corpus could never be matched later.
sentences = [word_tokenize(text.lower()) for text in ag["text"].to_list()]

# Train 100-dimensional embeddings on the tokenized corpus.
# NOTE(review): per the gensim docs, training with workers > 1 is not exactly
# reproducible between runs, so the printed vector will vary slightly.
word2vec_model = Word2Vec(sentences=sentences, vector_size=100, workers=4)
print(word2vec_model)
print(word2vec_model.wv["the"])

Word2Vec<vocab=25389, vector_size=100, alpha=0.025>
[-1.0185182   1.0597867  -0.5869825   2.1043217  -0.32311937  0.7042859
  1.0434008  -1.2307667   1.6345253  -0.8659341   0.30565715  0.79652405
  1.0993587  -0.7523909  -2.159914    0.85680795 -0.0672893   1.5678796
  1.3625771  -0.4529641   1.1533221   0.7091411   0.7207622   0.13700038
  0.22972478  0.32556632 -1.4616694   1.3461691  -0.6263655   1.9706436
  1.074446   -0.18022203 -0.30738994 -0.43006128  0.22188076 -0.22365646
  0.1891554   0.81460845  0.12415149  1.7362758   0.0152154  -0.12946442
 -0.59924644  0.7598288  -0.4991222   0.97637343 -1.7476748  -0.6420553
 -0.6739624  -0.3259385  -0.17657325 -1.0000002  -1.2420468   0.8297787
  0.09147941  1.4425206  -0.58668214  0.61572415 -2.1666646  -0.1067452
 -0.47582078  0.7702358  -0.17053765 -1.8544205   2.0656345   1.1561612
  2.6926045   0.74130136  1.8225517   1.5557338  -0.88574797 -2.1144047
 -0.2084011   2.2124333   0.403437   -0.874645    0.6349529  -0.63600904
  0.366

In [4]:
def get_text_vector(text, model=word2vec_model):
    """Embed a text as the mean of its in-vocabulary word vectors.

    The text is lower-cased and tokenized with ``word_tokenize``; tokens that
    are not present in ``model.wv`` are skipped.

    Parameters
    ----------
    text : str
        Raw text to embed.
    model : gensim.models.Word2Vec
        Trained model whose ``wv`` supplies the word vectors. The default is
        bound to ``word2vec_model`` at the time this function is defined.

    Returns
    -------
    numpy.ndarray
        1-D array of length ``model.wv.vector_size``. When no token of
        ``text`` is in the vocabulary (including empty text), an all-zero
        vector of that length is returned instead of NaN.
    """
    keyed_vectors = model.wv
    # The whole text is lower-cased once, so tokens need no further lower-casing.
    word_vectors = [
        keyed_vectors[word]
        for word in word_tokenize(text.lower())
        if word in keyed_vectors.key_to_index
    ]
    if not word_vectors:
        # np.mean([]) would yield a scalar NaN, which cannot be stacked with the
        # other rows into a 2-D feature matrix and would poison the classifier.
        return np.zeros(keyed_vectors.vector_size, dtype=keyed_vectors.vectors.dtype)
    return np.mean(word_vectors, axis=0)

# Smoke-test the embedding function on a sample sentence and print the resulting vector.
example_output = get_text_vector(text="The quick brown fox jumps over the lazy dog")
print(example_output)

[-6.50715053e-01  7.26831496e-01  2.08814610e-02  2.93653667e-01
 -1.62549078e-01 -2.08014667e-01  2.83165038e-01 -3.34622294e-01
  7.21142292e-01 -1.38007373e-01  4.31619704e-01 -3.42079908e-01
  4.16156083e-01 -4.75730211e-01 -9.04971838e-01 -2.41975524e-02
  2.08167404e-01  3.05923730e-01  3.20417225e-01 -3.48953992e-01
  5.94729066e-01  1.01348072e-01  2.42377728e-01 -1.10951923e-02
 -1.35424674e-01  3.08533877e-01 -1.89357042e-01  4.02220935e-01
  1.97609831e-02  2.81636745e-01  3.65686208e-01 -1.54718459e-01
 -2.10146829e-01 -5.04583895e-01 -6.54898211e-02  3.53946686e-01
  1.90306738e-01  5.22515297e-01 -7.29097426e-02  3.98994118e-01
 -1.51943974e-02  1.83699563e-01 -2.95269519e-01  9.48270597e-03
  1.82910413e-01 -2.44965866e-01 -8.83262277e-01 -5.85051537e-01
 -2.57507801e-01 -3.11714143e-01 -3.59205991e-01 -2.97075927e-01
 -1.66084319e-01  4.74583685e-01  3.07300270e-01  6.09182239e-01
 -3.00029427e-01  1.16512865e-01 -6.37081981e-01  8.87649506e-02
 -3.05698425e-01 -1.41662

In [5]:
# Load the labelled articles from nyt.csv; later cells read its "text" and "label" columns.
df = pd.read_csv("../HomeWork1/nyt.csv")
print(df.shape)
# Bare trailing expression: the notebook renders the first rows as the cell output.
df.head()

(11519, 2)


Unnamed: 0,text,label
0,(reuters) - carlos tevez sealed his move to ju...,sports
1,if professional pride and strong defiance can ...,sports
2,"palermo, sicily — roberta vinci beat top-seede...",sports
3,spain's big two soccer teams face a pair of it...,sports
4,the argentine soccer club san lorenzo complete...,sports


In [6]:
def split_df(df, train_size=0.8, val_size=0.1, test_size=0.1, random_state=42):
    """Split ``df`` into train, validation and test parts.

    The frame is first divided into a training part and a held-out remainder
    of fraction ``1 - train_size``; the remainder is then divided between
    validation and test. ``val_size`` and ``test_size`` are used only as a
    ratio for that second split (``test_size / (val_size + test_size)``).

    Parameters
    ----------
    df : data to split (passed straight to ``train_test_split``).
    train_size, val_size, test_size : float
        Intended fractions of the three parts.
    random_state : int
        Seed used for both shuffled splits.

    Returns
    -------
    tuple
        ``(train_df, val_df, test_df)``.
    """
    holdout_fraction = 1 - train_size
    train_part, holdout_part = train_test_split(
        df, test_size=holdout_fraction, random_state=random_state
    )

    # Share of the held-out remainder that goes to the test part.
    test_share = test_size / (val_size + test_size)
    val_part, test_part = train_test_split(
        holdout_part, test_size=test_share, random_state=random_state
    )
    return train_part, val_part, test_part

In [7]:
# Partition the labelled frame using split_df's default ratios, then report each part's shape.
train_df, val_df, test_df = split_df(df)
print(*(part.shape for part in (train_df, val_df, test_df)))

(9215, 2) (1152, 2) (1152, 2)


In [8]:
# Embed every split with get_text_vector: one row per document, processed in
# train -> val -> test order with a progress bar for each split.
X_train, X_val, X_test = (
    np.array([get_text_vector(doc) for doc in tqdm(part["text"].to_list(), total=len(part))])
    for part in (train_df, val_df, test_df)
)
print(X_train.shape, X_val.shape, X_test.shape)

100%|█████████████████████████████████████████████████████████████████████████████| 9215/9215 [00:51<00:00, 180.64it/s]
100%|█████████████████████████████████████████████████████████████████████████████| 1152/1152 [00:05<00:00, 200.93it/s]
100%|█████████████████████████████████████████████████████████████████████████████| 1152/1152 [00:05<00:00, 198.56it/s]

(9215, 100) (1152, 100) (1152, 100)





In [9]:
# Fit a logistic-regression classifier on the training document vectors.
# NOTE(review): max_iter=2000 is presumably set to give the solver room to
# converge; warnings are silenced in the imports cell, so confirm it is needed.
model = LogisticRegression(max_iter=2000)
model.fit(X_train, train_df["label"])

In [10]:
# Score the fitted classifier on the validation split.
val_preds = model.predict(X_val)
accuracy = accuracy_score(val_df["label"], val_preds)
macro_f1, micro_f1 = (
    f1_score(val_df["label"], val_preds, average=averaging)
    for averaging in ("macro", "micro")
)

# One print call with a newline separator emits the same four lines as before.
print(
    "The results on the validation set are:",
    f"Accuracy Score: {accuracy}",
    f"Macro F1-Score: {macro_f1}",
    f"Micro F1-Score: {micro_f1}",
    sep="\n",
)

The results on the validation set are:
Accuracy Score: 0.9730902777777778
Macro F1-Score: 0.9352031637469408
Micro F1-Score: 0.9730902777777778


In [11]:
# Score the fitted classifier on the held-out test split.
# Note: this rebinds accuracy / macro_f1 / micro_f1, replacing the validation values.
test_preds = model.predict(X_test)
accuracy = accuracy_score(test_df["label"], test_preds)
macro_f1, micro_f1 = (
    f1_score(test_df["label"], test_preds, average=averaging)
    for averaging in ("macro", "micro")
)

# One print call with a newline separator emits the same four lines as before.
print(
    "The results on the test set are:",
    f"Accuracy Score: {accuracy}",
    f"Macro F1-Score: {macro_f1}",
    f"Micro F1-Score: {micro_f1}",
    sep="\n",
)

The results on the test set are:
Accuracy Score: 0.9713541666666666
Macro F1-Score: 0.9288212871380201
Micro F1-Score: 0.9713541666666666
