In [1]:
import pandas as pd
import numpy as np
import nltk
nltk.download("punkt")
import warnings
from tqdm import tqdm
from nltk.tokenize import word_tokenize
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score
warnings.filterwarnings("ignore")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\86183\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
df = pd.read_csv("../HomeWork1/nyt.csv")
print(df.shape)
df.head()

(11519, 2)


Unnamed: 0,text,label
0,(reuters) - carlos tevez sealed his move to ju...,sports
1,if professional pride and strong defiance can ...,sports
2,"palermo, sicily — roberta vinci beat top-seede...",sports
3,spain's big two soccer teams face a pair of it...,sports
4,the argentine soccer club san lorenzo complete...,sports


In [3]:
def split_df(df, train_size=0.8, val_size=0.1, test_size=0.1, random_state=42):
    train_df, temp_df = train_test_split(df, test_size=(1 - train_size), random_state=random_state)
    val_df, test_df = train_test_split(temp_df, test_size=test_size / (val_size + test_size), random_state=random_state)
    return train_df, val_df, test_df

In [4]:
train_df, val_df, test_df = split_df(df)
print(train_df.shape, val_df.shape, test_df.shape)

(9215, 2) (1152, 2) (1152, 2)


In [5]:
sentences = [word_tokenize(text) for text in df["text"].to_list()]
word2vec_model = Word2Vec(sentences=sentences, vector_size=100, workers=4)
print(word2vec_model)
print(word2vec_model.wv["the"])

Word2Vec<vocab=35508, vector_size=100, alpha=0.025>
[-1.0416162   0.18334866  0.12383539 -0.48050982 -0.7026071  -1.1532347
 -1.2419171   0.48496643  3.1954787  -0.4214874  -0.20388718  0.43471718
  0.28075793  0.9693965   0.86763453  0.3965662   0.06454255  1.7067114
 -0.13880144  0.5034888  -0.6254529  -0.04036564  0.19526967 -1.3510127
  1.5545214  -1.6280677   1.1216333   0.6142329  -1.2784927   1.3445048
 -1.5226399   0.42214915 -1.2378969  -0.39638892 -1.060673   -1.2770989
 -0.623593   -1.2700659  -0.60369855 -0.47306132 -0.7531487   1.4689833
 -1.9780884  -1.253274   -1.2021307  -0.485089   -0.6596709   1.1529626
 -0.3232115   0.5829019  -1.3443875   0.3394023  -1.2440597   0.29955727
  0.6060392   1.990405   -1.1163043  -0.01249888  0.06906155  0.12027698
 -0.8558933   0.01860613 -0.9359329   0.8235835   0.07558068  0.30800682
 -0.6168606   0.52370495  1.7437721  -0.58421487 -2.0887468   0.93057084
  0.01498543 -1.842769    0.3629927   1.4401654  -0.70933646  0.11659029
  0.03

In [6]:
print(word_tokenize("My name is Harold, his name is Eric.".lower()))
print("My name is Harold, his name is Eric.".lower().split())

['my', 'name', 'is', 'harold', ',', 'his', 'name', 'is', 'eric', '.']
['my', 'name', 'is', 'harold,', 'his', 'name', 'is', 'eric.']


In [7]:
def get_text_vector(text, model=word2vec_model):
    words = word_tokenize(text.lower())
    word_vectors = []
    for word in words:
        if word.lower() in model.wv.key_to_index:
            word_vectors.append(model.wv[word.lower()])
        else:
            continue
    return np.mean(word_vectors, axis=0)

example_output = get_text_vector(text="The quick brown fox jumps over the lazy dog")
print(example_output)

[-0.517228    0.56496876 -0.0986323   0.14618416 -0.19103324 -0.39312503
 -0.8727944   0.16834217  0.56237614  0.05324896  0.01504083  0.01773513
  0.24637514  0.4778852   0.06364122  0.19974099 -0.2374944   0.6111561
  0.54529995 -0.26977777 -0.20932794  0.13732143 -0.07225199 -0.747931
  0.3158088  -0.0655231  -0.24442571 -0.08174151 -0.13362762 -0.01768969
 -0.42705816  0.46687713 -0.3432363  -0.00335126 -0.04861318 -0.08115449
 -0.5698371  -0.30185637 -0.25638932 -0.186534   -0.4094302   0.7092454
 -0.71253896 -0.06514558 -0.05280926 -0.23434152 -0.24078602 -0.26695204
 -0.5088404   0.2785244  -0.10377121  0.2632388  -0.710811    0.24424052
  0.09027201  0.6791088  -0.57118154 -0.18673633 -0.06851691 -0.10005376
  0.11051986  0.24620014 -0.28959018  0.62808764  0.12944955 -0.10393164
  0.23868155  0.40514904  0.24102095  0.33251008 -0.77264965  0.31127632
 -0.14368619 -0.39738703 -0.2527594   0.816254    0.23013261 -0.32091644
  0.21725366  0.28445944 -0.44410545  0.3894466   0.786

In [8]:
X_train = np.array([get_text_vector(text) for text in tqdm(train_df["text"].to_list(), total=len(train_df))])
X_val = np.array([get_text_vector(text) for text in tqdm(val_df["text"].to_list(), total=len(val_df))])
X_test = np.array([get_text_vector(text) for text in tqdm(test_df["text"].to_list(), total=len(test_df))])
print(X_train.shape, X_val.shape, X_test.shape)

100%|██████████| 9215/9215 [00:45<00:00, 202.46it/s]
100%|██████████| 1152/1152 [00:05<00:00, 207.39it/s]
100%|██████████| 1152/1152 [00:05<00:00, 217.26it/s]

(9215, 100) (1152, 100) (1152, 100)





In [9]:
model = LogisticRegression(max_iter=2000)
model.fit(X_train, train_df["label"])

In [10]:
test_preds = model.predict(X_test)
accuracy = accuracy_score(test_df["label"], test_preds)
macro_f1 = f1_score(test_df["label"], test_preds, average="macro")
micro_f1 = f1_score(test_df["label"], test_preds, average="micro")

print(f"Accuracy Score: {accuracy}")
print(f"Macro F1-Score: {macro_f1}")
print(f"Micro F1-Score: {micro_f1}")

Accuracy Score: 0.9791666666666666
Macro F1-Score: 0.9449580503303382
Micro F1-Score: 0.9791666666666666
