In [1]:
import pandas as pd
import numpy as np
import warnings
from tqdm import tqdm
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score
warnings.filterwarnings("ignore")

In [None]:
# Peek at the first record of the embeddings file: print the raw line and
# the number of whitespace-separated fields it contains.
with open("./glove.6B.100d.txt", "r", encoding="utf-8") as f:
    line = next(f, None)  # None when the file is empty; nothing is printed then
    if line is not None:
        print(line)
        print(len(line.split()))

In [None]:
def word_to_vec(file_path):
    """Load whitespace-separated text embeddings into a dictionary.

    Every non-blank line is read as a token followed by its vector
    components. Blank lines (for example a trailing empty line) are skipped
    instead of raising ``IndexError``.

    Args:
        file_path: Path of the embeddings text file; it is opened as UTF-8.

    Returns:
        dict mapping each token (str) to a 1-D ``np.float32`` array. If a
        token occurs more than once, the last occurrence wins.

    Raises:
        ValueError: If a vector component cannot be parsed as a float.
    """
    embeddings = {}
    with open(file_path, "r", encoding="utf-8") as f:
        for line in tqdm(f):
            # Split once per line rather than once for the token and again
            # for the vector.
            fields = line.split()
            if not fields:
                continue
            embeddings[fields[0]] = np.asarray(fields[1:], dtype=np.float32)
    return embeddings

# Parse the embeddings file once; later cells reuse this token -> vector table.
word_to_vec_dict = word_to_vec("./glove.6B.100d.txt")

In [None]:
def get_text_vector(text, word_dict=word_to_vec_dict):
    """Embed a text as the mean of its in-vocabulary word vectors.

    The text is lowercased and tokenized with ``word_tokenize``; tokens that
    are not keys of ``word_dict`` are ignored.

    Args:
        text: String to embed.
        word_dict: Mapping from token to 1-D embedding vector. The default is
            the module-level ``word_to_vec_dict`` object that existed when
            this function was defined.

    Returns:
        1-D array holding the element-wise mean of the matched vectors. When
        no token is found in ``word_dict`` (empty text or only unknown
        words), a zero vector shaped like the dictionary's vectors is
        returned, so every text yields a vector of the same length.
    """
    # The text is lowercased before tokenizing, so the tokens need no
    # further per-token lowercasing.
    tokens = word_tokenize(text.lower())
    word_vectors = [word_dict[token] for token in tokens if token in word_dict]

    if not word_vectors:
        # np.mean([], axis=0) would return a NaN scalar here, which cannot be
        # stacked with real embeddings into a 2-D feature matrix.
        reference = next(iter(word_dict.values()), None)
        if reference is None:
            # Empty vocabulary: there is no dimensionality to mirror.
            return np.zeros(0, dtype=np.float32)
        return np.zeros_like(reference)

    return np.mean(word_vectors, axis=0)

# Smoke test: embed one short sentence and show the resulting vector.
example_output = get_text_vector("The quick brown fox jumps over the lazy dog")
print(example_output)

In [None]:
# Load the dataset; later cells read its "text" and "label" columns.
df = pd.read_csv(filepath_or_buffer="../HomeWork1/nyt.csv")
print(df.shape)
# Rich preview of the first five rows as the cell output.
df.head(n=5)

In [6]:
def split_df(df, train_size=0.8, val_size=0.1, test_size=0.1, random_state=42):
    """Randomly split ``df`` into train, validation and test subsets.

    The split is done with two chained ``train_test_split`` calls: the first
    holds out ``1 - train_size`` of the rows, the second divides that
    hold-out between validation and test in the ratio
    ``val_size : test_size``.

    Args:
        df: Data to split (passed straight to ``train_test_split``).
        train_size: Fraction of rows for the training subset.
        val_size: Fraction of rows for the validation subset.
        test_size: Fraction of rows for the test subset.
        random_state: Seed forwarded to both ``train_test_split`` calls.

    Returns:
        Tuple ``(train_df, val_df, test_df)``.

    Raises:
        ValueError: If any fraction is not strictly between 0 and 1, or if
            the three fractions do not sum to 1. Without this check
            ``val_size`` and ``test_size`` only act as a ratio, so
            inconsistent values would silently give other proportions than
            the ones requested.
    """
    fractions = {"train_size": train_size, "val_size": val_size, "test_size": test_size}
    for name, value in fractions.items():
        if not 0 < value < 1:
            raise ValueError(f"{name} must be strictly between 0 and 1, got {value!r}")

    total = train_size + val_size + test_size
    if abs(total - 1.0) > 1e-8:
        raise ValueError(
            f"train_size + val_size + test_size must equal 1, got {total!r}"
        )

    # First cut: training rows versus everything held out.
    train_df, holdout_df = train_test_split(
        df, test_size=(1 - train_size), random_state=random_state
    )
    # Second cut: share the hold-out between validation and test.
    val_df, test_df = train_test_split(
        holdout_df, test_size=test_size / (val_size + test_size), random_state=random_state
    )
    return train_df, val_df, test_df

In [None]:
# Materialise the three partitions using split_df's default proportions and seed.
train_df, val_df, test_df = split_df(df)
print(*(part.shape for part in (train_df, val_df, test_df)))

In [None]:
def _embed_column(frame):
    """Stack the embedding of every entry in ``frame["text"]`` into one array."""
    texts = frame["text"].to_list()
    return np.array(
        [get_text_vector(text, word_to_vec_dict) for text in tqdm(texts, total=len(frame))]
    )


# One feature matrix per partition, in the same row order as the partition.
X_train = _embed_column(train_df)
X_val = _embed_column(val_df)
X_test = _embed_column(test_df)
print(X_train.shape, X_val.shape, X_test.shape)

In [None]:
# Fit a logistic-regression classifier on the training embeddings.
# max_iter is raised to 2000 (scikit-learn's default is 100), presumably to
# give the solver enough iterations to converge.
model = LogisticRegression(max_iter=2000)
model.fit(X=X_train, y=train_df["label"])

In [None]:
# Score the fitted model on the validation partition.
val_preds = model.predict(X_val)
accuracy = accuracy_score(val_df["label"], val_preds)
macro_f1, micro_f1 = (
    f1_score(val_df["label"], val_preds, average=averaging)
    for averaging in ("macro", "micro")
)

print(
    "The results on the validation set are:",
    f"Accuracy Score: {accuracy}",
    f"Macro F1-Score: {macro_f1}",
    f"Micro F1-Score: {micro_f1}",
    sep="\n",
)

In [None]:
# Score the fitted model on the held-out test partition.
# Note: this rebinds accuracy / macro_f1 / micro_f1 from the validation cell.
test_preds = model.predict(X_test)
accuracy = accuracy_score(test_df["label"], test_preds)
macro_f1, micro_f1 = (
    f1_score(test_df["label"], test_preds, average=averaging)
    for averaging in ("macro", "micro")
)

print(
    "The results on the test set are:",
    f"Accuracy Score: {accuracy}",
    f"Macro F1-Score: {macro_f1}",
    f"Micro F1-Score: {micro_f1}",
    sep="\n",
)