In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import warnings
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score
# Install a catch-all "ignore" filter: every warning category from every module
# is silenced for the rest of the session (this includes any warnings the
# libraries imported above would otherwise emit).
warnings.simplefilter("ignore")

In [None]:
# Read the dataset from the sibling homework folder (path is relative to the
# notebook's working directory).
df = pd.read_csv(filepath_or_buffer="../HomeWork1/nyt.csv")

# Report (rows, columns), then show the first five rows as the cell output.
print(df.shape)
df.head(5)

In [None]:
# Horizontal bar chart of the number of rows per value of the "label" column,
# drawn on a small 5x3 inch figure.
plt.figure(figsize=(5, 3))
df["label"].value_counts().plot.barh(color="green")
plt.show()

In [4]:
def split_df(df, train_size=0.8, val_size=0.1, test_size=0.1, random_state=42):
    """Split ``df`` into train, validation and test partitions.

    The split is done with two calls to ``train_test_split``: the first
    separates the training rows from a hold-out set, the second divides that
    hold-out set between validation and test.

    Args:
        df: The data to split; handed unchanged to ``train_test_split``.
        train_size: Share of ``df`` that goes to the training partition.
        val_size: Weight of the validation partition inside the hold-out set.
        test_size: Weight of the test partition inside the hold-out set.
        random_state: Seed passed to both ``train_test_split`` calls.

    Returns:
        A ``(train_df, val_df, test_df)`` tuple.

    Note:
        ``val_size`` and ``test_size`` are only used relative to each other
        (``test_size / (val_size + test_size)``). The hold-out set is always
        whatever ``train_size`` leaves over, even if the three values do not
        add up to 1.
    """
    # Pass train_size straight through instead of test_size=1 - train_size.
    # The subtraction picks up float error (1 - 0.7 == 0.30000000000000004) and
    # scikit-learn rounds a fractional test count up, so ten rows used to be
    # split 6/4 rather than 7/3. The default 0.8 split is unaffected.
    train_df, holdout_df = train_test_split(
        df, train_size=train_size, random_state=random_state
    )

    # Share of the hold-out rows that belongs to the test partition.
    holdout_test_share = test_size / (val_size + test_size)
    val_df, test_df = train_test_split(
        holdout_df, test_size=holdout_test_share, random_state=random_state
    )
    return train_df, val_df, test_df

In [None]:
# Partition the full frame using split_df's default arguments and report the
# shape of each partition on one line.
(train_df, val_df, test_df) = split_df(df)
print(*(part.shape for part in (train_df, val_df, test_df)))

In [6]:
# Binary bag-of-words features. The vocabulary is fitted on the training text
# only; validation and test text are encoded with that same fitted vocabulary.
vectorizer = CountVectorizer(binary=True)
X_train = vectorizer.fit_transform(train_df["text"])
X_val, X_test = (
    vectorizer.transform(part["text"]) for part in (val_df, test_df)
)

In [None]:
# Shapes of the train / validation / test feature matrices, in that order.
print(*(features.shape for features in (X_train, X_val, X_test)))

In [None]:
# Logistic regression on the training features, with the iteration limit set
# to 2000. The fitted estimator is the cell's output.
model = LogisticRegression(max_iter=2000)
model.fit(X=X_train, y=train_df["label"])

In [None]:
# Score the fitted model on the validation partition.
val_preds = model.predict(X_val)
accuracy = accuracy_score(y_true=val_df["label"], y_pred=val_preds)
macro_f1 = f1_score(y_true=val_df["label"], y_pred=val_preds, average="macro")
micro_f1 = f1_score(y_true=val_df["label"], y_pred=val_preds, average="micro")

# One line per metric, preceded by a header line.
print(
    "The results on the validation set are:",
    f"Accuracy Score: {accuracy}",
    f"Macro F1-Score: {macro_f1}",
    f"Micro F1-Score: {micro_f1}",
    sep="\n",
)

In [None]:
# Score the fitted model on the test partition. This reuses (and overwrites)
# the accuracy / macro_f1 / micro_f1 names.
test_preds = model.predict(X_test)
accuracy = accuracy_score(y_true=test_df["label"], y_pred=test_preds)
macro_f1 = f1_score(y_true=test_df["label"], y_pred=test_preds, average="macro")
micro_f1 = f1_score(y_true=test_df["label"], y_pred=test_preds, average="micro")

# One line per metric, preceded by a header line.
print(
    "The results on the test set are:",
    f"Accuracy Score: {accuracy}",
    f"Macro F1-Score: {macro_f1}",
    f"Micro F1-Score: {micro_f1}",
    sep="\n",
)