In [1]:
import re
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Labeled training data; loaded with the ID / TITLE / GENRE / DESCRIPTION layout.
TRAIN_FILE = "train_data.txt"
# Unlabeled test data; loaded with the ID / TITLE / DESCRIPTION layout (no GENRE).
TEST_FILE = "test_data.txt"
# Ground-truth genres for the test set; loaded with the same four-column layout
# as the training file. Evaluation is skipped when this file is missing.
TEST_SOLUTION_FILE = "test_data_solution.txt"
# Field separator handed to pandas.read_csv as `sep`.
DELIMITER = " ::: "

def load_data(filepath, is_training=True, *, delimiter=None):
    """Load a headerless, delimiter-separated movie file into a DataFrame.

    Args:
        filepath: Path (or file-like object) handed to ``pandas.read_csv``.
        is_training: When true the file is read with the columns
            ``ID, TITLE, GENRE, DESCRIPTION``; otherwise with
            ``ID, TITLE, DESCRIPTION``.
        delimiter: Field separator. Defaults to the module-level ``DELIMITER``
            when omitted. A multi-character separator is interpreted by
            pandas as a regular expression.

    Returns:
        A DataFrame whose text columns are stripped of surrounding whitespace
        and whose ``TITLE`` and ``DESCRIPTION`` columns contain no missing
        values (they are replaced by empty strings).

    Raises:
        FileNotFoundError: If ``filepath`` does not exist.
    """
    if is_training:
        cols = ["ID", "TITLE", "GENRE", "DESCRIPTION"]
    else:
        cols = ["ID", "TITLE", "DESCRIPTION"]

    sep = DELIMITER if delimiter is None else delimiter
    # The python engine is required because the separator is longer than one
    # character.
    df = pd.read_csv(filepath, sep=sep, names=cols, engine="python")

    for c in df.columns:
        # Test the dtype itself rather than comparing against "object": this
        # also matches the dedicated pandas string dtypes, for which the
        # equality check is False and stripping would be silently skipped.
        if pd.api.types.is_string_dtype(df[c].dtype):
            df[c] = df[c].str.strip()

    # Rows with missing trailing fields are read as NaN; downstream text
    # processing expects plain strings.
    df["DESCRIPTION"] = df["DESCRIPTION"].fillna("")
    df["TITLE"] = df["TITLE"].fillna("")
    return df

# Compiled once at import time; clean_text is applied to every row.
_HTML_TAG_RE = re.compile(r"<.*?>")
_NON_LETTER_RE = re.compile(r"[^a-zA-Z\s]")


def clean_text(text: str) -> str:
    """Normalize raw text for vectorization.

    The input is coerced to ``str``, anything that looks like an HTML tag
    (``<...>`` on a single line) is removed, every character that is neither
    an ASCII letter nor whitespace is replaced by a single space, and the
    result is lower-cased. Whitespace runs are not collapsed.
    """
    without_tags = _HTML_TAG_RE.sub("", str(text))
    letters_only = _NON_LETTER_RE.sub(" ", without_tags)
    return letters_only.lower()

def _build_text_feature(df):
    """Return one cleaned string per row: title, a space, then description."""
    return df["TITLE"].apply(clean_text) + " " + df["DESCRIPTION"].apply(clean_text)


def _print_evaluation(evaluation_df):
    """Print accuracy, per-class report, confusion matrix and sample rows.

    ``evaluation_df`` must contain the columns ``TITLE``, ``GENRE`` (ground
    truth) and ``PREDICTED_GENRE``.
    """
    y_true = evaluation_df["GENRE"]
    y_predicted = evaluation_df["PREDICTED_GENRE"]

    acc = accuracy_score(y_true, y_predicted)
    print("\nEvaluation")
    print("-" * 40)
    print(f"Accuracy: {acc:.4f}\n")
    print("Classification report:")
    # zero_division=0 keeps the reported 0.00 for classes that are never
    # predicted but avoids the undefined-metric warning.
    print(classification_report(y_true, y_predicted, zero_division=0))

    print("Confusion matrix:")
    # Use every label seen in either series so that predictions of a genre
    # absent from the ground truth are not silently dropped from the matrix.
    genres = sorted(set(y_true) | set(y_predicted))
    cm = confusion_matrix(y_true, y_predicted, labels=genres)
    cm_df = pd.DataFrame(
        cm,
        index=[f"True: {g}" for g in genres],
        columns=[f"Pred: {g}" for g in genres],
    )
    print(cm_df.to_string())

    print("\nSamples:")
    print(evaluation_df[["TITLE", "GENRE", "PREDICTED_GENRE"]].head(20).to_markdown(index=False))


def run_genre_predictor():
    """Train a TF-IDF + one-vs-rest logistic regression genre classifier.

    Loads the training and test files, builds a text feature from the cleaned
    title and description, fits the model on the training data and predicts a
    genre for every test row. If the solution file is available, the
    predictions are joined to it on ``ID`` and evaluation metrics are
    printed; otherwise the first predictions are printed.

    Returns ``None``. A missing training or test file is reported on stdout
    and aborts the run; a missing solution file only disables evaluation.
    """
    print("Loading data...")
    try:
        train_df = load_data(TRAIN_FILE, is_training=True)
    except FileNotFoundError:
        print(f"Error: {TRAIN_FILE} not found.")
        return
    try:
        test_df = load_data(TEST_FILE, is_training=False)
    except FileNotFoundError:
        print(f"Error: {TEST_FILE} not found.")
        return
    try:
        # The solution file is read with the four-column training layout;
        # only the ID and the true genre are needed.
        test_solution_df = load_data(TEST_SOLUTION_FILE, is_training=True)[["ID", "GENRE"]]
    except FileNotFoundError:
        print(f"Warning: {TEST_SOLUTION_FILE} not found. Skipping evaluation.")
        test_solution_df = None

    print("Building features...")
    train_df["TEXT_FEATURE"] = _build_text_feature(train_df)
    test_df["TEXT_FEATURE"] = _build_text_feature(test_df)

    X_train = train_df["TEXT_FEATURE"]
    y_train = train_df["GENRE"]

    print("Vectorizing...")
    tfidf = TfidfVectorizer(stop_words="english", max_features=50000, ngram_range=(1, 2))
    # The vocabulary is learned from the training set only and then reused
    # for the test set.
    X_train_tfidf = tfidf.fit_transform(X_train)
    X_test_tfidf = tfidf.transform(test_df["TEXT_FEATURE"])
    print(f"Train shape: {X_train_tfidf.shape} | Test shape: {X_test_tfidf.shape}")

    print("Training...")
    clf = OneVsRestClassifier(LogisticRegression(solver="liblinear", random_state=42, C=10))
    clf.fit(X_train_tfidf, y_train)

    print("Predicting...")
    y_pred = clf.predict(X_test_tfidf)
    test_df["PREDICTED_GENRE"] = y_pred

    evaluation_df = None
    if test_solution_df is not None:
        evaluation_df = pd.merge(test_df, test_solution_df, on="ID", how="inner")
        if evaluation_df.empty:
            # Without matching rows the metric functions would fail; fall
            # back to just showing the predictions.
            print("Warning: no test IDs matched the solution file. Skipping evaluation.")
            evaluation_df = None

    if evaluation_df is not None:
        _print_evaluation(evaluation_df)
    else:
        print("\nFirst predictions:")
        print(test_df[["ID", "TITLE", "DESCRIPTION", "PREDICTED_GENRE"]].head(10).to_markdown(index=False))

# Run the full load / train / predict / evaluate pipeline only when this file
# is executed directly, not when it is imported.
if __name__ == "__main__":
    run_genre_predictor()


Loading data...
Building features...
Vectorizing...
Train shape: (54214, 50000) | Test shape: (54200, 50000)
Training...
Predicting...

Evaluation
----------------------------------------
Accuracy: 0.5962

Classification report:
              precision    recall  f1-score   support

      action       0.48      0.35      0.41      1314
       adult       0.67      0.37      0.48       590
   adventure       0.55      0.21      0.31       775
   animation       0.49      0.11      0.19       498
   biography       0.00      0.00      0.00       264
      comedy       0.55      0.60      0.57      7446
       crime       0.33      0.07      0.11       505
 documentary       0.69      0.84      0.76     13096
       drama       0.56      0.75      0.64     13612
      family       0.49      0.16      0.24       783
     fantasy       0.41      0.08      0.14       322
   game-show       0.85      0.59      0.70       193
     history       0.50      0.02      0.04       243
      horror  