In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from tqdm import tqdm # Import tqdm for progress bars

# Download necessary NLTK data
nltk.download('punkt')
nltk.download('stopwords')


def load_and_clean_data(file_path):
    """
    Loads data from a specified file, cleans the text, and returns cleaned plots and genres.
    """
    data = pd.read_csv(file_path, sep='::: ', engine='python', names=['id', 'tittle', 'genre', 'plot_summary'])
    
    stop_words = set(stopwords.words('english'))
    def clean_text(text):
        text = re.sub(r'[^a-zA-Z\s]', '', text)
        text = text.lower()
        tokens = word_tokenize(text)
        tokens = [word for word in tokens if word not in stop_words]
        return ' '.join(tokens)

    print("Cleaning plot summaries...")
    data['cleaned_plot'] = [clean_text(text) for text in tqdm(data['plot_summary'], desc="Progress")]
    return data['cleaned_plot'], data['genre']


def compute_class_weights(labels):
    """
    Computes class weights to handle imbalanced datasets.
    """
    unique_labels, counts = np.unique(labels, return_counts=True)
    total_samples = len(labels)
    class_weights = {label: total_samples / (len(unique_labels) * count) for label, count in zip(unique_labels, counts)}
    return class_weights


def train_optimized_model(texts, labels):
    """
    Trains and optimizes a Logistic Regression model using GridSearchCV.
    """
    print("Transforming text to TF-IDF features...")
    tfidf = TfidfVectorizer(max_features=10000, ngram_range=(1, 2))
    X = tfidf.fit_transform(tqdm(texts, desc="TF-IDF Progress"))

    X_train, X_test, y_train, y_test = train_test_split(X, labels, test_size=0.2, random_state=42)

    class_weights = compute_class_weights(y_train)
    
    base_model = LogisticRegression(max_iter=1000, class_weight=class_weights)

    param_grid = {
        'C': [0.1, 1.0, 10.0],
        'solver': ['lbfgs', 'liblinear']
    }
    print("Running GridSearchCV for hyperparameter tuning...")
    grid_search = GridSearchCV(base_model, param_grid, cv=5, scoring='accuracy')
    grid_search.fit(X_train, y_train)

    best_model = grid_search.best_estimator_
    print(f"Best Parameters: {grid_search.best_params_}")

    y_pred = best_model.predict(X_test)
    print("\nLogistic Regression Results:")
    print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
    print(classification_report(y_test, y_pred))

    return best_model, tfidf


def predict_genre(model, tfidf, new_plot):
    """
    Predicts the genre of a new plot summary.
    """
    stop_words = set(stopwords.words('english'))
    cleaned_plot = re.sub(r'[^a-zA-Z\s]', '', new_plot.lower())
    tokens = word_tokenize(cleaned_plot)
    tokens = [word for word in tokens if word not in stop_words]
    plot_tfidf = tfidf.transform([' '.join(tokens)])
    prediction = model.predict(plot_tfidf)
    return prediction[0]


if __name__ == "__main__":
    file_path = '/kaggle/input/genre-classification-dataset/train_data.txt' 

    print("Loading and preprocessing data...")
    texts, labels = load_and_clean_data(file_path)
    print("Genre Distribution:")
    print(labels.value_counts())

    print("\nTraining optimized Logistic Regression model...")
    model, tfidf = train_optimized_model(texts, labels)

    print("\nPredicting genre for a new plot...")
    new_plot = "A group of astronauts travel to Mars and uncover a hidden alien civilization."
    predicted_genre = predict_genre(model, tfidf, new_plot)
    print(f"Predicted Genre: {predicted_genre}")

[nltk_data] Error loading punkt: <urlopen error [Errno -3] Temporary
[nltk_data]     failure in name resolution>
[nltk_data] Error loading stopwords: <urlopen error [Errno -3]
[nltk_data]     Temporary failure in name resolution>


Loading and preprocessing data...
Cleaning plot summaries...


Progress: 100%|██████████| 54214/54214 [00:20<00:00, 2704.17it/s]


Genre Distribution:
genre
drama           13613
documentary     13096
comedy           7447
short            5073
horror           2204
thriller         1591
action           1315
western          1032
reality-tv        884
family            784
adventure         775
music             731
romance           672
sci-fi            647
adult             590
crime             505
animation         498
sport             432
talk-show         391
fantasy           323
mystery           319
musical           277
biography         265
history           243
game-show         194
news              181
war               132
Name: count, dtype: int64

Training optimized Logistic Regression model...
Transforming text to TF-IDF features...


TF-IDF Progress: 100%|██████████| 54214/54214 [00:06<00:00, 8802.94it/s]


Running GridSearchCV for hyperparameter tuning...
Best Parameters: {'C': 10.0, 'solver': 'liblinear'}

Logistic Regression Results:
Accuracy: 0.5584
              precision    recall  f1-score   support

     action        0.34      0.46      0.39       263
      adult        0.52      0.54      0.53       112
  adventure        0.23      0.31      0.26       139
  animation        0.26      0.19      0.22       104
  biography        0.00      0.00      0.00        61
     comedy        0.56      0.56      0.56      1443
      crime        0.14      0.15      0.14       107
documentary        0.76      0.75      0.75      2659
      drama        0.65      0.58      0.61      2697
     family        0.19      0.25      0.22       150
    fantasy        0.10      0.07      0.08        74
  game-show        0.80      0.70      0.75        40
    history        0.06      0.02      0.03        45
     horror        0.58      0.71      0.64       431
      music        0.48      0.65      0