In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from tqdm import tqdm

import nltk
try:
    nltk.data.find("tokenizers/punkt")
except LookupError:
    nltk.download("punkt")

try:
    nltk.data.find("corpora/stopwords")
except LookupError:
    nltk.download("stopwords")


file_path = '/kaggle/input/genre-classification-dataset/train_data.txt'

data = pd.read_csv(file_path, sep='::: ', engine='python', names=['id', 'tittle', 'genre', 'plot_summary'])

stop_words = set(stopwords.words('english'))

def clean_text(text):
    text = re.sub(r'[^a-zA-Z\s]', '', text)  
    text = text.lower()
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word not in stop_words]
    return ' '.join(tokens)

print("Cleaning plot summaries...")
data['cleaned_plot'] = [clean_text(text) for text in tqdm(data['plot_summary'], desc="Progress")]

texts = data['cleaned_plot']
labels = data['genre']

print("Genre Distribution:")
print(labels.value_counts())


unique_labels, counts = np.unique(labels, return_counts=True)
total_samples = len(labels)
class_weights = {label: total_samples / (len(unique_labels) * count) for label, count in zip(unique_labels, counts)}

print("Class Weights:", class_weights)
print("Transforming text to TF-IDF features...")
tfidf = TfidfVectorizer(max_features=10000, ngram_range=(1, 2))
X = tfidf.fit_transform(tqdm(texts, desc="TF-IDF Progress"))

X_train, X_test, y_train, y_test = train_test_split(X, labels, test_size=0.2, random_state=42)

from sklearn.linear_model import LogisticRegression

print("Training Logistic Regression model...")
model = LogisticRegression(
    max_iter=1000,
    class_weight=class_weights
)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

print("\nLogistic Regression Results:")
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(classification_report(y_test, y_pred))

y_pred = model.predict(X_test)

print("\nLogistic Regression Results:")
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(classification_report(y_test, y_pred))

new_plot = input()

cleaned_plot = re.sub(r'[^a-zA-Z\s]', '', new_plot.lower())
tokens = word_tokenize(cleaned_plot)
tokens = [word for word in tokens if word not in stop_words]
plot_tfidf = tfidf.transform([' '.join(tokens)])

predicted_genre = model.predict(plot_tfidf)
print(f"Predicted Genre: {predicted_genre[0]}")


Cleaning plot summaries...


Progress: 100%|██████████| 54214/54214 [00:21<00:00, 2523.20it/s]


Genre Distribution:
genre
drama           13613
documentary     13096
comedy           7447
short            5073
horror           2204
thriller         1591
action           1315
western          1032
reality-tv        884
family            784
adventure         775
music             731
romance           672
sci-fi            647
adult             590
crime             505
animation         498
sport             432
talk-show         391
fantasy           323
mystery           319
musical           277
biography         265
history           243
game-show         194
news              181
war               132
Name: count, dtype: int64
Class Weights: {'action ': 1.5269398676242782, 'adult ': 3.403264281230383, 'adventure ': 2.590872162485066, 'animation ': 4.031979770935594, 'biography ': 7.577078965758211, 'comedy ': 0.26962883388289594, 'crime ': 3.976090942427576, 'documentary ': 0.15332360460643907, 'drama ': 0.1475006189617223, 'family ': 2.561130007558579, 'fantasy ': 6.2164889

TF-IDF Progress: 100%|██████████| 54214/54214 [00:07<00:00, 7676.33it/s]


Training Logistic Regression model...

Logistic Regression Results:
Accuracy: 0.4814
              precision    recall  f1-score   support

     action        0.34      0.46      0.39       263
      adult        0.34      0.65      0.45       112
  adventure        0.18      0.33      0.23       139
  animation        0.19      0.25      0.21       104
  biography        0.04      0.08      0.05        61
     comedy        0.60      0.47      0.53      1443
      crime        0.13      0.32      0.18       107
documentary        0.81      0.59      0.68      2659
      drama        0.72      0.40      0.52      2697
     family        0.14      0.31      0.19       150
    fantasy        0.11      0.16      0.13        74
  game-show        0.67      0.75      0.71        40
    history        0.05      0.20      0.08        45
     horror        0.55      0.69      0.61       431
      music        0.38      0.75      0.50       144
    musical        0.14      0.34      0.20       

 lions story to becoming king 


Predicted Genre: fantasy 
