In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report


In [2]:
plots = []
genres = []

with open("train_data.txt", "r", encoding="utf-8", errors="ignore") as file:
    for line in file:
        parts = line.strip().split(" ::: ")
        if len(parts) >= 4:
            genre = parts[2]
            plot = parts[3]
            genres.append(genre)
            plots.append(plot)

print("Number of samples:", len(plots))
print("First genre:", genres[0])
print("First plot:", plots[0][:200])  # first 200 characters


Number of samples: 54214
First genre: drama
First plot: Listening in to a conversation between his doctor and parents, 10-year-old Oscar learns what nobody has the courage to tell him. He only has a few weeks to live. Furious, he refuses to speak to anyone


In [3]:
tfidf = TfidfVectorizer(
    stop_words="english",
    max_features=5000
)

X = tfidf.fit_transform(plots)

print("TF-IDF shape:", X.shape)


TF-IDF shape: (54214, 5000)


In [4]:
X_train, X_test, y_train, y_test = train_test_split(
    X,
    genres,
    test_size=0.2,
    random_state=42
)

print("Training samples:", X_train.shape[0])
print("Testing samples:", X_test.shape[0])


Training samples: 43371
Testing samples: 10843


In [5]:
model = LogisticRegression(
    max_iter=1000,
    n_jobs=-1
)

model.fit(X_train, y_train)

print("Model training completed ✅")


Model training completed ✅


In [6]:
y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print("Model Accuracy:", accuracy)

print("\nClassification Report:\n")
print(classification_report(y_test, y_pred))


Model Accuracy: 0.5794521811306834

Classification Report:

              precision    recall  f1-score   support

      action       0.52      0.25      0.34       263
       adult       0.75      0.21      0.33       112
   adventure       0.42      0.14      0.21       139
   animation       0.62      0.10      0.17       104
   biography       0.00      0.00      0.00        61
      comedy       0.51      0.59      0.55      1443
       crime       0.29      0.02      0.04       107
 documentary       0.66      0.84      0.74      2659
       drama       0.54      0.78      0.64      2697
      family       0.39      0.07      0.12       150
     fantasy       0.00      0.00      0.00        74
   game-show       0.94      0.42      0.59        40
     history       0.00      0.00      0.00        45
      horror       0.64      0.56      0.60       431
       music       0.62      0.47      0.53       144
     musical       1.00      0.02      0.04        50
     mystery       0.

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [10]:
def predict_genre(plot_text):
    vector = tfidf.transform([plot_text])
    prediction = model.predict(vector)
    return prediction[0]

# Example prediction
sample_plot = """
A group of friends travel to a remote village where strange
events begin to occur at night and a dark secret is revealed.
"""

print("Predicted Genre:", predict_genre(sample_plot))


Predicted Genre: horror


In [11]:
def predict_genre(plot_text):
    vector = tfidf.transform([plot_text])
    prediction = model.predict(vector)
    return prediction[0]

# Test with your own story
test_plot = """
A brave soldier risks his life on the battlefield to protect his country
while struggling with loss, loyalty, and sacrifice.
"""

print("Predicted Genre:", predict_genre(test_plot))


Predicted Genre: drama


In [14]:
def predict_indian_movie(plot_english):
    vector = tfidf.transform([plot_english])
    prediction = model.predict(vector)
    return prediction[0]


# Example: Indian movie (English plot)
indian_movie_plot = """
A young farmer fights against a corrupt political system
to protect his village and family while discovering his inner strength.
"""

print("Predicted Genre:", predict_indian_movie(indian_movie_plot))


Predicted Genre: drama
