In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder

# ✅ Load the training and test data
# sep=':::' is handled by the python engine. When a row has more fields than
# `names`, pandas uses the leading extra field(s) as the row index.
train_data = pd.read_csv("train_data.txt", sep=':::', names=['Title', 'Genre', 'Description'], engine='python')
test_data = pd.read_csv("test_data.txt", sep=':::', names=['Title', 'Description'], engine='python')
# NOTE(review): assumes test_data_solution.txt has the same field layout as
# train_data.txt (leading id, then Title, Genre, Description) — confirm against
# the file. With that layout, supplying only two names makes pandas fold the
# first two fields into the index and label the genre text 'Title' and the
# description text 'Genre', so no title ever matches during evaluation.
# The file is therefore read with the training layout and only the two columns
# used downstream are kept.
test_solution = pd.read_csv(
    "test_data_solution.txt",
    sep=':::',
    names=['Title', 'Genre', 'Description'],
    engine='python',
)[['Title', 'Genre']]

# ✅ Encode genre labels
label_encoder = LabelEncoder()
# Missing genres are replaced with the placeholder 'unknown'.
# fillna() must run BEFORE astype(str): casting first turns NaN into the
# literal string 'nan', after which fillna() has nothing left to replace.
# Surrounding whitespace is stripped so that padded fields (e.g. " drama ")
# and unpadded ones map to the same class and decode to clean names.
train_data["Genre"] = train_data["Genre"].fillna('unknown').astype(str).str.strip()
test_solution["Genre"] = test_solution["Genre"].fillna('unknown').astype(str).str.strip()

# ✅ Fit LabelEncoder on all genres from train and test solutions
# (the union guarantees transform() below never meets an unseen label)
all_genres = pd.concat([train_data["Genre"], test_solution["Genre"]]).unique()
label_encoder.fit(all_genres)

# Replace the genre names with their integer class ids in both frames.
train_data["Genre"] = label_encoder.transform(train_data["Genre"])
test_solution["Genre"] = label_encoder.transform(test_solution["Genre"])

# ✅ Hold out 20% of the labelled rows for validation (seeded for repeatability)
X_train, X_test, y_train, y_test = train_test_split(
    train_data["Description"],
    train_data["Genre"],
    test_size=0.2,
    random_state=42,
)

# ✅ TF-IDF Vectorization
# English stop words are removed and the vocabulary is capped at 5000 features.
tfidf = TfidfVectorizer(max_features=5000, stop_words="english")

# The vocabulary and IDF weights are learned from the training split only;
# fit() returns the vectorizer itself, so the training matrix is produced in
# the same statement.
X_train_tfidf = tfidf.fit(X_train).transform(X_train)

# Every other split is projected into that already-fitted feature space.
X_test_tfidf = tfidf.transform(X_test)
X_final_test_tfidf = tfidf.transform(test_data["Description"])  # unlabeled test file


# ✅ Train Naïve Bayes Model
model = MultinomialNB()
model.fit(X_train_tfidf, y_train)

# ✅ Validate Model on Split Test Data
y_pred = model.predict(X_test_tfidf)
print("Validation Accuracy:", accuracy_score(y_test, y_pred))

# Report rows are labelled with genre names instead of opaque encoder ids.
# `labels` is the sorted union of ids present in the truth and the predictions
# (what classification_report would pick by default), so `target_names` lines
# up with it one-to-one.
report_labels = sorted(set(y_test) | set(y_pred))
report_names = [str(name) for name in label_encoder.inverse_transform(report_labels)]
# zero_division=0 keeps the 0.00 score for classes the model never predicts
# but silences the undefined-metric warning emitted for each of them.
print("Classification Report:\n", classification_report(
    y_test,
    y_pred,
    labels=report_labels,
    target_names=report_names,
    zero_division=0,
))

# ✅ Predict on Full Test Data (`test_data.txt`)
# Score every unlabeled description, then turn the encoded class ids back
# into genre names before attaching them to the frame.
y_final_pred = model.predict(X_final_test_tfidf)
decoded_genres = label_encoder.inverse_transform(y_final_pred)
test_data["Predicted_Genre"] = decoded_genres

# ✅ Save predictions to file (the row index is not written)
test_data.to_csv(path_or_buf="predicted_genres.csv", index=False)
print("Predictions saved to predicted_genres.csv")

# ✅ Evaluate on actual test solutions (`test_data_solution.txt`)
# Only the true label is taken from the solution frame, so no other solution
# column can collide with (and rename) a column of test_data.
# NOTE(review): rows are matched on Title alone; a title that occurs more than
# once would be paired with every solution row carrying it — confirm titles
# are unique, or align on a row id instead.
merged_df = test_data.merge(test_solution[["Title", "Genre"]], on="Title", how="left")

# 'Genre' already holds encoder ids, so the predicted genre names are mapped
# back to ids with the same label_encoder to compare like with like.
merged_df['Predicted_Genre_Numeric'] = label_encoder.transform(merged_df["Predicted_Genre"])

#  ✅ Drop rows with NaN values in 'Genre' (no matching solution row) before calculating accuracy
merged_df = merged_df.dropna(subset=['Genre'])

if merged_df.empty:
    # Scoring zero rows would only yield NaN plus numpy "mean of empty slice"
    # warnings; report the real problem instead.
    accuracy_on_test = float("nan")
    print("Accuracy on test_data.txt: n/a (no prediction could be matched to a solution row by Title)")
else:
    # The left merge turns the id column into floats whenever any row was
    # unmatched; cast back to int so both sides are plain class ids.
    accuracy_on_test = accuracy_score(merged_df["Genre"].astype(int), merged_df['Predicted_Genre_Numeric'])
    print(f"Accuracy on test_data.txt: {accuracy_on_test:.2%}")

Validation Accuracy: 0.5231946878170248
Classification Report:
               precision    recall  f1-score   support

       53979       0.58      0.08      0.14       263
       53980       0.88      0.06      0.12       112
       53981       0.29      0.03      0.05       139
       53988       0.00      0.00      0.00       104
       53991       0.00      0.00      0.00        61
       53993       0.51      0.44      0.47      1443
       53994       0.00      0.00      0.00       107
       53997       0.58      0.88      0.70      2659
       53999       0.46      0.83      0.59      2697
       54002       1.00      0.01      0.01       150
       54004       0.00      0.00      0.00        74
       54010       1.00      0.15      0.26        40
       54019       0.00      0.00      0.00        45
       54020       0.73      0.36      0.48       431
       54033       0.77      0.12      0.20       144
       54034       0.00      0.00      0.00        50
       54037     

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Predictions saved to predicted_genres.csv
Accuracy on test_data.txt: nan%


  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)


In [None]:
def predict_genre(movie_description):
    """Return the genre name predicted for a single plot description.

    Uses the notebook-level ``tfidf`` vectorizer, ``model`` classifier and
    ``label_encoder`` defined in an earlier cell; they must already be fitted.

    Parameters
    ----------
    movie_description
        The plot text to classify; it is passed to ``tfidf.transform`` as a
        one-element list.

    Returns
    -------
    The decoded genre label for that description, exactly as stored in
    ``label_encoder``.
    """
    # ✅ One-row TF-IDF matrix in the feature space learned during training
    features = tfidf.transform([movie_description])

    # ✅ Predict, keeping only the encoded class id of the first (and only) row
    encoded = model.predict(features)[:1]

    # ✅ Decode the id back to the genre name
    return label_encoder.inverse_transform(encoded)[0]

# Example usage: classify a one-sentence plot summary and show the result.
movie_description = (
    "A team of explorers travel through a wormhole in space "
    "in an attempt to ensure humanity's survival."
)
predicted_genre = predict_genre(movie_description)
print(f"Predicted Genre: {predicted_genre}")


Predicted Genre:  documentary 
