In [27]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multioutput import MultiOutputClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from tqdm import tqdm
import csv

In [28]:
genre_list = [ 'action', 'adult', 'adventure', 'animation', 'biography',
              'comedy','crime','documentary','family','fantasy',
              'game-show','history','horror','music','musical',
              "mystery",'news','reality-tv','romance','scifi',
              'sport','talk-show','thriller','war','western' ]
fallback_genre = 'Unknown'

In [29]:
try:
    with tqdm(total=50, desc="Loading Train Data") as pbar:
        train_data = pd.read_csv('train_data.txt', sep=':::',
                                         header=None, names=['SerialNumber', 'MOVIE_NAME',
                                                      'GENRE', 'MOVIE_PLOT'], engine='python')
        pbar.update(500)

except Exception as e:
  print (f"Error loading train_data: {e}")
  raise

Loading Train Data: 500it [00:00, 855.23it/s]


In [30]:
X_train = train_data['MOVIE_PLOT'].astype (str).apply(lambda doc: doc.lower())
genre_labels = [genre.split(',') for genre in train_data['GENRE']]
mlb = MultiLabelBinarizer()
y_train = mlb.fit_transform (genre_labels)

In [31]:
tfidf_vectorizer = TfidfVectorizer (max_features=5000)

In [32]:
with tqdm(total=50, desc="Vectorizing Training Data") as pbar:
    X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
    pbar.update(50)

Vectorizing Training Data: 100%|██████████| 50/50 [00:04<00:00, 11.57it/s]


In [33]:
with tqdm(total=50, desc="Training Model") as pbar:
    naive_bayes = MultinomialNB()
    multi_output_classifier = MultiOutputClassifier(naive_bayes)
    multi_output_classifier. fit (X_train_tfidf, y_train)
    pbar.update(50)

Training Model: 100%|██████████| 50/50 [00:00<00:00, 52.20it/s]


In [34]:
try:
    with tqdm(total=50, desc="Loading Test Data") as pbar:
        test_data = pd.read_csv('test_data.txt', sep=':::',
                                header=None, names=['SerialNumber', 'MOVIE_NAME',
                                                    'MOVIE_PLOT'], engine='python')
        pbar.update(50)
except Exception as e:
    print (f"Error loading test_data: {e}")
    raise


Loading Test Data: 100%|██████████| 50/50 [00:00<00:00, 258.68it/s]


In [35]:
X_test = test_data['MOVIE_PLOT'].astype (str).apply(lambda doc: doc.lower())

In [36]:
with tqdm(total=50, desc="Vectorizing Test Data") as pbar:
    X_test_tfidf = tfidf_vectorizer.transform (X_test)
    pbar.update(50)


Vectorizing Test Data: 100%|██████████| 50/50 [00:01<00:00, 27.31it/s]


In [37]:
with tqdm(total=50, desc="Predicting on Test Data") as pbar:
    y_pred = multi_output_classifier.predict(X_test_tfidf)
    pbar.update(50)


Predicting on Test Data: 100%|██████████| 50/50 [00:00<00:00, 277.05it/s]


In [38]:
test_movie_names = test_data['MOVIE_NAME']
predicted_genres = mlb.inverse_transform (y_pred)
test_results = pd.DataFrame({ 'MOVIE_NAME': test_movie_names, 'PREDICTED_GENRES': predicted_genres})
test_results[ 'PREDICTED_GENRES'] = test_results[ 'PREDICTED_GENRES'].apply(lambda genres: [fallback_genre] if len(genres) == 0 else genres)



In [39]:
with open("movie_genre_model_evaluation.csv", "w", newline='', encoding="utf-8") as output_file:
    csv_writer = csv.writer(output_file)
    csv_writer.writerow(['MOVIE_NAME', 'PREDICTED_GENRES'])
    for _, row in test_results.iterrows():
        movie_name = row['MOVIE_NAME']
        genre_str = ','.join(row['PREDICTED_GENRES'])
        csv_writer.writerow([movie_name, genre_str])


In [40]:
y_train_pred = multi_output_classifier.predict(X_train_tfidf)
accuracy=accuracy_score (y_train, y_train_pred)
precision = precision_score (y_train, y_train_pred, average='micro')
recall = recall_score (y_train, y_train_pred, average='micro')
f1 = f1_score (y_train, y_train_pred, average='micro')

In [41]:
metrics = {
    "Accuracy": accuracy * 100,
    "Precision": precision,
    "Recall": recall,
    "F1-score": f1
}

In [42]:
with open("movie_genre_model_evaluation.csv", "a", newline='', encoding="utf-8") as output_file:
    csv_writer = csv.writer(output_file)
    output_file.write("\n\nModel Evaluation Metrics: \n")
    for metric, value in metrics.items():
        output_file.write(f"{metric}: {value:.2f}\n")


In [43]:
print("Model evaluation results and metrics have been saved to 'model_evaluation.txt'.")

Model evaluation results and metrics have been saved to 'model_evaluation.txt'.
