In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import RandomOverSampler
import zipfile
import os

# Unpack the dataset archive so the raw text files are available on disk.
zip_file_path = '/content/movies.zip'
extraction_dir = '/content/movies'

# The context manager closes the archive handle once extraction finishes.
with zipfile.ZipFile(file=zip_file_path, mode='r') as zip_ref:
    zip_ref.extractall(path=extraction_dir)

# Function to load and parse the data
def load_data(file_path):
    """Parse a ``" ::: "``-delimited movie file into a DataFrame.

    Two record layouts are accepted:

    * four fields: ``ID ::: Title (Year) ::: Genre ::: Plot``
    * three fields: ``ID ::: Title (Year) ::: Plot`` -- stored with
      ``Genre`` set to ``None`` (presumably the layout of an unlabeled
      test file; verify against the dataset's description).

    Lines with any other number of fields are skipped.

    Args:
        file_path: Path of the text file to read (decoded as UTF-8).

    Returns:
        A ``pandas.DataFrame`` with string columns ``ID``, ``Title``,
        ``Year``, ``Genre`` and ``Plot``. ``Year`` is ``None`` when the
        title field carries no ``" (year)"`` suffix.
    """
    data = {"ID": [], "Title": [], "Year": [], "Genre": [], "Plot": []}
    # Explicit encoding so decoding does not depend on the platform locale.
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            parts = line.strip().split(" ::: ")
            if len(parts) == 4:
                movie_id, title_year, genre, plot = parts
            elif len(parts) == 3:
                # Unlabeled record: keep it so predictions can be made.
                movie_id, title_year, plot = parts
                genre = None
            else:
                continue

            # Split on the LAST " (" so parentheses inside the title survive.
            title, separator, year = title_year.rpartition(" (")
            if separator:
                year = year.strip(")")
            else:
                # No "(year)" suffix: keep the full title instead of raising.
                title, year = title_year, None

            data["ID"].append(movie_id)
            data["Title"].append(title)
            data["Year"].append(year)
            data["Genre"].append(genre)
            data["Plot"].append(plot)
    return pd.DataFrame(data)

# Both data files live in the same sub-folder of the extracted archive.
dataset_dir = os.path.join(extraction_dir, 'Genre Classification Dataset')
train_data_path = os.path.join(dataset_dir, 'train_data.txt')
test_data_path = os.path.join(dataset_dir, 'test_data.txt')

# Parse the labeled training file into a DataFrame.
df_train = load_data(train_data_path)

# Split the RAW data first. Oversampling (or fitting TF-IDF) before the split
# leaks information: duplicated minority-class rows would land in both the
# training and validation sets and inflate every validation metric.
y = df_train['Genre']
plots_train, plots_val, y_train, y_val = train_test_split(
    df_train['Plot'], y, test_size=0.2, random_state=42
)

# Fit the vocabulary / IDF weights on the training plots only, then reuse the
# fitted vectorizer for validation (and later for the test set).
vectorizer = TfidfVectorizer(stop_words='english')
X_train = vectorizer.fit_transform(plots_train)
X_val = vectorizer.transform(plots_val)

# Address class imbalance by oversampling the training split only; the
# validation split keeps its original class distribution.
ros = RandomOverSampler(random_state=42)
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)

# Train a Naive Bayes classifier on the balanced training data
model = MultinomialNB()
model.fit(X_resampled, y_resampled)

# Predict and evaluate the model on untouched validation samples
y_pred = model.predict(X_val)
print("Accuracy:", accuracy_score(y_val, y_pred))
# zero_division=1 reports undefined precision/recall as 1 instead of warning.
print("Classification Report:\n", classification_report(y_val, y_pred, zero_division=1))

# Parse the test file with the same loader used for the training file.
df_test = load_data(test_data_path)

# Show a preview so a bad load is visible immediately.
print("First few rows of the test data:\n", df_test.head())

if df_test.empty:
    # Nothing to predict on; report it instead of failing inside the vectorizer.
    print("Test data is empty or not loaded correctly.")
else:
    # Reuse the vectorizer fitted on the training plots, then classify.
    X_test = vectorizer.transform(df_test['Plot'])
    test_predictions = model.predict(X_test)

    # Attach the predictions and export only the ID / prediction pair.
    df_test['Predicted_Genre'] = test_predictions
    submission = df_test[['ID', 'Predicted_Genre']]
    submission.to_csv('genre_predictions.csv', index=False)


Accuracy: 0.9123668566609079
Classification Report:
               precision    recall  f1-score   support

      action       0.90      0.89      0.89      2626
       adult       0.94      0.99      0.96      2761
   adventure       0.95      0.91      0.93      2684
   animation       0.98      0.98      0.98      2790
   biography       0.97      1.00      0.98      2710
      comedy       0.82      0.60      0.69      2712
       crime       0.93      0.98      0.96      2691
 documentary       0.69      0.67      0.68      2667
       drama       0.67      0.48      0.56      2775
      family       0.94      0.93      0.94      2696
     fantasy       0.97      0.99      0.98      2854
   game-show       0.98      1.00      0.99      2700
     history       0.97      1.00      0.98      2763
      horror       0.86      0.95      0.90      2697
       music       0.91      0.97      0.94      2696
     musical       0.97      1.00      0.98      2718
     mystery       0.97     