In [10]:
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Load the dataset from the URL
url = 'https://raw.githubusercontent.com/sundeepblue/movie_rating_prediction/master/movie_metadata.csv'
df = pd.read_csv(url)

# Select relevant columns and drop rows with missing values
df = df[['plot_keywords', 'genres']].dropna()

# Convert genres to a list of strings (if not already)
df['genres'] = df['genres'].apply(lambda x: x.split('|'))

# Preprocess text in plot_keywords column
nltk.download('punkt')
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    text = text.lower()  # Convert to lowercase
    tokens = word_tokenize(text)  # Tokenize text
    tokens = [token for token in tokens if token.isalpha()]  # Remove punctuation
    tokens = [token for token in tokens if token not in stop_words]  # Remove stopwords
    return ' '.join(tokens)

df['clean_plot_keywords'] = df['plot_keywords'].apply(preprocess_text)

# Use MultiLabelBinarizer to convert genres into binary format
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(df['genres'])

# Use TfidfVectorizer for text feature extraction
tfidf_vectorizer = TfidfVectorizer(max_features=1000)
X = tfidf_vectorizer.fit_transform(df['clean_plot_keywords'])


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [11]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize a RandomForestClassifier (or any other classifier of your choice)
clf = RandomForestClassifier(random_state=42)

# Train the classifier
clf.fit(X_train, y_train)

# Predict on the test set
y_pred = clf.predict(X_test)

# Convert predictions back to original genre labels using inverse_transform
y_pred_labels = mlb.inverse_transform(y_pred)
y_test_labels = mlb.inverse_transform(y_test)

# Evaluate the model
report = classification_report(y_test, y_pred, target_names=mlb.classes_)
print(report)


              precision    recall  f1-score   support

      Action       0.46      0.10      0.16       230
   Adventure       0.41      0.09      0.15       193
   Animation       0.00      0.00      0.00        47
   Biography       0.14      0.03      0.05        68
      Comedy       0.56      0.26      0.35       372
       Crime       0.47      0.11      0.17       171
 Documentary       0.00      0.00      0.00        21
       Drama       0.54      0.80      0.65       503
      Family       0.31      0.03      0.06       122
     Fantasy       0.35      0.04      0.08       134
   Film-Noir       0.00      0.00      0.00         1
   Game-Show       0.00      0.00      0.00         0
     History       0.10      0.03      0.04        37
      Horror       0.48      0.12      0.20       104
       Music       0.17      0.03      0.04        40
     Musical       0.00      0.00      0.00        23
     Mystery       0.38      0.08      0.13       103
        News       0.00    

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
