In [3]:
import ast
import json
import re

import nltk
import numpy as np
import pandas as pd
import spacy
from nltk.corpus import stopwords
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split

In [2]:
pip install spacy

Collecting spacy
  Downloading spacy-3.8.3-cp310-cp310-win_amd64.whl (12.2 MB)
     ---------------------------------------- 12.2/12.2 MB 4.2 MB/s eta 0:00:00
Collecting weasel<0.5.0,>=0.1.0
  Downloading weasel-0.4.1-py3-none-any.whl (50 kB)
     ---------------------------------------- 50.3/50.3 kB 2.5 MB/s eta 0:00:00
Collecting pydantic!=1.8,!=1.8.1,<3.0.0,>=1.7.4
  Downloading pydantic-2.10.4-py3-none-any.whl (431 kB)
     -------------------------------------- 431.8/431.8 kB 5.4 MB/s eta 0:00:00
Collecting murmurhash<1.1.0,>=0.28.0
  Downloading murmurhash-1.0.11-cp310-cp310-win_amd64.whl (25 kB)
Collecting spacy-loggers<2.0.0,>=1.0.0
  Downloading spacy_loggers-1.0.5-py3-none-any.whl (22 kB)
Collecting langcodes<4.0.0,>=3.2.0
  Downloading langcodes-3.5.0-py3-none-any.whl (182 kB)
     -------------------------------------- 183.0/183.0 kB 2.7 MB/s eta 0:00:00
Collecting wasabi<1.2.0,>=0.9.1
  Downloading wasabi-1.1.3-py3-none-any.whl (27 kB)
Collecting cymem<2.1.0,>=2.0.2
  Down

In [4]:
# Make sure the NLTK 'stopwords' corpus is available locally; it is read later
# through nltk.corpus.stopwords. The call's return value is the cell output.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [5]:
# Load the movies table. The path is relative, so the CSV must sit in the
# kernel's working directory.
data = pd.read_csv('tmdb_5000_movies.csv')

In [6]:
# Heading followed by the plain-text rendering of the first five rows.
print("Dataset Overview:", data.head(), sep="\n")

Dataset Overview:
      budget                                             genres  \
0  237000000  [{"id": 28, "name": "Action"}, {"id": 12, "nam...   
1  300000000  [{"id": 12, "name": "Adventure"}, {"id": 14, "...   
2  245000000  [{"id": 28, "name": "Action"}, {"id": 12, "nam...   
3  250000000  [{"id": 28, "name": "Action"}, {"id": 80, "nam...   
4  260000000  [{"id": 28, "name": "Action"}, {"id": 12, "nam...   

                                       homepage      id  \
0                   http://www.avatarmovie.com/   19995   
1  http://disney.go.com/disneypictures/pirates/     285   
2   http://www.sonypictures.com/movies/spectre/  206647   
3            http://www.thedarkknightrises.com/   49026   
4          http://movies.disney.com/john-carter   49529   

                                            keywords original_language  \
0  [{"id": 1463, "name": "culture clash"}, {"id":...                en   
1  [{"id": 270, "name": "ocean"}, {"id": 726, "na...                en   
2 

In [7]:
# Preprocessing: Extract genres
def extract_genres(genre_str):
    """
    Extracts the name of the first genre from a JSON-encoded genre list.

    The value is parsed as JSON first, because that is the format of the
    column (and JSON literals such as ``null`` are not valid Python literals).
    If that fails, it falls back to ``ast.literal_eval`` so Python-style reprs
    (single-quoted strings) keep working.

    Args:
        genre_str: A string such as '[{"id": 28, "name": "Action"}, ...]'.
            An already-parsed list is accepted as well. Any other value
            (e.g. None or NaN for a missing cell) yields None.

    Returns:
        The 'name' of the first entry, or None when the value is missing,
        cannot be parsed, is an empty list, or its first entry has no 'name'.
    """
    if isinstance(genre_str, str):
        try:
            genres = json.loads(genre_str)
        except (ValueError, RecursionError):
            # Not valid JSON: try a Python literal before giving up.
            try:
                genres = ast.literal_eval(genre_str)
            except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
                return None
    else:
        genres = genre_str

    # Only a non-empty sequence of genre records carries a usable answer.
    if not isinstance(genres, (list, tuple)) or not genres:
        return None

    first_genre = genres[0]
    if isinstance(first_genre, dict):
        return first_genre.get('name')
    return None


In [8]:
# One label per movie: the first genre listed in the raw 'genres' cell.
data['main_genre'] = data['genres'].map(extract_genres)

In [9]:
# Keep only the rows for which a main genre could be extracted.
data = data[data['main_genre'].notna()]

In [10]:
# Clean and preprocess the text in the 'overview' column
def clean_text(text):
    """
    Cleans text by removing special characters, numbers, and extra spaces.

    Every character that is not an ASCII letter or whitespace is deleted, the
    result is lower-cased, and each run of whitespace (including newlines and
    tabs) is collapsed into a single space with the ends trimmed.

    Args:
        text: Raw text. Non-string values (e.g. None or NaN) are treated as
            empty text.

    Returns:
        The cleaned lower-case string; '' when nothing remains.
    """
    if not isinstance(text, str):
        return ''
    letters_only = re.sub(r'[^A-Za-z\s]', '', text)  # Remove special characters and numbers
    # split() with no argument discards leading/trailing whitespace and treats
    # any whitespace run as one separator, so the join leaves single spaces only.
    return ' '.join(letters_only.lower().split())


In [11]:
# Missing overviews become empty strings first, so clean_text always gets a str.
data['cleaned_overview'] = (
    data['overview']
    .fillna('')
    .map(clean_text)
)

In [14]:
# Load the small English pipeline, downloading it only when it is not installed
# yet, so re-running the notebook does not hit the network every time.
SPACY_MODEL = 'en_core_web_sm'
try:
    nlp = spacy.load(SPACY_MODEL)
except OSError:
    # spacy.load raises OSError when the model package cannot be found.
    spacy.cli.download(SPACY_MODEL)  # Downloads the model
    # If this load still fails right after the download, restart the kernel
    # (as the download message advises) and run the cell again.
    nlp = spacy.load(SPACY_MODEL)


[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


SyntaxError: invalid syntax (1553972540.py, line 1)

In [15]:
# NLTK's English stop-word list, held in a set for fast membership tests.
stop_words = {word for word in stopwords.words('english')}

In [16]:
def preprocess_text(text):
    """
    Tokenizes, lemmatizes, and removes stopwords from the text.

    Args:
        text: String to run through the module-level spaCy pipeline ``nlp``.

    Returns:
        The lemmas of the kept tokens joined by single spaces. A token is
        dropped when it is punctuation, pure whitespace, or (compared in lower
        case) a member of the module-level ``stop_words`` collection.
    """
    doc = nlp(text)
    lemmas = [
        token.lemma_
        for token in doc
        # Compare in lower case so capitalised stop words ("The") are removed
        # too; this assumes the entries of stop_words are lower-case.
        if token.text.lower() not in stop_words
        and not token.is_punct
        # Runs of spaces/newlines are tokens of their own in spaCy; skip them so
        # they do not end up as blank "words" in the joined result.
        and not token.is_space
    ]
    return ' '.join(lemmas)

In [17]:
# Lemmatize every cleaned overview, one text at a time.
data['processed_overview'] = [
    preprocess_text(overview) for overview in data['cleaned_overview']
]

In [18]:
# TF-IDF features over the processed overviews, capped at 5000 vocabulary
# terms and converted to a dense array.
# NOTE(review): the vectorizer is fitted on every row before the train/test
# split, so its vocabulary and IDF weights also see the held-out documents.
# Consider fitting on the training rows only.
tfidf = TfidfVectorizer(max_features=5000)
tfidf_matrix = tfidf.fit_transform(data['processed_overview'])
X = tfidf_matrix.toarray()


In [19]:
# Labels to predict: the main genre of each movie.
y = data.loc[:, 'main_genre']

In [20]:
# Hold out 20% of the rows for evaluation; stratifying on y keeps the genre
# proportions the same in both parts, and the fixed seed makes the split
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)


In [21]:
# Fit a 100-tree random forest on the training split (seeded for repeatability).
classifier = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
)
classifier.fit(X_train, y_train)


In [22]:
# Predict a genre label for every row of the held-out test features; the result
# is compared with y_test in the evaluation cells below.
y_pred = classifier.predict(X_test)

In [23]:
# Per-genre precision/recall/F1 on the test split. Some genres are never
# predicted, which makes their precision undefined; zero_division=0 reports
# those scores as 0 explicitly instead of emitting a warning for each of them.
print("\nClassification Report:")
print(classification_report(y_test, y_pred, zero_division=0))


Classification Report:
                 precision    recall  f1-score   support

         Action       0.37      0.50      0.42       151
      Adventure       0.22      0.03      0.05        68
      Animation       0.00      0.00      0.00        25
         Comedy       0.38      0.48      0.42       208
          Crime       0.00      0.00      0.00        39
    Documentary       0.50      0.44      0.47        18
          Drama       0.35      0.66      0.46       242
         Family       0.00      0.00      0.00        11
        Fantasy       0.00      0.00      0.00        23
        History       0.00      0.00      0.00         5
         Horror       0.60      0.15      0.24        60
          Music       0.00      0.00      0.00         7
        Mystery       0.00      0.00      0.00         8
        Romance       0.00      0.00      0.00        21
Science Fiction       0.00      0.00      0.00        19
       TV Movie       0.00      0.00      0.00         1
      

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [24]:
# Share of test rows whose predicted genre matches the true one.
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.36858638743455496
