Install the necessary libraries

In [1]:
%pip install pandas 
%pip install scikit-learn 
%pip install nltk

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


Step 2: Data Preprocessing and Cleaning Code    
 The goal of this step is to convert the original text into a clean list of words

In [2]:
import pandas as pd
import re
import nltk
import ast
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# Download the stopwords dataset
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
ps = PorterStemmer()

# Function to parse and extract the primary genre
def get_first_genre(genre_str):
    if pd.isna(genre_str):
        return "Unknown"
    try:
        # Convert string representation of a list "['Drama', 'Crime']" into an actual list ['Drama', 'Crime']
        genre_list = ast.literal_eval(genre_str)
        if isinstance(genre_list, list) and len(genre_list) > 0:
            return genre_list[0]
        return "Unknown"
    except (ValueError, SyntaxError):
        # If parsing fails (e.g., non-standard list string format), attempt simple regex extraction
        found = re.findall(r"'\s*([^']+)\s*'", str(genre_str))
        if found:
            return found[0]
        return "Unknown"
    
# Function to clean and preprocess text
def clean_text(text):
    if pd.isna(text):
        return ""
    # 1. Remove special characters and numbers
    text = re.sub(r'[^a-zA-Z]', ' ', str(text))
    # 2. Convert text to lowercase
    text = text.lower()
    # 3. Tokenization (split text into words)
    words = text.split()
    # 4. Remove stopwords and apply Porter Stemming
    words = [ps.stem(w) for w in words if w not in stop_words]
    return " ".join(words)

# Load Kaggle dataset
df = pd.read_csv('movies_info.csv') 

# Apply the processing functions
df['main_genre'] = df['genres'].apply(get_first_genre)
df['clean_overview'] = df['overview'].apply(clean_text)

# Display processed results
print("First few rows of the processed data:")
print(df[['original_title', 'main_genre', 'clean_overview']].head())

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


First few rows of the processed data:
             original_title main_genre  \
0  The Shawshank Redemption      Drama   
1             The Godfather      Drama   
2     The Godfather Part II      Drama   
3          Schindler's List      Drama   
4              12 Angry Men      Drama   

                                      clean_overview  
0  imprison doubl murder wife lover upstand banke...  
1  span year chronicl fiction italian american co...  
2  continu saga corleon crime famili young vito c...  
3  true stori businessman oskar schindler save th...  
4  defens prosecut rest juri file juri room decid...  


Step 3: Model Training Code     
 We will use TF-IDF to convert text to numbers and SVM for classification, which is a well-performing model for movie genre prediction

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, accuracy_score
import joblib

# 1. Feature Extraction (TF-IDF)
# We convert the text into 5,000 important word features that the computer can understand
tfidf = TfidfVectorizer(max_features=5000)
X = tfidf.fit_transform(df['clean_overview'])
y = df['main_genre']

# 2. Split Dataset (80% Training, 20% Testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 3. Train SVM Model
# SVM performs exceptionally well in short-text classification (such as movie synopses)
model = LinearSVC()
model.fit(X_train, y_train)

# 4. Evaluate Model
y_pred = model.predict(X_test)
print(f"Model Accuracy: {accuracy_score(y_test, y_pred):.2f}")
print("\nDetailed Classification Report:")
print(classification_report(y_test, y_pred))

# 5. Save the Model 
joblib.dump(model, 'movie_genre_model.pkl')
joblib.dump(tfidf, 'tfidf_vectorizer.pkl')
print("Model and Vectorizer saved successfully!")

Model Accuracy: 0.40

Detailed Classification Report:
                 precision    recall  f1-score   support

         Action       0.40      0.46      0.43       246
      Adventure       0.23      0.21      0.22        80
      Animation       0.32      0.31      0.31       108
         Comedy       0.43      0.57      0.49       392
          Crime       0.17      0.14      0.15        74
          Drama       0.43      0.54      0.48       472
         Family       0.29      0.12      0.17        57
        Fantasy       0.22      0.05      0.08        41
        History       0.00      0.00      0.00        19
         Horror       0.54      0.52      0.53       191
          Music       0.00      0.00      0.00        15
        Mystery       0.29      0.05      0.09        39
        Romance       0.27      0.08      0.12        75
Science Fiction       0.18      0.17      0.18        46
       TV Movie       0.00      0.00      0.00         1
       Thriller       0.19      0

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
