# Movie genre prediction from synopsis text

In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

In [7]:
# Step 1: Read the movie dataset from the CSV file into a DataFrame
data = pd.read_csv("MOVIE DATASET.csv")

# Step 2: Show the first rows as a quick sanity check of the columns
first_rows = data.head()
print(first_rows)

      id                                         movie_name  \
0  44978                                           Super Me   
1  50185                                     Entity Project   
2  34131  Behavioral Family Therapy for Serious Psychiat...   
3  78522                                      Blood Glacier   
4   2206                                      Apat na anino   

                                            synopsis    genre  
0  A young scriptwriter starts bringing valuable ...  fantasy  
1  A director and her friends renting a haunted h...   horror  
2  This is an educational video for families and ...   family  
3  Scientists working in the Austrian Alps discov...    scifi  
4  Buy Day - Four Men Widely - Apart in Life - By...   action  


In [8]:
# Report how many missing values each column contains
print("\nMissing values:")
missing_per_column = data.isnull().sum()
print(missing_per_column)

# Remove any row that has a missing value (optional step; modifies `data` in place)
data.dropna(inplace=True)

# Step 3: Text preprocessing (synopsis)
# TF-IDF turns each synopsis into a numeric feature vector. English stop words
# are removed and the vocabulary is capped at 5000 terms.
tfidf = TfidfVectorizer(max_features=5000, stop_words='english')
synopsis_text = data['synopsis']
X = tfidf.fit_transform(synopsis_text)

# Step 4: Encode the target column 'genre' as integer class ids.
# The fitted encoder is kept so predictions can be mapped back to genre names.
label_encoder = LabelEncoder()
genre_labels = data['genre']
y = label_encoder.fit_transform(genre_labels)


Missing values:
id            0
movie_name    0
synopsis      0
genre         0
dtype: int64


In [11]:
# Step 5: Split the TF-IDF features and encoded labels into train and test sets.
# NOTE(review): no visible cell creates X_train/X_test/y_train/y_test, so this
# cell failed on a fresh kernel. The original split arguments are unknown;
# test_size=0.3 and random_state=42 are assumptions -- confirm them against the
# recorded outputs (the classification report lists 16,200 test rows).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Step 6: Train a random forest classifier on the training split.
# The estimator must be bound to `model`: the fit/predict calls below and the
# sample-prediction cell all refer to it by that name.
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Step 7: Make predictions on the test set
y_pred = model.predict(X_test)

# Step 8: Evaluate the model (fraction of test rows whose genre was predicted correctly)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")

Accuracy: 0.2735


In [12]:
# Per-genre precision / recall / F1 on the test set, labelled with the
# original genre names recovered from the fitted label encoder
print("\nClassification Report:")
genre_names = label_encoder.classes_
report = classification_report(y_test, y_pred, target_names=genre_names)
print(report)

# Step 9: Predict the genre of a new movie (example)
# The synopsis goes through the already-fitted TF-IDF vectorizer, so it is
# projected onto the same vocabulary the model was trained on.
sample_movie = ["A thrilling action movie with lots of fight scenes and stunts."]
sample_tfidf = tfidf.transform(sample_movie)
predicted_genre = model.predict(sample_tfidf)

# Map the predicted integer class id back to its genre name before printing
predicted_label = label_encoder.inverse_transform(predicted_genre)
print(f"\nPredicted Genre: {predicted_label}")


Classification Report:
              precision    recall  f1-score   support

      action       0.21      0.16      0.19      1643
   adventure       0.19      0.17      0.18      1597
       crime       0.29      0.32      0.31      1672
      family       0.29      0.38      0.33      1589
     fantasy       0.19      0.14      0.16      1613
      horror       0.32      0.34      0.33      1656
     mystery       0.20      0.22      0.21      1578
     romance       0.38      0.53      0.44      1594
       scifi       0.34      0.40      0.37      1612
    thriller       0.15      0.08      0.10      1646

    accuracy                           0.27     16200
   macro avg       0.26      0.27      0.26     16200
weighted avg       0.26      0.27      0.26     16200


Predicted Genre: ['action']
