In [69]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline


In [70]:
try:
    movie_data = pd.read_csv('IMDb.csv', encoding='utf-8')
except UnicodeDecodeError:
    movie_data = pd.read_csv('IMDb.csv', encoding='ISO-8859-1')  # Try an alternative encoding if utf-8 fails


In [71]:
# Clear out NaN values
movie_data_cleaned = movie_data.dropna()

In [72]:
# Assuming 'Genre', 'Director', 'Actor 1', 'Actor 2', 'Actor 3' are important features, and 'Rating' is the target
features = ['Genre', 'Director', 'Actor 1', 'Actor 2', 'Actor 3']
target = 'Rating'

In [73]:
# Split the cleaned data into features and target variable
X = movie_data_cleaned[features]
y = movie_data_cleaned[target]


In [74]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [77]:

# Define preprocessing steps for categorical variables
categorical_features = ['Genre', 'Director', 'Actor 1', 'Actor 2', 'Actor 3']
numeric_features = []  # Add numeric features if any

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', Pipeline(steps=[
            ('encoder', OneHotEncoder(handle_unknown='ignore'))
        ]), categorical_features),
        # Add numeric transformers if any
    ], remainder='passthrough')  # Include non-categorical features as well



In [78]:
# Build a pipeline with preprocessing and Random Forest Regressor
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor())
])


In [79]:
# Train the model
model.fit(X_train, y_train)


In [80]:
# Make predictions
y_pred = model.predict(X_test)


In [81]:
# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error: {mse}')

Mean Squared Error: 1.6015506627650182


In [82]:
# Function to estimate movie rating
def estimate_movie_rating(model, genre, director, actor1, actor2, actor3):
    # Create a DataFrame with the input data
    movie_data = pd.DataFrame({
        'Genre': [genre],
        'Director': [director],
        'Actor 1': [actor1],
        'Actor 2': [actor2],
        'Actor 3': [actor3]
    })

    # Use the trained model to preprocess and predict the rating
    rating = model.predict(movie_data)
    
    return rating[0]

In [83]:
# Example usage:
estimated_rating = estimate_movie_rating(model, 'Action', 'N.K. Pareek', 'Nirmal Pandey', 'Khushi Rajpoot', 'Raj Singh Verma')
print(f'Estimated Rating: {estimated_rating:.2f}')

Estimated Rating: 5.04
