<a href="https://colab.research.google.com/github/MAHIN-098/Task-2-Movie-Rating-Prediction/blob/main/P2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

import io

from google.colab import files

# Name the dataset is expected to be uploaded under.
DATA_FILENAME = 'P2-data.csv'

# Opens the Colab file picker; returns a dict of {uploaded file name: bytes}.
uploaded = files.upload()

# Colab never overwrites an existing file: a repeated upload is written to
# disk under a new name such as 'P2-data (5).csv', so reading
# 'P2-data.csv' from disk would silently load an older copy. Parse the bytes
# that were just uploaded instead, and only fall back to the file on disk
# when nothing with the expected name was uploaded in this run.
if DATA_FILENAME in uploaded:
    data = pd.read_csv(io.BytesIO(uploaded[DATA_FILENAME]))
else:
    data = pd.read_csv(DATA_FILENAME)
print(data.head())


# --- Target and raw feature frame -------------------------------------------
# Rows without a rating cannot be used for supervised learning. The index is
# reset so that `data`, `X` and `y` all share one contiguous index.
data = data.dropna(subset=['Rating']).reset_index(drop=True)

y = data['Rating']
X = data.drop(['Rating', 'Name'], axis=1)

# --- Clean the text-encoded numeric columns ---------------------------------
# `astype(str)` lets the `.str` accessor work whether pandas parsed a column
# as text or as numbers; missing values simply fail to match and become NaN.

# The loaded data shows years as negative numbers (e.g. -2019.0); keep only
# the four-digit year.
X['Year'] = X['Year'].astype(str).str.extract(r'(\d{4})', expand=False).astype(float)

# Strip thousands separators and the '$' / 'M' decorations. The character
# class treats '$' literally regardless of the pandas version's `regex`
# default. NOTE(review): values such as '$5.16M' become 5.16, as before --
# confirm whether such rows are real vote counts.
X['Votes'] = pd.to_numeric(
    X['Votes'].astype(str).str.replace(r'[,$M]', '', regex=True),
    errors='coerce',
)

# '109 min' -> 109.0 (raw string avoids the invalid '\d' escape warning;
# expand=False returns a Series rather than a one-column DataFrame).
X['Duration'] = X['Duration'].astype(str).str.extract(r'(\d+)', expand=False).astype(float)

# --- Train / test split ------------------------------------------------------
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train = X_train.copy()
X_test = X_test.copy()

# --- Director_Success_Rate (mean rating per director) ------------------------
# This feature is derived from the target, so it must be computed from
# training ratings only. Computing it over the full dataset leaks test labels
# into the features and inflates the reported metrics.
global_mean_rating = y_train.mean()

director_success_rate = y_train.groupby(X_train['Director']).mean().reset_index()
director_success_rate.columns = ['Director', 'Director_Success_Rate']
director_rate_lookup = director_success_rate.set_index('Director')['Director_Success_Rate']

# Test rows: use the training-set rate; directors unseen in training (or
# missing) fall back to the overall training mean.
X_test['Director_Success_Rate'] = (
    X_test['Director'].map(director_rate_lookup).fillna(global_mean_rating).astype(float)
)

# Training rows: use out-of-fold rates so that a row's own rating never
# contributes to its own feature value. `X_train` is already shuffled by
# train_test_split, so a positional modulo gives a deterministic fold split.
N_ENCODING_FOLDS = 5
fold_ids = pd.Series(range(len(X_train)), index=X_train.index) % N_ENCODING_FOLDS
out_of_fold_rates = []
for fold in range(N_ENCODING_FOLDS):
    held_out = fold_ids == fold
    fold_means = y_train[~held_out].groupby(X_train.loc[~held_out, 'Director']).mean()
    out_of_fold_rates.append(X_train.loc[held_out, 'Director'].map(fold_means))
X_train['Director_Success_Rate'] = (
    pd.concat(out_of_fold_rates)
    .reindex(X_train.index)
    .fillna(global_mean_rating)
    .astype(float)
)

# Keep the column available on the full frames as well (aligned by index).
X['Director_Success_Rate'] = pd.concat(
    [X_train['Director_Success_Rate'], X_test['Director_Success_Rate']]
)
data['Director_Success_Rate'] = X['Director_Success_Rate']

# Columns handled by each preprocessing branch.
numeric_features = ['Year', 'Duration', 'Votes', 'Director_Success_Rate']
categorical_features = ['Genre', 'Director', 'Actor 1', 'Actor 2', 'Actor 3']

# Numeric columns: fill missing values with the column mean.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
])

# Categorical columns: fill missing values with the most frequent category,
# then one-hot encode. Categories not seen during fit are ignored at predict
# time (encoded as all zeros) instead of raising.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group through its transformer; columns not listed above
# are dropped (ColumnTransformer's default `remainder`).
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features),
    ])

# Full pipeline: preprocessing followed by the regressor. The forest is
# seeded (matching the seed used for the train/test split) so that repeated
# runs report the same metrics.
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(random_state=42))
])

# Fit the pipeline on the training split and predict the held-out ratings in
# one chained call (`fit` returns the fitted pipeline itself).
y_pred = model.fit(X_train, y_train).predict(X_test)

# Report error and goodness-of-fit on the held-out split.
print(f'MAE: {mean_absolute_error(y_test, y_pred)}')
print(f'MSE: {mean_squared_error(y_test, y_pred)}')
print(f'R^2: {r2_score(y_test, y_pred)}')

Saving P2-data.csv to P2-data (5).csv
                                 Name    Year Duration            Genre  \
0                                         NaN      NaN            Drama   
1  #Gadhvi (He thought he was Gandhi) -2019.0  109 min            Drama   
2                         #Homecoming -2021.0   90 min   Drama, Musical   
3                             #Yaaram -2019.0  110 min  Comedy, Romance   
4                   ...And Once Again -2010.0  105 min            Drama   

   Rating Votes            Director       Actor 1             Actor 2  \
0     NaN   NaN       J.S. Randhawa      Manmauji              Birbal   
1     7.0     8       Gaurav Bakshi  Rasika Dugal      Vivek Ghamande   
2     NaN   NaN  Soumyajit Majumdar  Sayani Gupta   Plabita Borthakur   
3     4.4    35          Ovais Khan       Prateik          Ishita Raj   
4     NaN   NaN        Amol Palekar  Rajat Kapoor  Rituparna Sengupta   

           Actor 3  
0  Rajendra Bhatia  
1    Arvind Jangid  
2       R