### Import Libraries

In [26]:
import numpy as np
import pandas as pd

from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR


### Load Dataset

In [27]:
# Read the IMDb top-1000 CSV (path is relative to the working directory).
df = pd.read_csv(filepath_or_buffer="imdb_top_1000.csv")

# Show the first five rows as a quick sanity check of the load.
df.head(5)


Unnamed: 0,Poster_Link,Series_Title,Released_Year,Certificate,Runtime,Genre,IMDB_Rating,Overview,Meta_score,Director,Star1,Star2,Star3,Star4,No_of_Votes,Gross
0,https://m.media-amazon.com/images/M/MV5BMDFkYT...,The Shawshank Redemption,1994,A,142 min,Drama,9.3,Two imprisoned men bond over a number of years...,80.0,Frank Darabont,Tim Robbins,Morgan Freeman,Bob Gunton,William Sadler,2343110,28341469
1,https://m.media-amazon.com/images/M/MV5BM2MyNj...,The Godfather,1972,A,175 min,"Crime, Drama",9.2,An organized crime dynasty's aging patriarch t...,100.0,Francis Ford Coppola,Marlon Brando,Al Pacino,James Caan,Diane Keaton,1620367,134966411
2,https://m.media-amazon.com/images/M/MV5BMTMxNT...,The Dark Knight,2008,UA,152 min,"Action, Crime, Drama",9.0,When the menace known as the Joker wreaks havo...,84.0,Christopher Nolan,Christian Bale,Heath Ledger,Aaron Eckhart,Michael Caine,2303232,534858444
3,https://m.media-amazon.com/images/M/MV5BMWMwMG...,The Godfather: Part II,1974,A,202 min,"Crime, Drama",9.0,The early life and career of Vito Corleone in ...,90.0,Francis Ford Coppola,Al Pacino,Robert De Niro,Robert Duvall,Diane Keaton,1129952,57300000
4,https://m.media-amazon.com/images/M/MV5BMWU4N2...,12 Angry Men,1957,U,96 min,"Crime, Drama",9.0,A jury holdout attempts to prevent a miscarria...,96.0,Sidney Lumet,Henry Fonda,Lee J. Cobb,Martin Balsam,John Fiedler,689845,4360000


### Data cleaning

In [28]:
# List the column labels of the loaded frame before selecting a subset.
df.columns

Index(['Poster_Link', 'Series_Title', 'Released_Year', 'Certificate',
       'Runtime', 'Genre', 'IMDB_Rating', 'Overview', 'Meta_score', 'Director',
       'Star1', 'Star2', 'Star3', 'Star4', 'No_of_Votes', 'Gross'],
      dtype='object')

In [29]:
# Keep only the columns used for modelling: four predictors plus the
# IMDB_Rating target. The explicit .copy() gives the following cells an
# independent frame to modify, so renaming columns and assigning cleaned
# values does not raise SettingWithCopyWarning or depend on whether the
# selection shares data with the originally loaded frame.
df = df[['Genre',
         'Runtime',
         'No_of_Votes',
         'Gross',
         'IMDB_Rating']].copy()


In [30]:
# Switch to the short snake_case column names used by every later cell.
# rename() returns a new frame that is bound back to `df` instead of mutating
# the existing object with inplace=True, which keeps the cell a plain
# "input -> output" step and avoids modifying a frame derived by selection.
df = df.rename(columns={
    'Genre': 'genre',
    'Runtime': 'runtime',
    'No_of_Votes': 'vote_count',
    'Gross': 'gross',
    'IMDB_Rating': 'rating'
})


#### Clean runtime column

In [31]:
# Runtime arrives as text such as "142 min"; strip the unit and store the
# number of minutes as an integer.
# - astype(str) makes the cell safe to re-run: once the column is already
#   numeric, the .str accessor would otherwise raise AttributeError.
# - regex=False treats ' min' as literal text regardless of the pandas
#   version's default for str.replace.
df['runtime'] = (
    df['runtime']
    .astype(str)
    .str.replace(' min', '', regex=False)
    .astype(int)
)


#### Clean gross column

In [32]:
# Drop every row that has a missing value in any remaining column (this is
# what removes the rows without a Gross figure). All columns are checked, not
# only gross, because the models below need NaN-free features.
# .copy() gives an independent frame so the assignment below is unambiguous.
df = df.dropna().copy()

# Remove the thousands separators and convert to integers.
# astype(str) keeps the cell re-runnable after gross is already numeric, and
# regex=False makes the comma a literal match on every pandas version.
df['gross'] = (
    df['gross']
    .astype(str)
    .str.replace(',', '', regex=False)
    .astype(int)
)


#### Genre fix (keep only the first genre of each comma-separated list)

In [33]:
# Reduce multi-genre strings such as "Crime, Drama" to their first entry.
# The vectorized .str accessor replaces the row-wise apply + lambda; n=1
# stops splitting after the first comma because only the leading token is
# kept.
df['genre'] = df['genre'].str.split(',', n=1).str[0]


##### Encode

In [34]:
# Map each genre name to an integer code. The fitted encoder is kept in `le`
# so the codes can be translated back to names later.
# NOTE(review): integer codes impose an arbitrary ordering on genres, which
# linear and kernel models treat as numeric distance - consider one-hot
# encoding if that matters for the analysis.
le = LabelEncoder().fit(df['genre'])
df['genre'] = le.transform(df['genre'])



#### Dataset Check

In [35]:
# Print the schema summary first, then leave head() as the cell's last
# expression so the preview is actually rendered. With head() placed before
# info(), its result was silently discarded.
df.info()
df.head()


<class 'pandas.core.frame.DataFrame'>
Index: 831 entries, 0 to 997
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   genre       831 non-null    int64  
 1   runtime     831 non-null    int64  
 2   vote_count  831 non-null    int64  
 3   gross       831 non-null    int64  
 4   rating      831 non-null    float64
dtypes: float64(1), int64(4)
memory usage: 39.0 KB


### Feature Split

In [36]:
# Target: the rating column. Features: every other column of the cleaned frame.
y = df['rating']
X = df.drop(columns='rating')

### Train - test split

In [37]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split identical across runs. train_test_split is imported with the other
# libraries in the notebook's import cell.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

### Linear Regression

In [38]:
# Baseline model: fit a linear regression on the unscaled training features
# and predict ratings for the held-out rows.
lr = LinearRegression().fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)

# Score the predictions against the true test ratings.
lr_mae = mean_absolute_error(y_test, y_pred_lr)
lr_rmse = np.sqrt(mean_squared_error(y_test, y_pred_lr))
lr_r2 = r2_score(y_test, y_pred_lr)

print("Linear Regression Results")
print("MAE :", lr_mae)
print("RMSE:", lr_rmse)
print("R2  :", lr_r2)


Linear Regression Results
MAE : 0.19556637056494486
RMSE: 0.2331730832860218
R2  : 0.3894881005992281


### SVR 

In [39]:
# Learn the scaling statistics from the training split only, then apply the
# same fitted transform to both splits so no test information leaks into it.
scaler = StandardScaler().fit(X_train)

X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [40]:
# RBF-kernel support vector regressor, trained and evaluated on the
# standardized feature matrices.
svr = SVR(kernel='rbf', C=10, gamma='scale').fit(X_train_scaled, y_train)
y_pred_svr = svr.predict(X_test_scaled)

# Score the predictions against the true test ratings.
svr_mae = mean_absolute_error(y_test, y_pred_svr)
svr_rmse = np.sqrt(mean_squared_error(y_test, y_pred_svr))
svr_r2 = r2_score(y_test, y_pred_svr)

print("SVR Results")
print("MAE :", svr_mae)
print("RMSE:", svr_rmse)
print("R2  :", svr_r2)


SVR Results
MAE : 0.1811776886896185
RMSE: 0.2226242205459524
R2  : 0.4434782706601821


### Random Forest

In [41]:
# Random forest of 100 trees with a fixed seed, trained on the unscaled
# training features.
rf = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)

# Score the predictions against the true test ratings.
rf_mae = mean_absolute_error(y_test, y_pred_rf)
rf_rmse = np.sqrt(mean_squared_error(y_test, y_pred_rf))
rf_r2 = r2_score(y_test, y_pred_rf)

print("Random Forest Results")
print("MAE :", rf_mae)
print("RMSE:", rf_rmse)
print("R2  :", rf_r2)


Random Forest Results
MAE : 0.17758083832335406
RMSE: 0.21694788137599574
R2  : 0.47149618227790846


### Model Comparison

In [42]:
# Pair each model's display name with its held-out predictions, in the order
# the rows should appear in the comparison table.
predictions_by_model = {
    "Linear Regression": y_pred_lr,
    "SVR": y_pred_svr,
    "Random Forest": y_pred_rf,
}

# One record per model; the column order follows the key order of each record.
results = pd.DataFrame(
    [
        {
            "Model": model_name,
            "MAE": mean_absolute_error(y_test, y_pred),
            "RMSE": np.sqrt(mean_squared_error(y_test, y_pred)),
            "R2 Score": r2_score(y_test, y_pred),
        }
        for model_name, y_pred in predictions_by_model.items()
    ]
)

results


Unnamed: 0,Model,MAE,RMSE,R2 Score
0,Linear Regression,0.195566,0.233173,0.389488
1,SVR,0.181178,0.222624,0.443478
2,Random Forest,0.177581,0.216948,0.471496


### Final Conclusion

The performance of different regression models was evaluated using MAE, RMSE, and R² score.
Linear Regression provided baseline results.
SVR improved prediction accuracy by handling non-linear patterns.
Random Forest Regressor achieved the best performance due to its ensemble learning capability and robustness.