In [29]:
import pandas as pd 
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler

In [2]:
df = pd.read_csv(r"A:\CU\PythonProjects\Pro1\IMDb Movies India.csv",encoding='latin')

In [3]:
df.head()

Unnamed: 0,Name,Year,Duration,Genre,Rating,Votes,Director,Actor 1,Actor 2,Actor 3
0,,,,Drama,,,J.S. Randhawa,Manmauji,Birbal,Rajendra Bhatia
1,#Gadhvi (He thought he was Gandhi),(2019),109 min,Drama,7.0,8.0,Gaurav Bakshi,Rasika Dugal,Vivek Ghamande,Arvind Jangid
2,#Homecoming,(2021),90 min,"Drama, Musical",,,Soumyajit Majumdar,Sayani Gupta,Plabita Borthakur,Roy Angana
3,#Yaaram,(2019),110 min,"Comedy, Romance",4.4,35.0,Ovais Khan,Prateik,Ishita Raj,Siddhant Kapoor
4,...And Once Again,(2010),105 min,Drama,,,Amol Palekar,Rajat Kapoor,Rituparna Sengupta,Antara Mali


In [4]:
df.shape

(15509, 10)

In [5]:
df.describe()

Unnamed: 0,Rating
count,7919.0
mean,5.841621
std,1.381777
min,1.1
25%,4.9
50%,6.0
75%,6.8
max,10.0


In [6]:
df.isnull().sum().sum()

33523

In [7]:
df.dropna(inplace=True)

In [8]:
df['Year']=df['Year'].str.extract(r'([0-9].{0,3})',expand=False)

In [9]:
df['Duration'] = df['Duration'].str.extract(r'([0-9]+)', expand=False)

In [10]:
df['Duration']=pd.to_numeric(df['Duration'])
df['Year']=pd.to_numeric(df['Year'])

In [11]:
df['Votes']=df['Votes'].str.extract(r'([0-9]+)',expand=False)
df['Votes']=pd.to_numeric(df['Votes'])

In [12]:
X = df[['Year', 'Actor 1', 'Actor 2', 'Duration', 'Genre', 'Votes', 'Director']]
y = df['Rating']
print(X)

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), ['Actor 1', 'Actor 2', 'Genre', 'Director']),
        ('num', Pipeline(steps=[
            ('scaler',StandardScaler())
        ]), ['Year', 'Duration', 'Votes'])

    ])

model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

       Year          Actor 1                 Actor 2  Duration  \
1      2019     Rasika Dugal          Vivek Ghamande       109   
3      2019          Prateik              Ishita Raj       110   
5      1997       Bobby Deol  Aishwarya Rai Bachchan       147   
6      2005  Jimmy Sheirgill          Minissha Lamba       142   
8      2012        Yash Dave          Muntazir Ahmad        82   
...     ...              ...                     ...       ...   
15493  2015    Vicky Kaushal         Sarah Jane Dias       115   
15494  2001   Karisma Kapoor                   Rekha       153   
15503  1989      Chiranjeevi              Jayamalini       125   
15505  1999     Akshay Kumar          Twinkle Khanna       129   
15508  1998       Dharmendra              Jaya Prada       130   

                           Genre  Votes         Director  
1                          Drama      8    Gaurav Bakshi  
3                Comedy, Romance     35       Ovais Khan  
5         Comedy, Drama, Music

In [24]:
df.head()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model.fit(X_train, y_train)
ypred=model.predict(X_test)

In [30]:
MSE=mean_squared_error(y_test,ypred)
print(MSE)

4.693573540345248


In [43]:
import joblib
joblib.dump(model,"model.jb")

['model.jb']