In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [2]:
# Read the movies CSV (path is relative to the notebook's working directory)
# into a DataFrame and display it.
DATA_FILE = 'movies_model.csv'
movies_df = pd.read_csv(DATA_FILE)
movies_df

Unnamed: 0,rating,genre,score,director,star,budget,gross,company,runtime
0,R,Drama,8.4,Stanley Kubrick,Jack Nicholson,19000000,46998772,Warner Bros.,146
1,R,Adventure,5.8,Randal Kleiser,Brooke Shields,4500000,58853106,Columbia Pictures,104
2,PG,Action,8.7,Irvin Kershner,Mark Hamill,18000000,538375067,Lucasfilm,124
3,PG,Comedy,7.7,Jim Abrahams,Robert Hays,3500000,83453539,Paramount Pictures,88
4,R,Comedy,7.3,Harold Ramis,Chevy Chase,6000000,39846344,Orion Pictures,98
...,...,...,...,...,...,...,...,...,...
5416,R,Action,6.6,Adil El Arbi,Will Smith,90000000,426505244,Columbia Pictures,124
5417,PG,Action,6.5,Jeff Fowler,Ben Schwartz,85000000,319715683,Paramount Pictures,99
5418,PG,Adventure,5.6,Stephen Gaghan,Robert Downey Jr.,175000000,245487753,Universal Pictures,101
5419,PG,Adventure,6.8,Chris Sanders,Harrison Ford,135000000,111105497,20th Century Studios,100


In [4]:
# Regression target: the 'gross' column of the loaded frame.
movies_y = movies_df.loc[:, 'gross']
movies_y

0        46998772
1        58853106
2       538375067
3        83453539
4        39846344
          ...    
5416    426505244
5417    319715683
5418    245487753
5419    111105497
5420    461421559
Name: gross, Length: 5421, dtype: int64

In [5]:
# Minimum number of occurrences a value needs, per column, to keep its own
# category. Values seen fewer times are pooled under a single label so the
# later one-hot encoding does not create a column for every rare value.
RARE_VALUE_THRESHOLDS = {'director': 20, 'star': 35, 'company': 10, 'genre': 10}


def lump_rare(values, min_count, label='other'):
    """Return a copy of ``values`` with infrequent entries replaced by ``label``.

    Parameters
    ----------
    values : pd.Series
        Column whose rare values should be pooled.
    min_count : int
        Entries occurring fewer than ``min_count`` times in ``values`` are
        replaced.
    label : str, default 'other'
        Replacement value for the pooled entries.

    Returns
    -------
    pd.Series
        New Series with the same index as ``values``; the input is not
        modified.
    """
    # Map every entry to the number of times it occurs in the column.
    occurrences = values.map(values.value_counts())
    return values.mask(occurrences < min_count, label)


# Each column is transformed independently from the untouched source frame, so
# re-running this cell always yields the same result.
movies_other = movies_df.assign(**{
    column: lump_rare(movies_df[column], min_count)
    for column, min_count in RARE_VALUE_THRESHOLDS.items()
})
movies_other

Unnamed: 0,rating,genre,score,director,star,budget,gross,company,runtime
0,R,Drama,8.4,other,other,19000000,46998772,Warner Bros.,146
1,R,Adventure,5.8,other,other,4500000,58853106,Columbia Pictures,104
2,PG,Action,8.7,other,other,18000000,538375067,Lucasfilm,124
3,PG,Comedy,7.7,other,other,3500000,83453539,Paramount Pictures,88
4,R,Comedy,7.3,other,other,6000000,39846344,Orion Pictures,98
...,...,...,...,...,...,...,...,...,...
5416,R,Action,6.6,other,other,90000000,426505244,Columbia Pictures,124
5417,PG,Action,6.5,other,other,85000000,319715683,Paramount Pictures,99
5418,PG,Adventure,5.6,other,other,175000000,245487753,Universal Pictures,101
5419,PG,Adventure,6.8,other,other,135000000,111105497,other,100


In [6]:
# Feature frame: every column of the pooled table except the 'gross' target.
movies_x = movies_other.drop(labels=['gross'], axis='columns')
movies_x

Unnamed: 0,rating,genre,score,director,star,budget,company,runtime
0,R,Drama,8.4,other,other,19000000,Warner Bros.,146
1,R,Adventure,5.8,other,other,4500000,Columbia Pictures,104
2,PG,Action,8.7,other,other,18000000,Lucasfilm,124
3,PG,Comedy,7.7,other,other,3500000,Paramount Pictures,88
4,R,Comedy,7.3,other,other,6000000,Orion Pictures,98
...,...,...,...,...,...,...,...,...
5416,R,Action,6.6,other,other,90000000,Columbia Pictures,124
5417,PG,Action,6.5,other,other,85000000,Paramount Pictures,99
5418,PG,Adventure,5.6,other,other,175000000,Universal Pictures,101
5419,PG,Adventure,6.8,other,other,135000000,other,100


In [55]:
# Expand the categorical columns into indicator (dummy) columns; the numeric
# columns are carried over as they are.
movies_x_dummies = pd.get_dummies(data=movies_x)
movies_x_dummies

Unnamed: 0,score,budget,runtime,rating_Approved,rating_G,rating_NC-17,rating_Not Rated,rating_PG,rating_PG-13,rating_R,...,company_The Weinstein Company,company_Touchstone Pictures,company_TriStar Pictures,company_Twentieth Century Fox,company_United Artists,company_Universal Pictures,company_Walt Disney Animation Studios,company_Walt Disney Pictures,company_Warner Bros.,company_other
0,8.4,19000000,146,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,1,0
1,5.8,4500000,104,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,8.7,18000000,124,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,7.7,3500000,88,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,7.3,6000000,98,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5416,6.6,90000000,124,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
5417,6.5,85000000,99,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
5418,5.6,175000000,101,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
5419,6.8,135000000,100,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1


In [56]:
# Split features and target into train and test partitions. No test_size is
# given, so scikit-learn's default proportion applies; the fixed random_state
# makes the partition reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    movies_x_dummies,
    movies_y,
    random_state=1,
)

In [57]:
# Learn the standardization parameters from the training features only, then
# apply that same transformation to both partitions.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [58]:
# Fit a linear regression on the standardized training features; `fit`
# returns the estimator itself, which is then displayed.
model = LinearRegression().fit(X_train_scaled, y_train)
model

LinearRegression()

In [59]:
# `model` has already been fitted on (X_train_scaled, y_train) in the cell
# above; fitting it again on the identical inputs is redundant work, so this
# cell only displays the fitted estimator.
model

LinearRegression()

In [60]:
# R^2 of the fitted model on the standardized test features.
model.score(X=X_test_scaled, y=y_test)

0.5777836952558804

In [None]:
# The model was trained on standardized features, so predictions must be made
# on the test matrix transformed by the same scaler -- not on the raw X_test,
# whose columns are on a completely different scale.
predicted = model.predict(X_test_scaled)

# Score the predictions with MSE, MAE and R2
mse = mean_squared_error(y_test, predicted)
mae = mean_absolute_error(y_test, predicted)
r2 = r2_score(y_test, predicted)

print(f"mean squared error (MSE): {mse}")
print(f"mean absolute error (MAE): {mae}")
print(f"R-squared (R2): {r2}")
