## IMDB Regression

In [18]:
import pandas as pd
import numpy as np
import patsy
import statsmodels.api as sm
import statsmodels.formula.api as smf
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import Lasso,Ridge

In [19]:
# X ve Y değişkenlerimizi oluşturma
df_combined = pd.read_csv("imdb_scraped_data_all.csv")
X = df_combined.loc[:,["Year", "Runtime", "Gross_US_Canada", "Votes", "Metascore", "estimated_revenue", "Budget"]]
y = df_combined["Rating"]


# Train/Test Ayrımı
X_train, x_test, Y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train/Validation Ayrımı
x_train, x_val, y_train, y_val = train_test_split(X_train, Y_train, test_size=0.25, random_state=42)

In [20]:
# X ve Y değişkenlerimizi oluşturma
df_combined_test = pd.read_csv("imdb_scraped_data_all_test.csv")
X2 = df_combined_test.loc[:,["Runtime", "Budget", "VotesLog", "Metascore", "estimated_revenue_log", "Estimated_Worldwide_GrossLog", "Gross_US_Canada", "Score"]]
y2 = df_combined_test["Rating"]


# Train/Test Ayrımı
X_train2, x_test2, Y_train2, y_test2 = train_test_split(X2, y2, test_size=0.2, random_state=42)

# Train/Validation Ayrımı
x_train2, x_val2, y_train2, y_val2 = train_test_split(X_train2, Y_train2, test_size=0.25, random_state=42)

In [21]:
lreg2 = LinearRegression()

lreg2.fit(x_train2, y_train2)

pred2 = lreg2.predict(x_val2)

# MSE Hesabı
mse2 = np.mean((pred2 - y_val2)**2)
print("MSE: ", mse2)

# R2 Skor
print("R2 Score: ", lreg2.score(x_val2, y_val2))

MSE:  7.946249549209668e-24
R2 Score:  1.0


In [32]:
# Modeli Oluşturma
#y = ax + b formatinda constant eksik
x_train2 = sm.add_constant(x_train2)
model_test = sm.OLS(y_train2, x_train2)

# Modeli Eğitme
fit_test = model_test.fit()

# Oluşturduğumuz modele ilişkin özet tablosunu görüntüleme
fit_test.summary()

0,1,2,3
Dep. Variable:,Rating,R-squared:,0.651
Model:,OLS,Adj. R-squared:,0.651
Method:,Least Squares,F-statistic:,1084.0
Date:,"Wed, 17 May 2023",Prob (F-statistic):,0.0
Time:,22:21:10,Log-Likelihood:,-2425.1
No. Observations:,2910,AIC:,4862.0
Df Residuals:,2904,BIC:,4898.0
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,14.8575,1.547,9.607,0.000,11.825,17.890
Year,-0.0055,0.001,-7.179,0.000,-0.007,-0.004
Runtime,0.0068,0.001,11.814,0.000,0.006,0.008
Gross_US_Canada,-9.929e-10,1.75e-10,-5.665,0.000,-1.34e-09,-6.49e-10
Votes,1.185e-06,6.16e-08,19.245,0.000,1.06e-06,1.31e-06
Metascore,0.0325,0.001,49.691,0.000,0.031,0.034

0,1,2,3
Omnibus:,389.487,Durbin-Watson:,1.999
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1002.743
Skew:,-0.743,Prob(JB):,1.81e-218
Kurtosis:,5.462,Cond. No.,13800000000.0


In [22]:
print('X Train:', x_train.shape)
print('X Validation:', x_val.shape)
print('X test:', x_test.shape)

X Train: (2910, 7)
X Validation: (971, 7)
X test: (971, 7)


In [23]:
lreg = LinearRegression()

lreg.fit(x_train, y_train)

pred = lreg.predict(x_val)

# MSE Hesabı
mse = np.mean((pred - y_val)**2)
print("MSE: ", mse)

# R2 Skor
print("R2 Score: ", lreg.score(x_val, y_val))

MSE:  0.29278010576269936
R2 Score:  0.6427657110147325


In [24]:
# Modeli Oluşturma
#y = ax + b formatinda constant eksik
x_train = sm.add_constant(x_train)
model = sm.OLS(y_train, x_train)

# Modeli Eğitme
fit = model.fit()

# Oluşturduğumuz modele ilişkin özet tablosunu görüntüleme
fit.summary()

0,1,2,3
Dep. Variable:,Rating,R-squared:,0.655
Model:,OLS,Adj. R-squared:,0.654
Method:,Least Squares,F-statistic:,918.8
Date:,"Wed, 17 May 2023",Prob (F-statistic):,0.0
Time:,22:20:18,Log-Likelihood:,-2408.9
No. Observations:,2910,AIC:,4832.0
Df Residuals:,2903,BIC:,4874.0
Df Model:,6,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,12.1714,1.609,7.567,0.000,9.017,15.326
Year,-0.0042,0.001,-5.233,0.000,-0.006,-0.003
Runtime,0.0077,0.001,12.931,0.000,0.007,0.009
Gross_US_Canada,-9.439e-06,1.25e-06,-7.567,0.000,-1.19e-05,-6.99e-06
Votes,1.184e-06,6.13e-08,19.325,0.000,1.06e-06,1.3e-06
Metascore,0.0318,0.001,48.142,0.000,0.031,0.033
estimated_revenue,3.775e-06,4.99e-07,7.567,0.000,2.8e-06,4.75e-06
Budget,3.774e-06,4.99e-07,7.562,0.000,2.8e-06,4.75e-06

0,1,2,3
Omnibus:,416.77,Durbin-Watson:,1.995
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1069.869
Skew:,-0.792,Prob(JB):,4.8e-233
Kurtosis:,5.513,Cond. No.,7160000000000000.0


In [25]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Verilerinizi eğitim, doğrulama ve test kümelerine ayırın
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=42)

# Modelinizi oluşturun
model = LinearRegression()

# Modeli eğitin
model.fit(X_train, y_train)

# Doğrulama verilerini kullanarak model performansını değerlendirin
y_val_pred = model.predict(X_val)
mse = mean_squared_error(y_val, y_val_pred)
r2 = r2_score(y_val, y_val_pred)

print("MSE:", mse)
print("R2 Score:", r2)

# Test verilerini kullanarak model performansını test edin
y_test_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_test_pred)
r2 = r2_score(y_test, y_test_pred)

print("MSE:", mse)
print("R2 Score:", r2)

MSE: 0.29278010576269936
R2 Score: 0.6427657110147325
MSE: 0.3028277906409138
R2 Score: 0.6674207938332202


In [26]:
import statsmodels.api as sm
import statsmodels.formula.api as smf

# Modeli Oluşturma
x_train = sm.add_constant(x_train)
model = sm.OLS(y_train, x_train)

# Modeli Eğitme
fit = model.fit()

# Oluşturduğumuz modele ilişkin özet tablosunu görüntüleme
fit.summary()

0,1,2,3
Dep. Variable:,Rating,R-squared:,0.655
Model:,OLS,Adj. R-squared:,0.654
Method:,Least Squares,F-statistic:,918.8
Date:,"Wed, 17 May 2023",Prob (F-statistic):,0.0
Time:,22:20:18,Log-Likelihood:,-2408.9
No. Observations:,2910,AIC:,4832.0
Df Residuals:,2903,BIC:,4874.0
Df Model:,6,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,12.1714,1.609,7.567,0.000,9.017,15.326
Year,-0.0042,0.001,-5.233,0.000,-0.006,-0.003
Runtime,0.0077,0.001,12.931,0.000,0.007,0.009
Gross_US_Canada,-9.439e-06,1.25e-06,-7.567,0.000,-1.19e-05,-6.99e-06
Votes,1.184e-06,6.13e-08,19.325,0.000,1.06e-06,1.3e-06
Metascore,0.0318,0.001,48.142,0.000,0.031,0.033
estimated_revenue,3.775e-06,4.99e-07,7.567,0.000,2.8e-06,4.75e-06
Budget,3.774e-06,4.99e-07,7.562,0.000,2.8e-06,4.75e-06

0,1,2,3
Omnibus:,416.77,Durbin-Watson:,1.995
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1069.869
Skew:,-0.792,Prob(JB):,4.8e-233
Kurtosis:,5.513,Cond. No.,7160000000000000.0


In [27]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

# X ve Y değişkenlerimizi oluşturma
X1 = df_combined.loc[:,["Year", "Runtime", "Gross_US_Canada", "Votes", "Metascore"]]
y1 = df_combined["Rating"]

# Train/Test Ayrımı
X_train2, x_test2, Y_train2, y_test2 = train_test_split(X1, y1, test_size=0.2, random_state=42)

# Train/Validation Ayrımı
x_train2, x_val2, y_train2, y_val2 = train_test_split(X_train2, Y_train2, test_size=0.25, random_state=42)

print('X Train:', x_train2.shape)
print('X Validation:', x_val2.shape)
print('X test:', x_test2.shape)

X Train: (2910, 5)
X Validation: (971, 5)
X test: (971, 5)


In [28]:
import pandas as pd
import numpy as np
# Modeli Oluşturma
lreg = LinearRegression()

lreg.fit(x_train2,y_train2)

pred = lreg.predict(x_val2)

# MSE Hesabı
mse = np.mean((pred - y_val2)**2)
print("MSE: ", mse)

# R2 Skor
print("R2 Score: ", lreg.score(x_val2, y_val2))

MSE:  0.2950429863390312
R2 Score:  0.6400046677681681


In [29]:
import statsmodels.api as sm
import statsmodels.formula.api as smf

# Modeli Oluşturma
x_train2 = sm.add_constant(x_train2)
model2 = sm.OLS(y_train2, x_train2)

# Modeli Eğitme
fit2 = model2.fit()

# Oluşturduğumuz modele ilişkin özet tablosunu görüntüleme
fit2.summary()

0,1,2,3
Dep. Variable:,Rating,R-squared:,0.651
Model:,OLS,Adj. R-squared:,0.651
Method:,Least Squares,F-statistic:,1084.0
Date:,"Wed, 17 May 2023",Prob (F-statistic):,0.0
Time:,22:20:18,Log-Likelihood:,-2425.1
No. Observations:,2910,AIC:,4862.0
Df Residuals:,2904,BIC:,4898.0
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,14.8575,1.547,9.607,0.000,11.825,17.890
Year,-0.0055,0.001,-7.179,0.000,-0.007,-0.004
Runtime,0.0068,0.001,11.814,0.000,0.006,0.008
Gross_US_Canada,-9.929e-10,1.75e-10,-5.665,0.000,-1.34e-09,-6.49e-10
Votes,1.185e-06,6.16e-08,19.245,0.000,1.06e-06,1.31e-06
Metascore,0.0325,0.001,49.691,0.000,0.031,0.034

0,1,2,3
Omnibus:,389.487,Durbin-Watson:,1.999
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1002.743
Skew:,-0.743,Prob(JB):,1.81e-218
Kurtosis:,5.462,Cond. No.,13800000000.0


In [30]:
y, X = patsy.dmatrices('Rating ~ Runtime + Budget + Gross_US_Canada + Votes + Metascore + Estimated_Worldwide_Gross + estimated_revenue', data=df_combined, return_type="dataframe")

model = sm.OLS(y, X)

fit3 = model.fit()

fit3.summary()

0,1,2,3
Dep. Variable:,Rating,R-squared:,0.652
Model:,OLS,Adj. R-squared:,0.652
Method:,Least Squares,F-statistic:,1815.0
Date:,"Wed, 17 May 2023",Prob (F-statistic):,0.0
Time:,22:20:18,Log-Likelihood:,-4012.6
No. Observations:,4852,AIC:,8037.0
Df Residuals:,4846,BIC:,8076.0
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,3.7470,0.050,74.542,0.000,3.649,3.846
Runtime,0.0083,0.000,17.697,0.000,0.007,0.009
Budget,5.519e-07,7.43e-09,74.299,0.000,5.37e-07,5.66e-07
Gross_US_Canada,2.115e-07,2.84e-09,74.433,0.000,2.06e-07,2.17e-07
Votes,1.169e-06,4.81e-08,24.298,0.000,1.07e-06,1.26e-06
Metascore,0.0323,0.001,64.036,0.000,0.031,0.033
Estimated_Worldwide_Gross,-6.39e-07,8.56e-09,-74.620,0.000,-6.56e-07,-6.22e-07
estimated_revenue,5.544e-07,7.43e-09,74.656,0.000,5.4e-07,5.69e-07

0,1,2,3
Omnibus:,602.181,Durbin-Watson:,1.889
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1428.967
Skew:,-0.724,Prob(JB):,5.05e-311
Kurtosis:,5.229,Cond. No.,2.23e+16


In [31]:
model3_smf = smf.ols('Rating ~ Runtime + Budget + Gross_US_Canada + Votes + Metascore + Estimated_Worldwide_Gross + estimated_revenue + log_votes', data=df_combined)

fit3_smf = model3_smf.fit()

fit3_smf.summary()

PatsyError: Error evaluating factor: NameError: name 'log_votes' is not defined
    Rating ~ Runtime + Budget + Gross_US_Canada + Votes + Metascore + Estimated_Worldwide_Gross + estimated_revenue + log_votes
                                                                                                                      ^^^^^^^^^