In [None]:
import csv
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.api as sm
from sklearn import linear_model, metrics
from sklearn.linear_model import LinearRegression, LassoCV, Lasso, RidgeCV, Ridge
from sklearn.model_selection import cross_val_score, GridSearchCV, KFold
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, make_scorer, r2_score

import myfunc

%matplotlib inline

# Feature importance

In [None]:
# Prepares movie data.
# Builds `movie_set` by calling the project-local helper `myfunc.make_movie_set()`.
# NOTE(review): the following cell reassigns `movie_set` from a CSV file, so on a
# top-to-bottom run this result is overwritten — run one cell or the other.

movie_set = myfunc.make_movie_set()

In [None]:
# Alternate to preparing movie data - loads prepared movie data.
# Reads the frame from a path relative to the notebook's working directory and
# rebinds `movie_set`, replacing whatever the previous cell produced.
# NOTE(review): presumably this CSV is a saved copy of make_movie_set()'s output — confirm.

movie_set = pd.read_csv('data/reg_movie_set.csv')

In [None]:
# Sets up the train / validation / test split.
# 'Gross' is the regression target; every remaining column is a predictor.
y = movie_set['Gross']
X = movie_set.drop(columns='Gross')

# Hold out 20% of the rows as the test set, then split 20% of what is left off
# as the validation set. Both splits are seeded so they are repeatable.
x_train_val, x_test, y_train_val, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)
x_train, x_val, y_train, y_val = train_test_split(
    x_train_val, y_train_val, test_size=0.2, random_state=99)

# Train and test targets become (n, 1) column arrays; y_val is left as a Series.
y_train, y_test = (target.to_numpy().reshape(-1, 1) for target in (y_train, y_test))

# Sanity check
for label, split in [('Train - Predictors shape', x_train),
                     ('Test - Predictors shape', x_test),
                     ('Train - Target shape', y_train),
                     ('Test - Target shape', y_test)]:
    print(label, split.shape)

In [None]:
# Standardises the predictors. The scaler is fitted on the training split only,
# and that same fitted transformation is then applied to the validation split.

scaler = StandardScaler().fit(x_train)

x_train_standard = scaler.transform(x_train)
x_val_standard = scaler.transform(x_val)

In [None]:
# Baseline model: plain linear regression on the standardised training data.
# The value displayed by this cell is the score on the training split itself.

linear_regression = LinearRegression().fit(x_train_standard, y_train)
linear_regression.score(x_train_standard, y_train)

In [None]:
# Score of the baseline linear regression on the (standardised) validation split.
linear_regression.score(x_val_standard, y_val)

In [None]:
# Runs LassoCV on scaled data.
# The alpha grid is a narrow window of ten integer values (734480..734489);
# folds are shuffled with a fixed seed so the search is repeatable.

kfold = KFold(shuffle=True, random_state=2**32-1)
# `normalize=False` was dropped: it equalled the default and the keyword no longer
# exists in current scikit-learn, where passing it raises a TypeError.
lasso_cv = LassoCV(alphas=np.arange(734480, 734490, 1), cv=kfold)
# LassoCV expects a 1-D target; flatten the (n, 1) column to avoid a conversion warning.
lasso_cv.fit(x_train_standard, np.ravel(y_train))

In [None]:
# Tabulates the cross-validation error of every candidate alpha.
# `mse_path_` holds one row per alpha and one column per CV fold.
alphas = lasso_cv.alphas_
errors = lasso_cv.mse_path_
# Take the fold count from the data itself so the column labels always match
# the splitter that was actually used (previously hard-coded to 5).
n_folds = errors.shape[1]

df_errors = pd.DataFrame(index=alphas, columns=[f'fold_{i+1}' for i in range(n_folds)], data=errors)
# Mean error across folds; the smallest values are listed first below.
df_errors['error_mean'] = df_errors.mean(axis=1)
df_errors.sort_values('error_mean').head()

In [None]:
# The alpha value selected by the cross-validated Lasso fitted above.
lasso_cv.alpha_

In [None]:
# Score of the cross-validated Lasso on the (standardised) validation split.
lasso_cv.score(x_val_standard, y_val)

In [None]:
# Plots the mean cross-validation error against alpha and marks the chosen alpha.
plt.figure(figsize=(10, 5))

g = sns.lineplot(data=df_errors,
                 x=df_errors.index,
                 y='error_mean')

# Show the selected alpha, above. Read it from the fitted model instead of a
# hard-coded number so the marker stays correct if the search grid changes.
plot_alpha = lasso_cv.alpha_
g.vlines(plot_alpha, df_errors['error_mean'].min(), df_errors['error_mean'].max(),
         color='red', linestyles='dashed')

g.set_xlabel('alpha')
# Raw string: '\S' and '\i' are invalid escape sequences in a normal string literal.
g.set_ylabel(r'$\Sigma_{i \in splits}(MSE_i)$');

In [None]:
# Lasso coefficients labelled by predictor name (every column except the target).
cols = movie_set.columns.drop('Gross')
pd.Series(lasso_cv.coef_, index=cols)

# Re-run linear regression

In [None]:
# Second pass: remove a first batch of predictors, re-standardise and refit.
# NOTE(review): presumably these are the columns the Lasso above shrank to zero — confirm.
drop_1 = ['WarnerBros.', 'PG-13', 'R', 1, 8, 'DirMS', 'Animation', 'Romance', 'Thea2']

x_train2, x_val2 = (frame.drop(columns=drop_1) for frame in (x_train, x_val))
y_train2, y_val2 = y_train.copy(), y_val.copy()

# A fresh scaler is fitted on the reduced training frame; it rebinds `scaler`.
scaler = StandardScaler().fit(x_train2)
x_train2_scaled = scaler.transform(x_train2)
x_val2_scaled = scaler.transform(x_val2)

# Rebinds `linear_regression` to the reduced model and shows its training score.
linear_regression = LinearRegression().fit(x_train2_scaled, y_train2)
linear_regression.score(x_train2_scaled, y_train2)

In [None]:
# Validation score of the linear regression refitted without the `drop_1` columns.
linear_regression.score(x_val2_scaled, y_val2)

In [None]:
# Sweeps Ridge's regularisation strength over powers of ten (10**-10 ... 10**4),
# printing the coefficients and the validation score for each setting.
for exponent in range(-10, 5):
    ridge = Ridge(alpha=10**exponent).fit(x_train_standard, y_train)
    print(ridge.coef_, ridge.score(x_val_standard, y_val))

In [None]:
# RidgeCV with its default alpha candidates, fitted on the standardised training data.
ridge_cv = RidgeCV().fit(x_train_standard, y_train)
print(ridge_cv.coef_, ridge_cv.score(x_val_standard, y_val))

# y_train is an (n, 1) column, so coef_ is 2-D; row 0 holds the coefficients.
pd.Series(ridge_cv.coef_[0], index=cols)

In [None]:
# Sweeps Lasso's regularisation strength over powers of ten (10**-10 ... 10**4),
# printing the coefficients and the validation score for each setting.
for exponent in range(-10, 5):
    lasso = Lasso(alpha=10**exponent).fit(x_train_standard, y_train)
    print(lasso.coef_, lasso.score(x_val_standard, y_val))

In [None]:
# LassoCV (default alpha grid) on the predictors that remain after removing `drop_1`.
lasso_cv2 = LassoCV()
lasso_cv2.fit(x_train2_scaled, y_train2)
# Score against y_val2, the target paired with x_val2, to stay consistent with
# the other cells that work on the reduced feature set.
print(lasso_cv2.score(x_val2_scaled, y_val2))

# Coefficients labelled by the reduced set of column names (rebinds `cols`).
cols = x_train2.columns
pd.Series(index=cols, data=lasso_cv2.coef_)

In [None]:
# Third pass: remove a second batch of predictors from the already-reduced frames,
# re-standardise and refit.
# NOTE(review): presumably chosen from the lasso_cv2 coefficients — confirm.
drop_2 = ['TwentiethCenturyFox', 4, 'Adventure', 'Fantasy', 'Sport', 'Mystery']

x_train3, x_val3 = (frame.drop(columns=drop_2) for frame in (x_train2, x_val2))
y_train3, y_val3 = y_train2.copy(), y_val2.copy()

# Rebinds `scaler` to one fitted on the final training feature set.
scaler = StandardScaler().fit(x_train3)
x_train3_standard = scaler.transform(x_train3)
x_val3_standard = scaler.transform(x_val3)

# Rebinds `linear_regression` to the final reduced model and shows its training score.
linear_regression = LinearRegression().fit(x_train3_standard, y_train3)
linear_regression.score(x_train3_standard, y_train3)

In [None]:
# Validation score of the linear regression refitted without `drop_1` and `drop_2`.
linear_regression.score(x_val3_standard, y_val3)

In [None]:
# LassoCV (default alpha grid) on the final reduced feature set.
lasso_cv3 = LassoCV().fit(x_train3_standard, y_train3)
print(lasso_cv3.score(x_val3_standard, y_val3))

# Coefficients labelled by the remaining column names (rebinds `cols`).
cols = x_train3.columns
pd.Series(lasso_cv3.coef_, index=cols)

In [None]:
# 5-fold cross-validated R^2 of an unregularised linear regression.
# `cross_val_score` is already imported in the notebook's first cell, so the
# duplicate in-cell import was removed.
# NOTE(review): X and y here are the full, unscaled data set, which includes the
# rows held out as the test set — confirm that is intended.
lm = LinearRegression()

cross_val_score(lm, X, y, # estimator, features, target
                cv=5, # number of folds 
                scoring='r2') # scoring metric

# Run model on test set

In [None]:
# Scores the final linear regression on the held-out test set.
# Reuse drop_1 and drop_2 so the test frame always loses exactly the columns that
# were removed from the training frame (previously a hand-copied 15-item literal).
final_x_test = x_test.drop(columns=drop_1 + drop_2)
# `scaler` and `linear_regression` must be the ones fitted on x_train3.
final_x_test_standard = scaler.transform(final_x_test)
linear_regression.score(final_x_test_standard, y_test)

In [None]:
# Residual plot of the final linear regression on the held-out test set.
# Reuse drop_1 and drop_2 so the columns removed here match the training frame.
final_x_test = x_test.drop(columns=drop_1 + drop_2)
final_x_test_standard = scaler.transform(final_x_test)
test_set_pred = linear_regression.predict(final_x_test_standard)

# Residual = actual - predicted. The original plotted the raw targets under a
# "Residuals" label. Both arrays are flattened so an (n, 1) / (n,) mismatch
# cannot broadcast into an (n, n) matrix.
test_set_pred_flat = np.ravel(test_set_pred)
residuals = np.ravel(y_test) - test_set_pred_flat

plt.figure(figsize=(10, 6))
plt.scatter(test_set_pred_flat, residuals, alpha=.5)
plt.axhline(0, linestyle='--', color='gray')
plt.xlabel('Predicted Values', fontsize=18)
plt.ylabel('Residuals', fontsize=18)
plt.title('Linear-Regression Model Residual Plot');

In [None]:
# Lists the predictor columns left in the final test frame; the hand-built
# feature rows in the cells below have to follow this column order.
final_x_test.columns

In [None]:
# Hand-built single-row feature vector for one film, shaped (1, n_features).
# NOTE(review): the values are assumed to follow the order of final_x_test.columns — confirm.
greyhound = pd.Series([7.0, 64, 50300000, 4300, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 7.0, 1, 0, 0, 0, 1, 0, 0, 0, 0]).values.reshape(1, -1)
greyhound_scaled = scaler.transform(greyhound)
# The model was trained on standardised features, so it must be given the scaled
# row; the original passed the raw row and left greyhound_scaled unused.
greyhound_gross = linear_regression.predict(greyhound_scaled)

In [None]:
# Displays the model's predicted gross for the `greyhound` feature row.
greyhound_gross

In [None]:
# Hand-built single-row feature vector for one film, shaped (1, n_features).
# NOTE(review): the values are assumed to follow the order of final_x_test.columns — confirm.
tenet = pd.Series([7.8, 69, 205000000, 4300, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 7.21, 1, 1, 0, 0, 0, 0, 0, 0, 0]).values.reshape(1, -1)
tenet_scaled = scaler.transform(tenet)
# The model was trained on standardised features, so it must be given the scaled
# row; the original passed the raw row and left tenet_scaled unused.
tenet_gross = linear_regression.predict(tenet_scaled)

In [None]:
# Displays the model's predicted gross for the `tenet` feature row.
tenet_gross

In [None]:
# Hand-built single-row feature vector for one film, shaped (1, n_features).
# NOTE(review): the values are assumed to follow the order of final_x_test.columns — confirm.
spenser = pd.Series([5.4, 66, 200000000, 4300, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 7.2, 1, 0, 0, 0, 1, 0, 0, 0, 0]).values.reshape(1, -1)
spenser_scaled = scaler.transform(spenser)
# The model was trained on standardised features, so it must be given the scaled
# row; the original passed the raw row and left spenser_scaled unused.
spenser_gross = linear_regression.predict(spenser_scaled)

In [None]:
# Displays the model's predicted gross for the `spenser` feature row.
spenser_gross

In [None]:
# Rebuilds the KFold splitter with an explicit split count (`n_folds`, set in an
# earlier cell); rows are shuffled with a fixed seed so the folds are repeatable.
kfold = KFold(n_splits=n_folds, shuffle=True, random_state=2**32-1)

In [None]:
# Grid search over Lasso's alpha (1000, 2000, ..., 999000) using the seeded KFold
# splitter. Candidates are ranked by negated mean squared error, so a score
# closer to zero is better.
# NOTE(review): `normalize` was deprecated in scikit-learn 1.0 and later removed,
# and the input here is already standardised — confirm against the installed version.
alpha_grid = {'alpha': np.arange(1000, 1000000, 1000)}
grid = GridSearchCV(
    estimator=Lasso(normalize=True),  # must be an sklearn estimator
    param_grid=alpha_grid,
    cv=kfold,
    scoring='neg_mean_squared_error',
    return_train_score=True,
)
grid.fit(x_train_standard, y_train)
grid.best_estimator_

In [None]:
# Best mean cross-validated score found by the grid search. The scorer negates
# the mean squared error, so this value is a negative MSE.
grid.best_score_

This is promising: when I run this over x_standard, y_standard instead of x_train, y_train, the score is very high. Next step: run it on x_train_standard.

# Polynomial fit - pair_regressions2
# X_standard is x_train scaled.



In [None]:
# Linear regression on degree-2 interaction features: the original columns plus
# every pairwise product (no squared terms), built from the unscaled training frame.
x_train_for_poly = x_train.copy()
x_val_for_poly = x_val.copy()

# The feature expansion is fitted on the training frame and reused for validation.
p = PolynomialFeatures(degree=2, interaction_only=True).fit(x_train_for_poly)
x_train_poly = p.transform(x_train_for_poly)
x_val_poly = p.transform(x_val_for_poly)

# Rebinds `linear_regression` to the polynomial model and shows its training score.
linear_regression = LinearRegression().fit(x_train_poly, y_train)
linear_regression.score(x_train_poly, y_train)

In [None]:

# Validation score of the interaction-feature linear regression fitted above.
linear_regression.score(x_val_poly, y_val)

In [None]:
# The alpha value selected by `lasso_cv2` (the LassoCV fitted after removing `drop_1`).
lasso_cv2.alpha_

In [None]:
# Is this ready for running on the test set?
# Do I need to scale the test set?


# Do I run prediction on scaled inputs?

In [None]:
# Hand-built 43-feature row for one film, used with the interaction-feature model.
# NOTE(review): the values are assumed to follow the column order of x_train — confirm.
greyhound = pd.Series([7.0, 64, 50300000, 4300, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 7.0, 77, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]).values.reshape(1, 43)
# `m` is not defined at this point in the notebook. The polynomial model fitted
# above is `linear_regression`, and it was trained on p-expanded features, so the
# raw row must go through the same fitted expansion before predicting.
linear_regression.predict(p.transform(greyhound))

In [None]:
# Test-set score of the interaction-feature model. `m` is not defined yet at this
# point in the notebook, so the already-fitted polynomial `linear_regression` is used.
linear_regression.score(p.transform(x_test), y_test)

In [None]:
# Polynomial fit - pair_regressions2
# Degree-2 interaction features (pairwise products, no squares) of the unscaled
# training predictors, followed by a linear regression.
p = PolynomialFeatures(degree=2,interaction_only=True)
x_train_poly = p.fit_transform(x_train)
# `m` was never instantiated anywhere in the notebook, so this cell raised a
# NameError on a fresh kernel; create the estimator before fitting it.
m = LinearRegression()
m.fit(x_train_poly,y_train)
m.score(x_train_poly,y_train)

In [None]:
# Test-set score of the interaction-feature model `m`; x_test is expanded with
# the already-fitted feature map `p` before scoring.
m.score(p.transform(x_test),y_test)

In [None]:
# Pairwise correlation matrix over the columns of movie_set.
movie_set_corr = movie_set.corr()

In [None]:
# Subset of columns (target first) used for the correlation heat map, and their
# pairwise correlation matrix.
heat_map_cols = [
    'Gross', 'UserRating', 'Metascore', 'Budget', 'Theaters',
    'WaltDisneyStudiosMotionPictures', 'PG-13', 'DirUR', 'DirMS',
    'Action', 'Adventure', 'Sci-Fi',
]
heat_map_df = movie_set[heat_map_cols]
heat_map_corr = heat_map_df.corr()

In [None]:
# Annotated correlation heat map on a fixed -1..1 diverging colour scale.
plt.figure(figsize=(20, 20))
for tick_group in ('xtick', 'ytick'):
    plt.rc(tick_group, labelsize=20)
ax = sns.heatmap(heat_map_corr, cmap="seismic", annot=True, vmin=-1, vmax=1)
# Explicit y-limits with half a cell of padding.
# NOTE(review): looks like the workaround for matplotlib versions that clip the
# first and last heat-map rows — confirm it is still needed.
ax.set_ylim(len(heat_map_corr)+0.5, -0.5)

In [None]:
# sns.pairplot(movie_set, height=1.5, aspect=1)
# Pairwise scatter plots (lower triangle) and per-variable histograms (diagonal)
# for the continuous predictors and the target, drawn on a 60% sample of the rows.
heat_map_df = heat_map_df[['UserRating', 'Metascore', 'Budget', 'Theaters', 'DirUR', 'DirMS', 'Gross']]
# Seed the sample so the figure is the same on every run.
g = sns.PairGrid(heat_map_df.sample(frac=0.6, random_state=42), diag_sharey=False, corner=True)
g.map_lower(sns.scatterplot)
# sns.distplot is deprecated; histplot is its replacement for a plain histogram.
g.map_diag(sns.histplot);

In [None]:
# Single-feature regression: Gross explained by Budget alone.
lr = LinearRegression()

# Double brackets keep Budget two-dimensional, giving an (n, 1) feature matrix.
X = movie_set[['Budget']].to_numpy()
# X = movie_set['Theaters', 'UserRating', 'MetaScore', 'Budget', 'PG', 'PG-13', 'R', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12'].values.reshape(-1, 1)
y = movie_set['Gross']

# Fit, then display the score on the same data the model was fitted on.
lr.fit(X, y)
lr.score(X, y)

In [None]:
# Predicted Gross from the Budget-only model for budgets of 100,000,000 and 200,000,000.
lr.predict([[100000000],[200000000]])

In [None]:
# Refits the same `lr` estimator on three predictors: Budget, DirUR and Theaters.
X = movie_set.loc[:, ['Budget', 'DirUR', 'Theaters']]
y = movie_set.loc[:, 'Gross']

# Fit, then display the score on the same data the model was fitted on.
lr.fit(X, y)
lr.score(X, y)

In [None]:
# Predicted Gross for one row in the fitted order [Budget, DirUR, Theaters].
lr.predict([[100000000, 7.9, 4300]])

In [None]:
# Linear regression of Gross on every other column of movie_set.
X = movie_set.drop(columns='Gross')
y = movie_set['Gross']

# `lr_full` is first created much later in the notebook, so this cell raised a
# NameError on a top-to-bottom run; instantiate the estimator here.
lr_full = LinearRegression()
lr_full.fit(X, y)
lr_full.score(X, y)

In [None]:
# Preview of the statsmodels design matrix: X with a constant (intercept) column added.
sm.add_constant(X).head()

In [None]:
# Ordinary least squares via statsmodels, which reports a full regression summary
# (coefficients, standard errors, p-values, fit statistics).
design_matrix = sm.add_constant(X)  # statsmodels does not add an intercept itself
model = sm.OLS(endog=y, exog=design_matrix)

fit = model.fit()
fit.summary()

In [None]:
# Residuals of the OLS fit plotted against its in-sample predictions.
ols_predicted = fit.predict()
ols_residuals = fit.resid

plt.figure(figsize=(10, 7))
plt.scatter(ols_predicted, ols_residuals)

# Zero line for reference.
plt.axhline(0, linestyle='--', color='gray')
plt.xlabel('Predicted Values', fontsize=18)
plt.ylabel('Residuals', fontsize=18);

In [None]:
# Joint distribution of Theaters and Gross with a fitted regression line.
# jointplot builds its own figure, so the preceding plt.figure(...) call only
# produced an extra empty figure and was removed. Columns are referenced by name
# because `data=` is supplied.
sns.jointplot(x='Theaters', y='Gross', data=movie_set, kind='reg');

## Splitting data into test and train

In [None]:
# Displays the current training predictor frame.
x_train

In [None]:
# Predictions on x_test from an estimator bound to `cls`.
# NOTE(review): `cls` is not defined anywhere in the visible notebook, so this cell
# fails on a fresh kernel — confirm which model was intended.
prediction = cls.predict(x_test)

In [None]:
# Displays the parameters of the estimator bound to `cls`.
# NOTE(review): `cls` is not defined anywhere in the visible notebook.
cls.get_params()

In [None]:
# Prints the fitted parameters of `cls` and its error metrics on the test set.
# NOTE(review): `cls` is not defined anywhere in the visible notebook.
print('Co-efficient of linear regression', cls.coef_)
print('Intercept of linear regression model', cls.intercept_)

test_mse = metrics.mean_squared_error(y_test, prediction)
print('Mean Square Error', test_mse)

test_r2 = metrics.r2_score(y_test, prediction)
print('Model R^2 Square value', test_r2)

In [None]:
# Test points (scatter) with the model's predictions drawn over them as a red line.
# NOTE(review): the 'Hours' / 'Marks' axis labels do not match any column used
# elsewhere in this notebook — they look left over from another example; confirm.
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(x_test, y_test)
ax.plot(x_test, prediction, color='red', linewidth=3)
ax.set_xlabel('Hours')
ax.set_ylabel('Marks')
ax.set_title('Linear Regression');

In [None]:
# Residual plot for `cls` on the test set; residual here is predicted minus actual.
# NOTE(review): `cls` is not defined anywhere in the visible notebook.
cls_test_pred = cls.predict(x_test)  # computed once and reused for both axes

plt.figure(figsize=(10, 6))
plt.scatter(cls_test_pred, cls_test_pred - y_test, c='g', s = 40)
# Zero reference line, drawn over a fixed x-range of 0..100.
plt.hlines(y=0, xmin=0, xmax=100)
plt.title('Residual plot')
plt.ylabel('Residual');

## Polynomial Regression

In [None]:
# Regression of Gross on theater count plus the studio columns.
studio_features = [
    'Theaters',
    'WaltDisneyStudiosMotionPictures',
    'UniversalPictures',
    'TwentiethCenturyFox',
    'SonyPicturesEntertainment(SPE)',
    'ParamountPictures',
    'WarnerBros.',
]
X = movie_set[studio_features]
y = movie_set['Gross']

# Fit, then display the score on the same data the model was fitted on.
lr_full = LinearRegression().fit(X, y)
lr_full.score(X, y)

In [None]:
# Predicted Gross for one row in the fitted column order: Theaters=4300,
# WaltDisneyStudiosMotionPictures=1, and every other studio column 0.
lr_full.predict([[4300,1,0,0,0,0,0]])

In [None]:
# Expands the current X with PolynomialFeatures' default settings, then refits a
# linear regression on the expanded matrix and shows its score on that same data.
p = PolynomialFeatures().fit(X)
X_poly = p.transform(X)

lr_full = LinearRegression().fit(X_poly, y)
lr_full.score(X_poly, y)

In [None]:
# Shapes before and after the polynomial expansion. A notebook cell only displays
# its last expression, so both are returned together as one tuple (previously the
# bare `X.shape` line was silently discarded).
X.shape, X_poly.shape

In [None]:
# Gross as a function of theater count: a straight-line fit and a degree-4
# polynomial fit, each drawn over the raw data.
# Cleaned up from the original cell: repeated imports and style calls, garbled
# leftover comments and a degree-2 fit that was immediately overwritten were removed.
dataset = movie_set
X = dataset[['Theaters']].values   # (n, 1) feature matrix
y = dataset.loc[:,'Gross'].values
y = np.reshape(y, (-1, 1))         # (n, 1) target column

plt.style.use('default')

# fitting the linear regression model
lin_reg = LinearRegression()
lin_reg.fit(X, y)

# visualising the linear regression model
plt.scatter(X, y, color='red')
plt.plot(X, lin_reg.predict(X), color='blue')
plt.title("Gross by Theaters (Linear)")
plt.xlabel('Number of Theaters')
plt.ylabel('Gross')
plt.show()

# polynomial regression model (degree 4)
poly_reg = PolynomialFeatures(degree=4)
X_poly = poly_reg.fit_transform(X)
lin_reg2 = LinearRegression()
lin_reg2.fit(X_poly, y)

# visualising polynomial regression on an evenly spaced grid so the curve is smooth.
# X.min()/X.max() return scalars; Python's min()/max() on a 2-D array return
# one-element arrays, which np.arange should not be given as bounds.
X_grid = np.arange(X.min(), X.max(), 0.1).reshape(-1, 1)
plt.scatter(X, y, color='red')

# transform (not fit_transform): the grid is expanded with the already-fitted
# feature map rather than refitting it on the plotting grid.
plt.plot(X_grid, lin_reg2.predict(poly_reg.transform(X_grid)), color='blue')

plt.title("Gross by Theaters (Polynomial)")
plt.xlabel('Number of Theaters')
plt.ylabel('Gross')
plt.show()

In [None]:
# Gross as a function of user rating: a straight-line fit and a degree-4
# polynomial fit, each drawn over the raw data.
# Cleaned up from the original cell: repeated imports and style calls, garbled
# leftover comments and a degree-2 fit that was immediately overwritten were removed.
dataset = movie_set
X = dataset[['UserRating']].values   # (n, 1) feature matrix
y = dataset.loc[:,'Gross'].values
y = np.reshape(y, (-1, 1))           # (n, 1) target column

plt.style.use('default')

# fitting the linear regression model
lin_reg = LinearRegression()
lin_reg.fit(X, y)

# visualising the linear regression model
plt.scatter(X, y, color='red')
plt.plot(X, lin_reg.predict(X), color='blue')
plt.title("Gross by User Rating (Linear)")
plt.xlabel('User Rating')
plt.ylabel('Gross')
plt.show()

# polynomial regression model (degree 4)
poly_reg = PolynomialFeatures(degree=4)
X_poly = poly_reg.fit_transform(X)
lin_reg2 = LinearRegression()
lin_reg2.fit(X_poly, y)

# visualising polynomial regression on an evenly spaced grid so the curve is smooth.
# X.min()/X.max() return scalars; Python's min()/max() on a 2-D array return
# one-element arrays, which np.arange should not be given as bounds.
X_grid = np.arange(X.min(), X.max(), 0.1).reshape(-1, 1)
plt.scatter(X, y, color='red')

# transform (not fit_transform): the grid is expanded with the already-fitted
# feature map rather than refitting it on the plotting grid.
plt.plot(X_grid, lin_reg2.predict(poly_reg.transform(X_grid)), color='blue')

plt.title("Gross by User Rating (Polynomial)")
# Label fixed to match the linear plot (was 'Number of User Rating').
plt.xlabel('User Rating')
plt.ylabel('Gross')
plt.show()

In [None]:
# dataset = movie_set
# X = dataset[['Theaters','WaltDisneyStudiosMotionPictures']].values  
# Y = dataset.loc[:,'Gross'].values
# X = np.reshape(X, (-1, 2))
# Y = np.reshape(y, (-1, 1))

# x = X[:, 0]
# y = X[:, 1]
# z = Y
# x = np.reshape(x, (-1, 1))
# y = np.reshape(y, (-1, 1))

# xx_pred, yy_pred = np.meshgrid(dataset['Theaters'], dataset['WaltDisneyStudiosMotionPictures'])
# model_viz = np.array([xx_pred.flatten(), yy_pred.flatten()]).T

# # fitting the linear regression model
# from sklearn.linear_model import LinearRegression
# lin_reg = LinearRegression()
# model = lin_reg.fit(X, Y)
# predicted = model.predict(model_viz)
# r2 = model.score(X, Y)

# plt.style.use('default')

# fig = plt.figure(figsize=(12, 4))

# ax1 = fig.add_subplot(131, projection='3d')
# ax2 = fig.add_subplot(132, projection='3d')
# ax3 = fig.add_subplot(133, projection='3d')

# axes = [ax1, ax2, ax3]

# for ax in axes:
#     # ax.plot(x, y, z, color='k', zorder=15, linestyle='none', marker='o', alpha=0.5)
#     ax.scatter(xx_pred.flatten(), yy_pred.flatten(), predicted, facecolor=(0,0,0,0), s=20, edgecolor='#70b3f0')
#     ax.set_xlabel('Porosity (%)', fontsize=12)
#     ax.set_ylabel('Brittleness', fontsize=12)
#     ax.set_zlabel('Gas Prod. (Mcf/day)', fontsize=12)
#     ax.locator_params(nbins=4, axis='x')
#     ax.locator_params(nbins=5, axis='x')

# ax1.text2D(0.2, 0.32, 'aegis4048.github.io', fontsize=13, ha='center', va='center',
#            transform=ax1.transAxes, color='grey', alpha=0.5)
# ax2.text2D(0.3, 0.42, 'aegis4048.github.io', fontsize=13, ha='center', va='center',
#            transform=ax2.transAxes, color='grey', alpha=0.5)
# ax3.text2D(0.85, 0.85, 'aegis4048.github.io', fontsize=13, ha='center', va='center',
#            transform=ax3.transAxes, color='grey', alpha=0.5)

# ax1.view_init(elev=28, azim=120)
# ax2.view_init(elev=4, azim=114)
# ax3.view_init(elev=60, azim=165)

# fig.suptitle('$R^2 = %.2f$' % r2, fontsize=20)

# fig.tight_layout()

In [None]:
# Square root of r2.
# NOTE(review): `r2` is only assigned inside the commented-out cell above, so this
# cell raises a NameError on a fresh kernel — confirm whether it should be removed.
r2 ** 0.5

In [None]:
# Shape of `predicted`.
# NOTE(review): `predicted` is only assigned inside the commented-out cell above, so
# this cell raises a NameError on a fresh kernel — confirm whether it should be removed.
predicted.shape