In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold, LeaveOneOut, LeavePOut, validation_curve, learning_curve, GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import Lasso, Ridge
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import make_pipeline
import seaborn as sns

In [33]:
csv_url = "https://raw.githubusercontent.com/HongjinZhu/Data-Analysis-Project2/main/movieReplicationSet.csv"
df = pd.read_csv(csv_url)
df

Unnamed: 0,The Life of David Gale (2003),Wing Commander (1999),Django Unchained (2012),Alien (1979),Indiana Jones and the Last Crusade (1989),Snatch (2000),Rambo: First Blood Part II (1985),Fargo (1996),Let the Right One In (2008),Black Swan (2010),...,When watching a movie I cheer or shout or talk or curse at the screen,When watching a movie I feel like the things on the screen are happening to me,As a movie unfolds I start to have problems keeping track of events that happened earlier,"The emotions on the screen ""rub off"" on me - for instance if something sad is happening I get sad or if something frightening is happening I get scared",When watching a movie I get completely immersed in the alternative reality of the film,Movies change my position on social economic or political issues,When watching movies things get so intense that I have to stop watching,Gender identity (1 = female; 2 = male; 3 = self-described),Are you an only child? (1: Yes; 0: No; -1: Did not respond),Movies are best enjoyed alone (1: Yes; 0: No; -1: Did not respond)
0,,,4.0,,3.0,,,,,,...,1.0,6.0,2.0,5.0,5.0,5.0,1.0,1.0,0,1
1,,,1.5,,,,,,,,...,3.0,1.0,1.0,6.0,5.0,3.0,2.0,1.0,0,0
2,,,,,,,,,,,...,5.0,4.0,3.0,5.0,5.0,4.0,4.0,1.0,1,0
3,,,2.0,,3.0,,,,,4.0,...,3.0,1.0,1.0,4.0,5.0,3.0,1.0,1.0,0,1
4,,,3.5,,0.5,,0.5,1.0,,0.0,...,2.0,3.0,2.0,5.0,6.0,4.0,4.0,1.0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1092,,,,,3.5,,,,,,...,3.0,4.0,3.0,5.0,5.0,4.0,4.0,1.0,0,0
1093,3.0,4.0,,,4.0,4.0,2.5,,3.5,3.5,...,5.0,3.0,5.0,5.0,5.0,6.0,5.0,1.0,0,0
1094,,,,,,,,3.5,,,...,6.0,3.0,1.0,6.0,6.0,4.0,2.0,1.0,0,0
1095,,,,,,,,,,,...,1.0,1.0,1.0,4.0,3.0,3.0,1.0,1.0,0,1


#### 3)

In [34]:
bottom_10_indices = [80, 95, 9, 55, 190, 319, 292, 41, 14, 248]
top_10_indices = [300, 334, 287, 282, 240, 249, 395, 377, 208, 203]

# select 30 random movies in the middle
exclude_columns = df.columns[bottom_10_indices + top_10_indices]
available_columns = df.iloc[:, :400].columns.difference(exclude_columns)
selected_columns = np.random.choice(available_columns, size=30, replace=False)
selected_movies_df = df[selected_columns]

In [35]:
# pick 10 other movies as input
col_pool = available_columns.difference(selected_columns)
other_movies_indices = np.random.choice(col_pool, size=10, replace=False)
other_movies_data = df[other_movies_indices]
input_data = other_movies_data

In [36]:
# fill all the NaN with median rating of each movie
selected_movies_df = selected_movies_df.apply(lambda col: col.fillna(col.median()), axis=0)
input_data = input_data.apply(lambda col: col.fillna(col.median()), axis=0)

In [39]:
RMSE = []
X = input_data.values

for m in selected_movies_df:
  y = selected_movies_df[m].values
  # train test split
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2023)

  # ridge regression with hyperparameter tuning
  param_grid = {'alpha': [1e-3, 1e-2, 1e-1, 1, 10, 100]}
  grid_search = GridSearchCV(Ridge(), param_grid, scoring='neg_mean_squared_error', cv=5)
  grid_search.fit(X_train, y_train)

  best_alpha = grid_search.best_params_['alpha']
  best_model = grid_search.best_estimator_

  # evaluate the model
  predictions = best_model.predict(X_test)
  rmse = np.sqrt(mean_squared_error(y_test, predictions))
  RMSE.append(rmse)

# display the RMSE for each of the 30 movies
results_r = pd.DataFrame({'Movie': selected_movies_df.columns, 'RMSE': RMSE})
results_r

Unnamed: 0,Movie,RMSE
0,Harry Potter and the Deathly Hallows: Part 2 (...,0.702303
1,Along Came a Spider (2002),0.188392
2,Let the Right One In (2008),0.355669
3,Unforgiven (1992),0.251439
4,Friday the 13th Part III (1982),0.422909
5,Meet the Parents (2000),0.427213
6,Batman (1989),0.581949
7,Good Will Hunting (1997),0.59291
8,King Kong (1976),0.598366
9,Point Break (1991),0.27101


#### 4)

In [40]:
RMSE = []
X = input_data.values

for m in selected_movies_df:
  y = selected_movies_df[m].values
  # train test split
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2023)

  # lasso regression with hyperparameter tuning
  param_grid = {'alpha': [1e-3, 1e-2, 1e-1, 1, 10, 100]}
  grid_search = GridSearchCV(Lasso(), param_grid, scoring='neg_mean_squared_error', cv=5)
  grid_search.fit(X_train, y_train)

  best_alpha = grid_search.best_params_['alpha']
  best_model = grid_search.best_estimator_

  # evaluate the model
  predictions = best_model.predict(X_test)
  rmse = np.sqrt(mean_squared_error(y_test, predictions))
  RMSE.append(rmse)

# display the RMSE for each of the 30 movies
results_l = pd.DataFrame({'Movie': selected_movies_df.columns, 'RMSE': RMSE})
results_l

Unnamed: 0,Movie,RMSE
0,Harry Potter and the Deathly Hallows: Part 2 (...,0.706013
1,Along Came a Spider (2002),0.187935
2,Let the Right One In (2008),0.349959
3,Unforgiven (1992),0.253301
4,Friday the 13th Part III (1982),0.422263
5,Meet the Parents (2000),0.423848
6,Batman (1989),0.584162
7,Good Will Hunting (1997),0.588792
8,King Kong (1976),0.601996
9,Point Break (1991),0.270717


#### 5)