In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, make_scorer

# Import any additional modules and start coding below
# Load the rental records (path is relative to the notebook's working directory).
data = pd.read_csv('rental_info.csv')

# Target variable: number of whole days between rental and return.
# `.dt.days` keeps only the day component of each timedelta.
data['rental_length_days'] = (
    pd.to_datetime(data['return_date']) - pd.to_datetime(data['rental_date'])
).dt.days

# 0/1 indicator columns derived from the text in `special_features`.
# `.str.contains(..., regex=False)` is a vectorised literal-substring test, and
# `na=False` maps a missing `special_features` value to 0 instead of raising
# the TypeError that `'...' in x` produces when x is NaN.
data['deleted_scenes'] = (
    data['special_features']
    .str.contains('Deleted Scenes', regex=False, na=False)
    .astype(int)
)
data['behind_the_scenes'] = (
    data['special_features']
    .str.contains('Behind the Scenes', regex=False, na=False)
    .astype(int)
)

# NOTE(review): this list is not referenced by any of the cells visible here;
# X below is built by dropping columns rather than by selecting these names.
features_to_keep = [
    'release_year', 'amount', 'rental_rate', 'length', 'replacement_cost',
    'deleted_scenes', 'behind_the_scenes', 'NC-17', 'PG', 'PG-13', 'R',
]

# Feature matrix: every column except the raw dates, the raw special-features
# text, and the target itself (keeping the target would leak it into X).
X = data.drop(columns=['rental_date', 'return_date', 'rental_length_days', 'special_features'])
print(X.columns)

# Target vector.
y = data['rental_length_days']


In [None]:
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import RandomizedSearchCV


# Hold out 20% of the rows for evaluation; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=9)

# Fit an L1-regularised linear model and score it on the held-out rows.
lasso = Lasso(alpha=0.05, random_state=9)
lasso.fit(X_train, y_train)
y_pred = lasso.predict(X_test)
# scikit-learn metrics take (y_true, y_pred) in that order. Plain MSE is
# symmetric, so the value is unchanged, but the documented order keeps the call
# correct if the metric is swapped or a sample_weight is added.
mse = mean_squared_error(y_test, y_pred)

# Inspect the fitted coefficients (one per column of X).
coeffs = lasso.coef_
print(coeffs)

# Columns whose coefficient was not shrunk to exactly zero by the L1 penalty.
# NOTE(review): `selected_features` is only printed in the cells visible here;
# the later model is fitted on the full X_train.
mask = np.abs(coeffs) > 0
selected_features = X.columns[mask]
print(f'Selected features: {selected_features}')



In [None]:
# Random forest with fixed hyperparameters and a fixed seed for reproducibility.
# NOTE(review): these values presumably come from an earlier hyperparameter
# search (RandomizedSearchCV is imported above) that is not shown here — confirm.
best_model = RandomForestRegressor(
    n_estimators=200,
    max_depth=20,
    min_samples_split=2,
    min_samples_leaf=1,
    random_state=9,
)
best_model.fit(X_train, y_train)

# Evaluate on the held-out rows. The metric signature is (y_true, y_pred);
# MSE is symmetric, so the value matches the previously swapped call.
y_pred = best_model.predict(X_test)
best_mse = mean_squared_error(y_test, y_pred)


