# In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt

# Load the data.
# The raw YearPredictionMSD file has no header row, so header=None is required:
# without it pandas consumes the first record as column names, the frame ends
# up one row short, and the two fixed-size positional slices below would then
# overlap by one row (a test example leaking into the training set).
data = pd.read_csv('data/YearPredictionMSD.txt', delimiter=',', header=None)  # Adjust delimiter if necessary
print(data.head())

# Split the data positionally: the first 463715 rows are the training set and
# the last 51630 rows are the test set. Both slices rely on the file's row
# order being preserved, so do not shuffle `data` before this point.
train_data = data.iloc[:463715]
test_data = data.iloc[-51630:]

# Sample a quarter of the training data (fixed seed for reproducibility)
sampled_train_data = train_data.sample(frac=0.25, random_state=42)
print(sampled_train_data.shape)

# Sample a quarter of the testing data (fixed seed for reproducibility)
sampled_test_data = test_data.sample(frac=0.25, random_state=42)
print(sampled_test_data.shape)

# Separate features and target for the sampled training data.
# Column 0 is the target; every remaining column is a feature.
X_sampled_train = sampled_train_data.iloc[:, 1:]  # All columns except the first one
y_sampled_train = sampled_train_data.iloc[:, 0]   # The first column

# Separate features and target for the sampled test data (same column layout)
X_sampled_test = sampled_test_data.iloc[:, 1:]    # All columns except the first one
y_sampled_test = sampled_test_data.iloc[:, 0]     # The first column

# Sanity-check that features and targets line up row-for-row
print(X_sampled_train.shape, y_sampled_train.shape)
print(X_sampled_test.shape, y_sampled_test.shape)

# Define the parameter grid sampled by the randomized hyper-parameter search.
# NOTE: 'auto' is intentionally not listed for max_features. It was deprecated
# in scikit-learn 1.1 and removed in 1.3, where it fails parameter validation.
# For regressors 'auto' meant "use all features", which 1.0 (a fraction of the
# feature count) expresses on both old and new scikit-learn versions.
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_features': [1.0, 'sqrt', 'log2'],
    'max_depth': [None, 10, 20, 30],  # None lets trees grow until leaves are pure/minimal
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False],
}

# Base estimator; the seed keeps forest construction reproducible
model = RandomForestRegressor(random_state=42)

# Randomized hyper-parameter search: 50 candidates drawn from param_grid,
# each scored with 3-fold cross-validation, using every available core
random_search = RandomizedSearchCV(
    estimator=model,
    param_distributions=param_grid,
    n_iter=50,
    cv=3,
    verbose=2,
    random_state=42,
    n_jobs=-1,
)

# Run the search on the sampled training data
random_search.fit(X_sampled_train, y_sampled_train)

# Report the winning hyper-parameter combination
print(f'Best Parameters: {random_search.best_params_}')

# Score the held-out sample with the search's best estimator
y_pred = random_search.predict(X_sampled_test)

# Evaluation metrics on the sampled test set
mse = mean_squared_error(y_sampled_test, y_pred)
r2 = r2_score(y_sampled_test, y_pred)

print(f'Mean Squared Error: {mse}')
print(f'R² Score: {r2}')

# Plot the actual vs predicted values.
# The dashed diagonal marks perfect predictions; it spans the combined range
# of the actual and predicted values so that every point falls within it.
axis_min = min(y_sampled_test.min(), y_pred.min())
axis_max = max(y_sampled_test.max(), y_pred.max())
diagonal = [axis_min, axis_max]

plt.figure(figsize=(10, 6))
plt.scatter(y_sampled_test, y_pred, alpha=0.5)
plt.plot(diagonal, diagonal, 'k--', lw=2)
plt.xlabel('Actual Year')
plt.ylabel('Predicted Year')
plt.title('Actual vs Predicted Year')
plt.show()