# In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
import numpy as np

# Train a random-forest regressor on the "score" target, tune it with a
# cross-validated grid search, report RMSE on a held-out validation split and
# write predictions for the unlabeled test set to CSV.

# Load the full labelled data set (features and targets live in separate files).
x_all = pd.read_csv("./pc_X_train.csv")
y_all = pd.read_csv("./pc_Y_train.csv")

# The regressor needs a numeric target. Coerce explicitly so a malformed score
# raises here rather than travelling through the script as dtype=object.
score = pd.to_numeric(y_all["score"])

# Hold out 20% of the labelled rows for validation; the fixed seed keeps the
# partition reproducible between runs.
x_train, x_val, y_train, y_val = train_test_split(
    x_all, score, test_size=0.2, random_state=42
)

# Standardize the features. The scaler is fit on the training split only and
# then re-used unchanged for the validation and test rows.
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_val_scaled = scaler.transform(x_val)

# Hyper-parameter grid for the random forest (2 * 3 * 2 * 2 = 24 candidates).
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
}

random_forest_model = RandomForestRegressor(random_state=42)
print("Before Grid Search")
# Select candidates by (negated) RMSE so that model selection optimizes the
# same metric that is reported below; without `scoring` the search would fall
# back to the estimator's default score. n_jobs=-1 runs the independent
# candidate/fold fits in parallel on all available cores.
grid_search = GridSearchCV(
    random_forest_model,
    param_grid,
    cv=5,
    scoring="neg_root_mean_squared_error",
    n_jobs=-1,
)
print("Before Fit")
grid_search.fit(x_train_scaled, y_train)
print("After Fit")

# With the default refit=True the best candidate has already been refit on the
# whole training split.
best_model = grid_search.best_estimator_

# Evaluate on the held-out validation split.
y_val_pred = best_model.predict(x_val_scaled)
rmse_val = np.sqrt(mean_squared_error(y_val, y_val_pred))
print("Root Mean Squared Error on Validation Set:", rmse_val)

# Load the unlabeled test features and apply the scaler fitted on the
# training split.
x_test = pd.read_csv("./pc_X_test.csv")
x_test_scaled = scaler.transform(x_test)

# Predict the test scores.
y_test_pred = best_model.predict(x_test_scaled)

# Save the predictions; 'Id' is the row index of the test file as loaded.
predictions_df = pd.DataFrame({'Id': x_test.index, 'score': y_test_pred})
predictions_df.to_csv('pc_y_test_predicted_rf.csv', index=False)
