# **1. Model Training**

In this section, we will implement and train three different types of classifiers to .

In [None]:
%pip install 

In [1]:
import pandas as pd

# Load Training and Test Datasets
train_data = pd.read_csv('datasets/train_preprocessed.csv')
test_data  = pd.read_csv('datasets/test_preprocessed.csv')

# Load Training and Test Features
features_train = pd.read_csv('features/features_train.csv')
features_test  = pd.read_csv('features/features_test.csv')

## 1.1. Linear Regression

The first model we implemented is the Linear Regression model. We selected this model due to it was used in the UKP paper [1]. We followed their same "process" using a 10-fold cross-validation.

In [1]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer
from scipy.stats import pearsonr
import numpy as np

pearson_scorer = make_scorer(pearson_corr, greater_is_better=True)

# Linear Regression Model
linear_model = LinearRegression()

# Cross-validation
linear_scores = cross_val_score(linear_model, features_train, train_data["score"], cv=10, scoring=pearson_scorer)

print("Linear Regression - Pearson Correlation Scores:", linear_scores)
print("Linear Regression - Mean Pearson Correlation:", linear_scores.mean())

NameError: name 'pearson_corr' is not defined

## 1.2. Random Forest Regressor

In [2]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

# Random Forest Parameters
rf_params = {
    "n_estimators": [100, 150],
    "max_depth": [5, 10], #None crec que funciona millor
    "min_samples_split": [2, 5, 7]
}

# Random Forest Model
rf_model = RandomForestRegressor(random_state=0)

# Grid Search for Random Forest
rf_grid = GridSearchCV(rf_model, rf_params, cv=10, scoring=pearson_scorer, n_jobs=-1)
rf_grid.fit(features_train, train_data["score"])

# Best Model and Parameters
rf_best_model = rf_grid.best_estimator_
rf_best_params = rf_grid.best_params_

# Cross-validation with the best model
rf_best_scores = cross_val_score(rf_best_model, features_train, train_data["score"], cv=10, scoring=pearson_scorer)

print("Random Forest - Best Parameters:", rf_best_params)
print("Random Forest - Pearson Correlation Scores:", rf_best_scores)
print("Random Forest - Mean Pearson Correlation:", rf_best_scores.mean())

NameError: name 'pearson_scorer' is not defined

## 1.3. Gradient Boosting

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

# Gradient Boosting Parameters
gb_params = {
    "learning_rate": [0.05, 0.1],
    "n_estimators": [100, 150],
    "max_depth": [5, 10],
    "min_samples_split": [2, 5],
    "subsample": [0.5, 1.0]
}

# Gradient Boosting Model
gb_model = GradientBoostingRegressor(random_state=0)

# Grid Search for Gradient Boosting
gb_grid = GridSearchCV(gb_model, gb_params, cv=10, scoring=pearson_scorer, n_jobs=-1)
gb_grid.fit(features_train, train_data["score"])

# Best Model and Parameters
gb_best_model = gb_grid.best_estimator_
gb_best_params = gb_grid.best_params_

# Cross-validation with the best model
gb_best_scores = cross_val_score(gb_best_model, features_train, train_data["score"], cv=10, scoring=pearson_scorer)

print("Gradient Boosting - Best Parameters:", gb_best_params)
print("Gradient Boosting - Pearson Correlation Scores:", gb_best_scores)
print("Gradient Boosting - Mean Pearson Correlation:", gb_best_scores.mean())

## **2. Model Selection**

In [None]:
models_summary = {
    "Linear Regression": {"Mean Pearson CV Score": linear_scores.mean()},
    "Random Forest": {"Mean Pearson CV Score": rf_best_scores.mean()},
    "Gradient Boosting": {"Mean Pearson CV Score": gb_best_scores.mean()},
}

models_df = pd.DataFrame(models_summary).T
print(models_df)

best_model_name = models_df["Mean Pearson CV Score"].idxmax()
best_model_score = models_df["Mean Pearson CV Score"].max()

print(f"Best Model: {best_model_name} with a Mean Pearson CV Score of {best_model_score}")
