# **1. Model Training**

In this section, we will implement and train three different types of classifiers to .

In [None]:
%pip install 

In [91]:
import pandas as pd

# Load Training and Test Datasets
train_data = pd.read_csv('datasets/train_preprocessed.csv')
test_data  = pd.read_csv('datasets/test_preprocessed.csv')

# Load Training and Test Features
features_train = pd.read_csv('features/features_train2.csv')
features_test  = pd.read_csv('features/features_test2.csv')

In [92]:
from sklearn.utils import shuffle

features_train_shuffled, score_shuffled = shuffle(features_train, train_data["score"], random_state=42)

In [93]:
from sklearn.preprocessing import PowerTransformer

power_transformer = PowerTransformer(method='yeo-johnson')

for col in ["longest_common_substring", "longest_common_subsequence", "greedy_string_tiling",'3_gram_word_Jaccard', '4_gram_word_Jaccard', '2_gram_word_Jaccard_without_SW', '2_gram_word_Jaccard_without_SW', "pathlen_similarity", "lin_similarity"]:
    features_train[col] = np.log1p(features_train[col])


features_train[["2_gram_char", "lexical_substitution_system"]] = power_transformer.fit_transform(features_train[["2_gram_char", "lexical_substitution_system"]])

## 1.1. Linear Regression

The first model we implemented is the Linear Regression model. We selected this model due to it was used in the UKP paper [1]. We followed their same "process" using a 10-fold cross-validation.

In [66]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer
from scipy.stats import pearsonr
import numpy as np

def pearson_corr(y_true, y_pred):
    return pearsonr(y_true, y_pred)[0]

pearson_scorer = make_scorer(pearson_corr, greater_is_better=True)

# Linear Regression Model
linear_model = LinearRegression()
# Cross-validation
linear_scores = cross_val_score(linear_model, features_train_shuffled, score_shuffled, cv=10, scoring=pearson_scorer)

print("Linear Regression - Pearson Correlation Scores:", linear_scores)
print("Linear Regression - Mean Pearson Correlation:", linear_scores.mean())

Linear Regression - Pearson Correlation Scores: [0.77835537 0.7667178  0.78716194 0.85520775 0.76680148 0.81712407
 0.76953242 0.79999424 0.84274563 0.83556031]
Linear Regression - Mean Pearson Correlation: 0.8019201003257639


## 1.2. Random Forest Regressor

In [94]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

# Random Forest Parameters
rf_params = {
    "n_estimators": [100, 200, 400],
    "max_depth": [15, 20, 25], #None crec que funciona millor
    "min_samples_split": [2, 5]
}

# Random Forest Model
rf_model = RandomForestRegressor(random_state=0)

# Grid Search for Random Forest
rf_grid = GridSearchCV(rf_model, rf_params, cv=10, scoring=pearson_scorer, n_jobs=-1)
rf_grid.fit(features_train_shuffled, score_shuffled)

# Best Model and Parameters
rf_best_model = rf_grid.best_estimator_
rf_best_params = rf_grid.best_params_

# Cross-validation with the best model
rf_best_scores = cross_val_score(rf_best_model, features_train_shuffled, score_shuffled, cv=10, scoring=pearson_scorer)

print("Random Forest - Best Parameters:", rf_best_params)
print("Random Forest - Pearson Correlation Scores:", rf_best_scores)
print("Random Forest - Mean Pearson Correlation:", rf_best_scores.mean())

Random Forest - Best Parameters: {'max_depth': 15, 'min_samples_split': 5, 'n_estimators': 200}
Random Forest - Pearson Correlation Scores: [0.82106607 0.83467509 0.8530833  0.8896091  0.81747197 0.88078393
 0.8341949  0.84349109 0.89520229 0.88602622]
Random Forest - Mean Pearson Correlation: 0.8555603961216758


## 1.3. Gradient Boosting

In [44]:
from sklearn.ensemble import GradientBoostingRegressor

# Gradient Boosting Parameters
gb_params = {
    "learning_rate": [0.05, 0.1],
    "n_estimators": [100, 150],
    "max_depth": [5, 10],
    "min_samples_split": [2, 5],
    "subsample": [0.5, 1.0]
}

# Gradient Boosting Model
gb_model = GradientBoostingRegressor(random_state=0)

# Grid Search for Gradient Boosting
gb_grid = GridSearchCV(gb_model, gb_params, cv=10, scoring=pearson_scorer, n_jobs=-1)
gb_grid.fit(features_train, train_data["score"])

# Best Model and Parameters
gb_best_model = gb_grid.best_estimator_
gb_best_params = gb_grid.best_params_

# Cross-validation with the best model
gb_best_scores = cross_val_score(gb_best_model, features_train, train_data["score"], cv=10, scoring=pearson_scorer)

print("Gradient Boosting - Best Parameters:", gb_best_params)
print("Gradient Boosting - Pearson Correlation Scores:", gb_best_scores)
print("Gradient Boosting - Mean Pearson Correlation:", gb_best_scores.mean())

Gradient Boosting - Best Parameters: {'learning_rate': 0.05, 'max_depth': 5, 'min_samples_split': 2, 'n_estimators': 100, 'subsample': 1.0}
Gradient Boosting - Pearson Correlation Scores: [0.57324476 0.56447931 0.51679637 0.70493078 0.79600003 0.77431305
 0.90360158 0.7544465  0.67282511 0.64644695]
Gradient Boosting - Mean Pearson Correlation: 0.6907084439881572


## **2. Model Selection**

In [45]:
models_summary = {
    "Linear Regression": {"Mean Pearson CV Score": linear_scores.mean()},
    "Random Forest": {"Mean Pearson CV Score": rf_best_scores.mean()},
    "Gradient Boosting": {"Mean Pearson CV Score": gb_best_scores.mean()},
}

models_df = pd.DataFrame(models_summary).T
print(models_df)

best_model_name = models_df["Mean Pearson CV Score"].idxmax()
best_model_score = models_df["Mean Pearson CV Score"].max()

print(f"Best Model: {best_model_name} with a Mean Pearson CV Score of {best_model_score}")


                   Mean Pearson CV Score
Linear Regression               0.801920
Random Forest                   0.850097
Gradient Boosting               0.690708
Best Model: Random Forest with a Mean Pearson CV Score of 0.8500973394498722


In [None]:
model = rf_best_model

In [95]:
for col in ["longest_common_substring", "longest_common_subsequence", "greedy_string_tiling",'3_gram_word_Jaccard', '4_gram_word_Jaccard', '2_gram_word_Jaccard_without_SW', '2_gram_word_Jaccard_without_SW', "pathlen_similarity", "lin_similarity"]:
    features_test[col] = np.log1p(features_test[col])

features_test[["2_gram_char", "lexical_substitution_system"]] = power_transformer.transform(features_test[["2_gram_char", "lexical_substitution_system"]])

In [96]:
y_true = test_data["score"]

y_pred = model.predict(features_test)
pearson_corr = pearsonr(y_true, y_pred)[0]
print("Pearson Correlation:", pearson_corr)

ValueError: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- wordnet_augmented_overlap


# **. Comparison with Official Results**

In [None]:
# Obtain Official Results
official_results = pd.read_csv('results/official_results.csv')

# Calculate Our Best Results

# Add Our Best Results to the Official Results

# Compare