In [7]:
import pandas as pd
import numpy as np
import seaborn as sns

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.cross_decomposition import PLSRegression
from scipy.stats import f
from scipy import stats
from scipy.stats import shapiro

import statsmodels.api as sm
import seaborn as sns
from statsmodels.stats.diagnostic import het_breuschpagan
import statsmodels.api as sm

import matplotlib.pyplot as plt
import os
import json

**FUNZIONI PER VALUTARE LE PREDIZIONI SUL TEST SET**

In [8]:
def evaluate_predictions(X_test, y_test, y_pred):
    """Compute regression error metrics for one set of predictions.

    Args:
        X_test: Test features. Not used by the computation; kept so the
            signature stays compatible with existing callers.
        y_test: True target values (array-like). Flattened to 1-D.
        y_pred: Predicted target values (array-like). Flattened to 1-D.

    Returns:
        Tuple ``(r2, mae, mse, rmse, mape, rse)`` where ``mape`` is expressed
        in percent and ``rse`` is the residual standard error computed with
        ``n - 2`` degrees of freedom (NaN when ``n <= 2``).

    Raises:
        ValueError: If ``y_test`` and ``y_pred`` do not hold the same number
            of values.

    Note:
        MAPE divides by the true values, so a true value of exactly zero
        makes it infinite or NaN.
    """
    # Work on flat float arrays: subtracting a (n, 1) array from a (n,) array
    # would broadcast to (n, n), and two pandas Series would be aligned on
    # their index instead of position. Both silently corrupt MAPE and RSE.
    y_true = np.asarray(y_test, dtype=float).ravel()
    y_hat = np.asarray(y_pred, dtype=float).ravel()
    if y_true.shape != y_hat.shape:
        raise ValueError(
            f"y_test and y_pred must have the same number of values, "
            f"got {y_true.size} and {y_hat.size}"
        )

    # R-squared (R²)
    r2 = r2_score(y_true, y_hat)

    # Mean Absolute Error (MAE)
    mae = mean_absolute_error(y_true, y_hat)

    # Mean Squared Error (MSE)
    mse = mean_squared_error(y_true, y_hat)

    # Root Mean Squared Error (RMSE)
    rmse = np.sqrt(mse)

    # Residuals are the differences between the true values and the predictions
    residuals = y_true - y_hat

    # Mean Absolute Percentage Error (MAPE), in percent
    mape = np.mean(np.abs(residuals / y_true)) * 100

    # Residual Standard Error (RSE) with n - 2 degrees of freedom (simple
    # linear regression convention). Undefined for n <= 2, reported as NaN
    # instead of dividing by zero or taking the root of a negative number.
    dof = y_true.size - 2
    rse = np.sqrt(np.sum(residuals**2) / dof) if dof > 0 else float('nan')

    return r2, mae, mse, rmse, mape, rse


def print_evaluation_metrics(r2, mae, mse, rmse, mape, rse):
    """Print the six regression metrics, one per line, with four decimals.

    Args:
        r2: Coefficient of determination.
        mae: Mean absolute error.
        mse: Mean squared error.
        rmse: Root mean squared error.
        mape: Mean absolute percentage error (printed with a trailing ``%``).
        rse: Residual standard error.
    """
    report_lines = (
        f"R-squared (R²): {r2:.4f}",
        f"Mean Absolute Error (MAE): {mae:.4f}",
        f"Mean Squared Error (MSE): {mse:.4f}",
        f"Root Mean Squared Error (RMSE): {rmse:.4f}",
        f"Mean Absolute Percentage Error (MAPE): {mape:.4f}%",
        f"Residual Standard Error (RSE): {rse:.4f}",
    )
    for report_line in report_lines:
        print(report_line)


# Function to compute TP, TN, FP, FN
def compute_confusion_matrix(boolean_predictions, boolean_ground_truth):
    """Count true/false positives and negatives for two boolean sequences.

    Args:
        boolean_predictions: Predicted labels (array-like of booleans).
        boolean_ground_truth: Actual labels (array-like of booleans).

    Returns:
        Tuple ``(TP, TN, FP, FN)`` of NumPy integer counts.

    Raises:
        ValueError: If the two inputs do not hold the same number of labels.
    """
    # Coerce to flat boolean arrays so that plain lists work (a list compared
    # with ``== True`` yields a single False, which made every count zero) and
    # so that pandas Series are compared by position rather than by index.
    predicted = np.asarray(boolean_predictions, dtype=bool).ravel()
    actual = np.asarray(boolean_ground_truth, dtype=bool).ravel()
    if predicted.shape != actual.shape:
        raise ValueError(
            f"predictions and ground truth must have the same number of labels, "
            f"got {predicted.size} and {actual.size}"
        )

    TP = np.sum(predicted & actual)    # Both True
    TN = np.sum(~predicted & ~actual)  # Both False
    FP = np.sum(predicted & ~actual)   # Predicted True, but False in ground truth
    FN = np.sum(~predicted & actual)   # Predicted False, but True in ground truth
    return TP, TN, FP, FN


def compute_classification_metrics(TP, TN, FP, FN):
    """
    Computes accuracy, recall, precision, and F1 score.

    A metric whose denominator is zero is undefined and is returned as NaN
    (no exception, no warning), whether the counts are Python or NumPy
    integers. F1 is NaN when precision or recall is NaN, and 0.0 when both
    are defined and equal to zero.

    Args:
      TP: True positives.
      TN: True negatives.
      FP: False positives.
      FN: False negatives.

    Returns:
      A tuple ``(accuracy, recall, precision, f1_score)``, in that order.
    """
    nan = float('nan')

    def _ratio(numerator, denominator):
        # Guard the division so an empty class yields NaN instead of a
        # ZeroDivisionError (Python ints) or a RuntimeWarning (NumPy ints).
        return numerator / denominator if denominator != 0 else nan

    accuracy = _ratio(TP + TN, TP + TN + FP + FN)
    recall = _ratio(TP, TP + FN)
    precision = _ratio(TP, TP + FP)

    if np.isnan(precision) or np.isnan(recall):
        f1_score = nan
    elif precision + recall == 0:
        # Harmonic mean of two zeros: no true positives at all.
        f1_score = 0.0
    else:
        f1_score = 2 * (precision * recall) / (precision + recall)

    return accuracy, recall, precision, f1_score

In [9]:
# Path of the CSV file holding the dataset, relative to the working directory.
cvs_source= 'Holstein_diary_cows.csv'
# Load the whole file into the DataFrame used by the cells below.
df = pd.read_csv(cvs_source)

**Eseguire la prossima cella per visualizzare il dataset**

In [None]:
# Scatter plot of the two columns used for modelling: Glu (x) against BHB (y).
fig, ax = plt.subplots()

# The label is what the legend displays; without a labelled artist the legend
# is empty and matplotlib emits a warning.
ax.scatter(df['Glu'], df['BHB'], label='Samples')

# Set labels and title
ax.set_xlabel('Glu (mmol/L)')
ax.set_ylabel('BHB (mmol/L)')
ax.set_title('Scatter plot: Glu vs BHB')

# Add a legend
ax.legend()

# Display the plot
plt.show()

**DIVIDI IL DATASET IN 10 DIVERSI SPLIT CIASCUNO 80% TRAIN E 20% TEST**

In [11]:
X = df[['Glu']]  # Features (independent variable)
y = df['BHB']    # Target (dependent variable)

# Bin Glu into six equal-width intervals over its observed range; the bin id
# (1..6) is only used as the stratification label for the splits below.
min_val = df['Glu'].min()
max_val = df['Glu'].max()
delta = abs(max_val-min_val)

# Interior bin edges at min + k/6 of the range, k = 1..5.
inner_edges = [min_val + (k / 6.0) * delta for k in range(1, 6)]

# np.digitize counts how many edges are <= each value (intervals closed on the
# left), so adding 1 gives bin ids 1..6 with the maximum landing in bin 6.
# This replaces a per-row loop with one vectorised call.
bin_ids = np.digitize(df['Glu'].to_numpy(), inner_edges) + 1

# Rows with a missing Glu value match no interval and keep bin 0.
bin_ids[df['Glu'].isna().to_numpy()] = 0
df['Glu_bin'] = bin_ids

# Ten random 80/20 train/test splits, reproducible through the fixed seed.
stratifier = StratifiedShuffleSplit(n_splits=10, test_size=0.2, random_state=42)

# Stratify based on the 'Glu_bin' column
strat_column = df['Glu_bin']

# Each entry is (X_train, X_test, y_train, y_test) for one split.
splits = []
for train_index, test_index in stratifier.split(X, strat_column):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    splits.append((X_train, X_test, y_train, y_test))

**CREA, ALLENA ED APPLICA UN MODELLO DI RANDOM FOREST REGRESSION SU CIASCUNO SPLIT**

In [12]:
# Fit one Random Forest per split and keep (features, truth, prediction) for
# the held-out part of each split.
predictions = []
for X_train, X_test, y_train, y_test in splits:
    # Flatten the held-out targets into a plain 1-D NumPy array.
    y_test = np.ravel(y_test.to_numpy())

    # A fresh forest per split; the fixed seed keeps runs reproducible.
    # Hyperparameters can be adjusted here.
    model = RandomForestRegressor(random_state=42, n_estimators=100)
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    predictions.append((X_test, y_test, y_pred))

    # Drop the fitted model before moving on to the next split.
    del model

**VALUTA LE PREDIZIONI MEDIE**

In [None]:
# Cut-off applied to both true and predicted target values to turn the
# regression output into a binary label (value >= threshold).
threshold = 1.2

# Column names of the per-split results table, in display order.
column_order = (
    'R2', 'MAE', 'MSE', 'RMSE', 'MAPE', 'RSE',
    'TP', 'TN', 'FP', 'FN',
    'Accuracy', 'Recall', 'Precision', 'F1',
)

rows = []
for X_test, y_test, y_pred in predictions:

    # Targets may still be pandas objects; the comparisons below use arrays.
    if not isinstance(y_test, np.ndarray):
        y_test = y_test.values

    # Regression metrics on the raw values.
    r2, mae, mse, rmse, mape, rse = evaluate_predictions(X_test, y_test, y_pred)

    # Binarise truth and prediction, then derive the classification metrics.
    bool_y_test = y_test >= threshold
    bool_y_pred = y_pred >= threshold
    TP, TN, FP, FN = compute_confusion_matrix(bool_y_pred, bool_y_test)
    accuracy, recall, precision, f1_score = compute_classification_metrics(TP, TN, FP, FN)

    row = dict(zip(
        column_order,
        (r2, mae, mse, rmse, mape, rse,
         TP, TN, FP, FN,
         accuracy, recall, precision, f1_score),
    ))
    rows.append(row)

# One row per split, then min / max / mean / std of every metric across splits.
results_df = pd.DataFrame(rows)
results_df.index.name = 'Split'
descriptive_stats = results_df.agg(['min', 'max', 'mean', 'std'])

# Display the results
print(descriptive_stats)