In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from scipy.stats import f

import statsmodels.api as sm
import seaborn as sns

import matplotlib.pyplot as plt
import os

**FUNZIONI PER VALUTARE LE PREDIZIONI SUL TEST SET**

In [None]:
def evaluate_predictions(X_test, y_test, y_pred, threshold=1.2):
    """Print regression metrics and draw diagnostic plots for test-set predictions.

    The function prints R², MAE, MSE, RMSE, MAPE and RSE, then shows three
    figures: a scatter plot of ground truth vs. predictions, a histogram of
    the residuals and a Q-Q plot of the residuals.

    Args:
        X_test: Feature values used for the x-axis of the scatter plot.
        y_test: Ground-truth target values (single-output; flattened to 1-D).
        y_pred: Predicted target values (single-output; flattened to 1-D).
        threshold: Height of the horizontal reference line drawn on the
            scatter plot. Defaults to 1.2, the value previously hard-coded.

    Returns:
        None. Results are printed and plotted.
    """
    # Flat float arrays make the arithmetic positional; a pandas Series would
    # otherwise align on its index when combined with another Series.
    y_true = np.asarray(y_test, dtype=float).ravel()
    y_hat = np.asarray(y_pred, dtype=float).ravel()

    # R-squared (R²)
    r2 = r2_score(y_true, y_hat)

    # Mean Absolute Error (MAE)
    mae = mean_absolute_error(y_true, y_hat)

    # Mean Squared Error (MSE)
    mse = mean_squared_error(y_true, y_hat)

    # Root Mean Squared Error (RMSE)
    rmse = np.sqrt(mse)

    # Residuals are the differences between the true values and the predictions
    residuals = y_true - y_hat

    # Mean Absolute Percentage Error (MAPE)
    # The percentage error is undefined where the ground truth is exactly 0,
    # so those samples are left out instead of producing inf/nan.
    nonzero = y_true != 0
    if nonzero.any():
        mape = np.mean(np.abs(residuals[nonzero] / y_true[nonzero])) * 100
    else:
        mape = float("nan")

    # Residual Standard Error (RSE)
    # For simple linear regression, degrees of freedom = n - 2
    dof = y_true.size - 2
    rse = np.sqrt(np.sum(residuals**2) / dof) if dof > 0 else float("nan")

    # Output all the results
    print(f"R-squared (R²): {r2:.4f}")
    print(f"Mean Absolute Error (MAE): {mae:.4f}")
    print(f"Mean Squared Error (MSE): {mse:.4f}")
    print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")
    print(f"Mean Absolute Percentage Error (MAPE): {mape:.4f}%")
    print(f"Residual Standard Error (RSE): {rse:.4f}")

    # Scatter plot on a fresh figure so it never lands on a figure left open
    # by earlier code.
    plt.figure()
    plt.scatter(X_test, y_true, color='blue', marker='o', label='ground truth BHB values')
    plt.scatter(X_test, y_hat, color='red', marker='+', label='predicted BHB values')
    plt.axhline(y=threshold, color='green', linestyle='--', linewidth=2, label='Threshold per diagnosi')
    # Set labels and title
    plt.xlabel('Glu (mmol/L)')
    plt.ylabel('BHB (mmol/L)')
    plt.title('Scatter plot: ground truth and predictions on test set')

    # Add a legend
    plt.legend()

    # Display the plot
    plt.show()

    # Histogram of the residuals
    plt.figure(figsize=(8, 6))  # Adjust figure size if needed
    sns.histplot(residuals, kde=True, bins=30)  # kde=True adds a kernel density estimate
    plt.title('Distribuzione dei residui')
    plt.xlabel('Residui')
    plt.ylabel('Frequenza')
    plt.show()

    # Q-Q plot of the residuals against a fitted normal distribution
    sm.qqplot(residuals, line='45', fit=True)
    plt.title('Q-Q Plot dei residui')
    plt.show()


# Function to compute TP, TN, FP, FN
def compute_confusion_matrix(boolean_predictions, boolean_ground_truth):
    """Count true/false positives and negatives for two boolean sequences.

    Args:
        boolean_predictions: Array-like of predicted labels (truthy = positive).
        boolean_ground_truth: Array-like of actual labels (truthy = positive),
            with the same shape as ``boolean_predictions``.

    Returns:
        A tuple ``(TP, TN, FP, FN)`` of NumPy integer counts.

    Raises:
        ValueError: If the two inputs do not have the same shape.
    """
    # Coerce to boolean arrays: comparing a plain Python list with `== True`
    # yields a single bool instead of an element-wise mask, which would
    # silently produce all-zero counts.
    predicted = np.asarray(boolean_predictions, dtype=bool)
    actual = np.asarray(boolean_ground_truth, dtype=bool)
    if predicted.shape != actual.shape:
        raise ValueError(
            f"Shape mismatch: predictions {predicted.shape} vs ground truth {actual.shape}"
        )

    TP = np.sum(predicted & actual)     # Both True
    TN = np.sum(~predicted & ~actual)   # Both False
    FP = np.sum(predicted & ~actual)    # Predicted True, but False in ground truth
    FN = np.sum(~predicted & actual)    # Predicted False, but True in ground truth
    return TP, TN, FP, FN


def _safe_ratio(numerator, denominator):
  """Return numerator / denominator, or NaN when the denominator is zero."""
  if denominator == 0:
    return float("nan")
  return numerator / denominator


def compute_classification_metrics(TP, TN, FP, FN):
  """
  Computes accuracy, recall, precision, and F1 score.

  A metric whose denominator is zero is undefined and is returned as NaN,
  regardless of whether the counts are Python ints or NumPy integers.

  Args:
    TP: True positives.
    TN: True negatives.
    FP: False positives.
    FN: False negatives.

  Returns:
    A tuple containing accuracy, recall, precision, and F1 score, in that order.
  """
  accuracy = _safe_ratio(TP + TN, TP + TN + FP + FN)
  recall = _safe_ratio(TP, TP + FN)
  precision = _safe_ratio(TP, TP + FP)
  # NaN in precision or recall propagates to F1; a zero sum also yields NaN.
  f1_score = _safe_ratio(2 * (precision * recall), precision + recall)

  return accuracy, recall, precision, f1_score

In [None]:
# Read the dataset into a DataFrame; the path is relative to the current
# working directory of the kernel.
cvs_source = 'Holstein_diary_cows.csv'
df = pd.read_csv(filepath_or_buffer=cvs_source)

**Eseguire la prossima cella per visualizzare il dataset**

In [None]:
# Create the scatter plot.
# The points carry a label because plt.legend() only lists labelled artists;
# without one it emits a warning and draws no legend.
plt.scatter(df['Glu'], df['BHB'], label='observed BHB values')

# Set labels and title
plt.xlabel('Glu (mmol/L)')
plt.ylabel('BHB (mmol/L)')
plt.title('Scatter plot: Glu vs BHB')

# Add a legend
plt.legend()

# Display the plot
plt.show()

**DIVIDI IL DATASET IN 80% TRAIN E 20% TEST**

In [None]:
# Feature matrix: the 'Glu' column kept as a one-column DataFrame (2-D input).
X = df.loc[:, ['Glu']]
# Target vector: the 'BHB' column.
y = df.loc[:, 'BHB']

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

**CREA ED ALLENA UN MODELLO DI RANDOM FOREST REGRESSION**

In [None]:
# Random forest with 100 trees and a fixed seed; hyperparameters can be tuned here.
model = RandomForestRegressor(random_state=42, n_estimators=100)
# Fit on the training split only. As the last expression of the cell, the
# fitted estimator is also shown as the cell output.
model.fit(X=X_train, y=y_train)

**APPLICA IL MODELLO SUL TEST SET E VALUTA LE SUE PREDIZIONI**

In [None]:
# Predict on the held-out split and report the regression diagnostics.
y_pred = model.predict(X_test)
evaluate_predictions(X_test, y_test, y_pred)

# From here on the ground truth is used as a plain NumPy array.
if not isinstance(y_test, np.ndarray):
    y_test = y_test.values

# Cut-off used to turn the continuous values into a binary label
# (at or above the threshold counts as positive).
threshold = 1.2

# Boolean label arrays for ground truth and predictions
bool_y_test = np.greater_equal(y_test, threshold)
bool_y_pred = np.greater_equal(y_pred, threshold)

# Confusion-matrix counts and the metrics derived from them
TP, TN, FP, FN = compute_confusion_matrix(bool_y_pred, bool_y_test)
accuracy, recall, precision, f1_score = compute_classification_metrics(TP, TN, FP, FN)

print("Matrice di confusione:")
print(f"TP: {TP}\nTN: {TN}\nFP: {FP}\nFN: {FN}")
print("Metriche del classificatore:")
print(
    f"Accuratezza: {accuracy}\n"
    f"Richiamo: {recall}\n"
    f"Precisione: {precision}\n"
    f"F1: {f1_score}"
)