# Imports

In [None]:
import string

In [None]:
import numpy as np

In [None]:
import pandas as pd

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor, ExtraTreeClassifier
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, KFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, KBinsDiscretizer, OneHotEncoder, StandardScaler, MaxAbsScaler, Normalizer, OrdinalEncoder, MinMaxScaler
from sklearn.metrics import accuracy_score, f1_score, ConfusionMatrixDisplay, mean_absolute_error, mean_squared_error, mean_squared_log_error, r2_score
from sklearn.model_selection import cross_val_score
from sklearn.inspection import permutation_importance
from sklearn.impute import SimpleImputer
from sklearn.decomposition import TruncatedSVD

In [None]:
from scipy.stats import randint, uniform

# Functions

In [None]:
def inspect_df(df : pd.DataFrame):
    """
    Print a quick structural overview of a DataFrame.

    The report contains, in order: the row count, the column count,
    a missing-value summary and the dtype of every column.

    Parameters
    ----------
    df : pandas.DataFrame
        The DataFrame to describe.

    Returns
    -------
    None
        Everything is written to standard output.

    Notes
    -----
    - When the DataFrame holds no missing value at all, a single
      ``Null values: 0`` line is printed.
    - Otherwise the per-column count of missing values
      (``df.isna().sum()``) is printed.
    - Data types are printed exactly as reported by ``df.dtypes``.
    """
    separator = "-" * 30
    shape = df.shape

    print(f'Rows:       {shape[0]}')
    print(f'Columns:    {shape[1]}')
    print(separator)

    # Per-column count of missing cells; any non-zero entry means
    # the detailed table is worth showing.
    nulls_per_column = df.isna().sum()
    if nulls_per_column.any():
        print('Null values:')
        print(nulls_per_column)
    else:
        print('Null values: 0')

    print(separator)
    print('Feature data types:')
    print(df.dtypes)
    print("")

In [None]:
def check_balance(df : pd.DataFrame, target_col : str):
    """
    Check the class distribution of a target column in a dataset.

    This function computes the percentage distribution of each class
    in the specified target column and prints the results. It also
    displays the ideal percentage share per class for a perfectly
    balanced dataset.

    Parameters
    ----------
    df : pandas.DataFrame
        The input DataFrame containing the dataset.
    target_col : str
        Name of the target column whose class distribution
        needs to be evaluated.

    Returns
    -------
    None
        This function does not return any value. It prints
        the class distribution and balance information.

    Notes
    -----
    - Percentages are computed using normalized value counts
      and rounded to two decimal places.
    - The dataset can be considered balanced if the class
      percentages are close to the ideal equal share
      (100 / number_of_classes).
    - If the column contains no countable value (empty DataFrame or
      only missing values) there are no classes, so a notice is
      printed instead of the ideal share.
    - The function assumes that `target_col` exists in `df`.
    """
    value_counts = (df[target_col].value_counts(normalize=True) * 100).round(2)

    print(f'Value counts for {target_col}:')
    print(value_counts)
    print("-" * 30)

    n_classes = len(value_counts)
    if n_classes == 0:
        # Without this guard the ideal-share division below would raise
        # ZeroDivisionError on an empty or all-NaN target column.
        print(f'No non-null values found in {target_col}: balance cannot be evaluated')
        return

    ideal_share = 100 / n_classes
    print(f'The dataset is balanced if the value counts are close to {ideal_share:.2f}%')

In [None]:
def evaluate_classifier(
        model, 
        x_test : pd.DataFrame, 
        y_test : pd.Series, 
        title: str, 
        f1_average: str = 'binary', 
        f1_pos_label = 1
    ):
    """
    Evaluate the performance of a classification model on a test dataset.

    This function computes Accuracy and F1-score, prints the results,
    and displays the confusion matrix normalized by rows.

    Parameters
    ----------
    model : estimator
        A trained classification model implementing the `predict` method.
    x_test : array-like
        Feature matrix of the test dataset.
    y_test : array-like
        True labels corresponding to `x_test`.
    title : str
        Descriptive title to display in the output and on the confusion matrix plot.
    f1_average : str, default='binary'
        Averaging method used to compute the F1-score.
        Common options include: 'binary', 'micro', 'macro', 'weighted'.
    f1_pos_label : int or str, default=1
        The label considered as the positive class when `f1_average='binary'`.

    Returns
    -------
    y_pred : array-like
        Predicted labels generated by the model on `x_test`.

    Notes
    -----
    - The confusion matrix is displayed normalized by rows (`normalize='true'`),
      meaning values represent proportions relative to the true class.
      Rows are the true labels and columns the predicted labels.
    - Metrics are printed with four decimal places.
    """
    y_pred = model.predict(x_test)
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average=f1_average, pos_label=f1_pos_label)

    print("")
    print(f"{title.title()}:")
    print(f"Accuracy:                   {accuracy:.4f}")
    print(f"F1 score ({f1_average}):        {f1:.4f}")

    # from_predictions expects (y_true, y_pred) in that order. Passing them
    # swapped transposes the matrix and makes normalize='true' divide by
    # the predicted-class totals instead of the true-class totals.
    disp = ConfusionMatrixDisplay.from_predictions(
        y_test, y_pred,
        cmap=plt.cm.Blues, 
        normalize='true'    # 'true' normalizes over rows (true labels)
    )
    disp.ax_.set_title(title.title())
    plt.show()

    return y_pred

In [None]:
def evaluate_regressor(model, x_test : pd.DataFrame, y_test : pd.Series, title : str):
    """
    Evaluate the performance of a regression model on a test dataset.

    This function generates predictions using the provided model and
    computes common regression metrics: MAE, MSE, MSLE (if applicable),
    and R² score. The results are printed to the console.

    Parameters
    ----------
    model : estimator
        A trained regression model implementing the `predict` method.
    x_test : array-like
        Feature matrix of the test dataset.
    y_test : array-like
        True target values corresponding to `x_test`.
    title : str
        Descriptive title printed as the header of the report.

    Returns
    -------
    y_pred : array-like
        Predicted values generated by the model on `x_test`.

    Notes
    -----
    - MAE (Mean Absolute Error) measures the average absolute difference
      between true and predicted values.
    - MSE (Mean Squared Error) penalizes larger errors more heavily.
    - MSLE (Mean Squared Logarithmic Error) is computed only if all true
      and predicted values are non-negative. Otherwise a "N/A" notice is
      printed in its place.
    - R² score is bounded in the interval (-∞, 1]. Values below 0
      indicate that the model performs worse than a baseline model
      predicting the mean target value.
    """
    y_pred = model.predict(x_test)

    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    # scikit-learn's mean_squared_log_error rejects any negative target or
    # prediction, so the guard must be ">= 0": the former "> -1" check let
    # values in (-1, 0) through and the metric call then raised.
    if np.all(np.asarray(y_test) >= 0) and np.all(np.asarray(y_pred) >= 0):
        msle = mean_squared_log_error(y_test, y_pred)
    else:
        msle = "N/A (y_test or y_pred contains negative values)"
    r2 = r2_score(y_test, y_pred)

    print(f"{title.title()}:")
    print(f"MAE:    {mae}")
    print(f"MSE:    {mse}")
    print(f"MSLE:   {msle}")
    print(f"R2:     {r2}")
    print("")
    print("Remember: R2 score is in [-inf, 1]. R2<0 --> BAD MODEL.")
    print("")

    return y_pred

In [None]:
def kfold_crossvalidation(
        model, 
        x_train : pd.DataFrame, 
        y_train : pd.Series, 
        cv : int, 
        title : str
    ):
    """
    Cross-validate a model and print a short summary of the scores.

    The model is scored with `cross_val_score`; the mean and the
    standard deviation of the per-fold scores are printed under a
    title, and the raw per-fold scores are returned.

    Parameters
    ----------
    model : estimator
        A machine learning model implementing the scikit-learn API.
    x_train : array-like
        Feature matrix of the training dataset.
    y_train : array-like
        Target values corresponding to `x_train`.
    cv : int
        Number of folds; forwarded unchanged as the `cv` argument
        of `cross_val_score`.
    title : str
        Descriptive title to display in the printed output.

    Returns
    -------
    outputs : numpy.ndarray
        Array containing the cross-validation score of each fold.

    Notes
    -----
    - Folds are evaluated in parallel (`n_jobs=-1`).
    - No `scoring` argument is passed, so `cross_val_score` falls back
      to its own default scoring for the given estimator.
    """
    fold_scores = cross_val_score(model, x_train, y_train, cv=cv, n_jobs=-1)

    report_lines = (
        f'{title.title()}:',
        f'Mean:   {fold_scores.mean()}',
        f'Std:    {fold_scores.std()}',
        '-' * 30,
    )
    print(*report_lines, sep='\n')

    return fold_scores

In [None]:
def get_categorical_features(df : pd.DataFrame):
    """
    Return the labels of the columns stored with ``object`` dtype.

    Parameters
    ----------
    df : pandas.DataFrame
        The DataFrame whose columns are inspected.

    Returns
    -------
    pandas.Index
        Labels of the columns selected by
        ``df.select_dtypes(include=['object'])``; empty when no
        column has ``object`` dtype.
    """
    object_columns = df.select_dtypes(include=['object'])
    return object_columns.columns

In [None]:
def get_k_correlated(df : pd.DataFrame, y_name : str, k : int, abs : bool, ascending : bool):
    """
    Retrieve the names of the k features that rank first by correlation
    with a target variable.

    This function computes the correlation matrix of the DataFrame,
    takes the column of correlations with the target, removes the
    target's own entry, sorts the remaining values and returns the
    names of the first k features in that order.

    Parameters
    ----------
    df : pandas.DataFrame
        The input DataFrame containing numerical features.
    y_name : str
        Name of the target column for which correlations are computed.
    k : int
        Number of correlated features to retrieve.
    abs : bool
        If True, correlations are sorted by their absolute values.
        If False, raw correlation values are used.
    ascending : bool
        Sorting order of correlations.
        If True, correlations are sorted in ascending order
        (lowest first). If False, in descending order (highest first).

    Returns
    -------
    pandas.Index
        Index containing the names of the first k features in the
        requested order, excluding the target variable itself. Fewer
        than k names are returned when fewer features are available.

    Notes
    -----
    - The function assumes that `y_name` exists in `df` and that the
      DataFrame contains only numerical columns for correlation computation.
    - The target is excluded by label rather than by position, so it is
      never returned regardless of the sorting order or of other
      features being perfectly correlated with it.
    """
    # Remove the target's self-correlation by name *before* sorting: after
    # an ascending sort it sits at the end, not at position 0, so a
    # positional skip would drop a real feature and could return the target.
    target_corr = df.corr()[y_name].drop(labels=y_name)
    if abs:
        target_corr = target_corr.abs()
    return target_corr.sort_values(ascending=ascending).index[:k]

In [None]:
def pfi(
    model,
    features: list[str],
    x_test: pd.DataFrame,
    y_test: pd.Series,
    n_repeats: int
):
    """
    Plot the Permutation Feature Importance (PFI) of selected features.

    Permutation importance is computed on every column of `x_test`;
    only the entries listed in `features` are then drawn as a bar
    chart, with the mean importance as bar height and the standard
    deviation as error bar.

    Parameters
    ----------
    model : estimator
        A trained machine learning model implementing the scikit-learn API.
    features : list of str
        Names of the features to show in the plot. They must be
        present in `x_test.columns`.
    x_test : pandas.DataFrame
        Feature matrix of the test dataset.
    y_test : pandas.Series
        True target values corresponding to `x_test`.
    n_repeats : int
        Number of repetitions forwarded to `permutation_importance`.

    Returns
    -------
    None
        Nothing is returned; the figure is shown with `plt.show()`.

    Notes
    -----
    - The computation runs in parallel (`n_jobs=-1`).
    - No random seed is passed to `permutation_importance`.
    - Selecting a name that is not a column of `x_test` fails when
      the importances are indexed by `features`.
    """
    result = permutation_importance(model, x_test, y_test, n_repeats=n_repeats, n_jobs=-1)

    # Label the raw importance arrays with the column names so that the
    # requested subset can be picked by name.
    column_names = x_test.columns
    mean_by_feature = pd.Series(result['importances_mean'], index=column_names)
    std_by_feature = pd.Series(result['importances_std'], index=column_names)

    selected_mean = mean_by_feature[features]
    selected_std = std_by_feature[features]

    fig, ax = plt.subplots()
    selected_mean.plot.bar(yerr=selected_std, ax=ax)
    ax.set_title("Permutation Feature Importance")
    ax.set_xlabel('Features')
    ax.set_ylabel('Importance')
    plt.xticks(rotation=45, ha='right')
    fig.tight_layout()
    plt.show()

In [None]:
def clean_text(text : str):
    """
    Normalize a piece of text for further processing.

    The text is lower-cased, every character listed in
    ``string.punctuation`` is removed, and every run of whitespace
    (spaces, tabs, newlines, carriage returns) is collapsed into a
    single space, with no leading or trailing whitespace left.

    Parameters
    ----------
    text : str
        The raw text.

    Returns
    -------
    str
        The cleaned text.
    """
    strip_punctuation = str.maketrans('', '', string.punctuation)
    lowered = text.lower()
    # str.split() without arguments splits on any whitespace run, so
    # newlines, tabs and carriage returns need no separate replacement.
    words = lowered.translate(strip_punctuation).split()
    return ' '.join(words)

# Exam

# Notes

In [None]:
"""
------ Parità di genere
m_mask = x_test[feature_name] == m_label
f_mask = x_test[feature_name] == f_label
xm_test, ym_test = x_test[m_mask], y_test[m_mask]
xf_test, yf_test = x_test[f_mask], y_test[f_mask]
m_pred = evaluate_classifier(model, xm_test, ym_test, 'Male')
f_pred = evaluate_classifier(model, xf_test, yf_test, 'Female')

# !! If prediction is BINARY !!
print(f'Predicted "{y_test.name}" probability:')
print(f'Male:       {((m_pred.sum() / m_pred.size) * 100).round(2)}%')
print(f'Female:     {((f_pred.sum() / f_pred.size) * 100).round(2)}%')

------ Scatter
plt.scatter(x, y)
plt.xlabel(x.name)
plt.ylabel(y.name)
plt.show()

------ Cross-Validation
cv = cross_val_score(model, X, y, cv=5, scoring='accuracy')
print(f"\nMedia Accuracy 5-Fold CV: {cv.mean():.4f}")
#cv è il numero di fold, accuracy la metrica da valutare.
#Confronto: La CV è solitamente più rappresentativa della Confusion Matrix perché testa il modello su diverse porzioni del dataset, riducendo il rischio di overfitting su uno specifico split di test.


------ Cross-Validation e Pipeline
# 1. Creiamo la pipeline: prima scala i dati, poi applica la LR
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('log_reg', log_reg) # log_reg è il modello definito in precedenza
])
# 2. Passo la pipeline alla cross_val_score invece del singolo modello
cv = cross_val_score(pipeline, X, y, cv=10, scoring='accuracy')
print(f"\nMedia Accuracy 10-Fold CV (con Scaling): {cv.mean():.4f}")


------ Train-Test split
X = dataset.drop('target', axis=1).drop(get_categorical_features(dataset), axis=1)
y = dataset['target']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=1/5, random_state=0
)


------ Scaler
# Dovresti preoccuparti dello scaling (e quindi del MaxAbsScaler o StandardScaler) solo quando usi: Logistic Regression (per far convergere il solutore); SVM / KNN (perché si basano sulle distanze); Reti Neurali (per la stabilità del gradiente); PCA (perché si basa sulla varianza).
scaler = StandardScaler()
#oppure
# scaler = MaxAbsScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


------ Allenare Logistic Regression
log_reg = LogisticRegression(max_iter=1000,random_state=42)
log_reg.fit(X_train, y_train)


------- Allenare Decision Tree
dec_tree = DecisionTreeClassifier(random_state=42)  
dec_tree.fit(X_train, y_train)


------- Allenare Random Forest
rand_forest = RandomForestClassifier(n_estimators=100, random_state=42)
rand_forest.fit(X_train, y_train)


------- Allenare Linear Regression
lin_reg = LinearRegression()
lin_reg.fit(X_train, y_train)


------- Grid Search CV
# 1. Definizione dei parametri da testare
param_grid = {
    'criterion': ['gini', 'entropy', 'log_loss'],
    'max_features': [None, 'sqrt', 'log2'],
    'min_samples_split': [2, 5, 10, 20]
}
# 2. Creazione del classificatore base
dt_base = DecisionTreeClassifier(random_state=0)
# 3. Configurazione della GridSearchCV (cv=10 per coerenza con il punto precedente)
grid_search = GridSearchCV(estimator=dt_base, param_grid=param_grid, cv=10, scoring='accuracy')
# 4. Ricerca sui dati di train
grid_search.fit(X_train, y_train)
# 5. Risultati migliori:
best_params = grid_search.best_params_
best_score = grid_search.best_score_
print(f"Migliori parametri trovati: {best_params}")
print(f"Accuracy media in CV (Migliore): {best_score:.4f}")

"""