# Imports

In [1]:
import string

In [None]:
import numpy as np

In [None]:
import pandas as pd

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, KBinsDiscretizer, OneHotEncoder, StandardScaler
from sklearn.metrics import accuracy_score, f1_score, ConfusionMatrixDisplay, mean_absolute_error, mean_squared_error, mean_squared_log_error, r2_score

In [None]:
from scipy.stats import randint, uniform

# Functions

In [None]:
def inspect_df(df: pd.DataFrame):
    """Print the shape of ``df`` and a summary of its missing values.

    The first printed line reports the number of rows and columns. If no
    cell is null, a single ``Null values: 0`` line follows; otherwise the
    per-column null counts are printed.

    Args:
        df: DataFrame to inspect.
    """
    n_rows, n_cols = df.shape
    print(f'The dataset has {n_rows} rows and {n_cols} columns')

    # Per-column count of missing cells.
    missing_per_column = df.isna().sum()
    if missing_per_column.any():
        print('Null values:')
        print(missing_per_column)
        return

    print('Null values: 0')

In [None]:
def check_balance(df: pd.DataFrame, target_col: str):
    """Print the percentage share of each value found in ``target_col``.

    After the per-value percentages (rounded to two decimals) the function
    prints the share every value would have if they were all equally
    frequent, i.e. ``100 / number_of_distinct_values``.

    Args:
        df: DataFrame holding the target column.
        target_col: Name of the column whose distribution is reported.
    """
    class_shares = df[target_col].value_counts(normalize=True).mul(100).round(2)

    # Share each value would have in a perfectly balanced column.
    n_classes = len(class_shares)
    balanced_share = 100 / n_classes

    separator = "-" * 30
    print(f'Value counts for {target_col}:')
    print(class_shares)
    print(separator)
    print(f'The dataset is balanced if the value counts are close to {balanced_share:.2f}%')

In [None]:
def evaluate_classifier(model, x_test, y_test, title: str, binary=True):
    """Print accuracy and F1 for ``model`` and plot its confusion matrix.

    Args:
        model: Object exposing ``predict``; it is called on ``x_test``.
        x_test: Inputs forwarded to ``model.predict``.
        y_test: True labels the predictions are compared against.
        title: Label for the report; it is title-cased for both the printed
            header and the plot title.
        binary: When truthy the F1 score uses ``average='binary'``,
            otherwise ``average='weighted'``.
    """
    predictions = model.predict(x_test)

    # Pick the F1 averaging strategy from the ``binary`` flag.
    if binary:
        averaging = 'binary'
    else:
        averaging = 'weighted'

    acc = accuracy_score(y_test, predictions)
    f1_value = f1_score(y_test, predictions, average=averaging)

    pretty_title = title.title()
    print(f"\n{pretty_title}:")
    print(f"Accuracy:           {acc:.4f}")  # four decimals keep the report tidy
    print(f"F1 score ({averaging}): {f1_value:.4f}")

    # from_predictions is used because the predictions are already available;
    # normalize='true' normalises over the rows (the true labels).
    matrix_display = ConfusionMatrixDisplay.from_predictions(
        y_test,
        predictions,
        normalize='true',
        cmap=plt.cm.Blues,
    )
    matrix_display.ax_.set_title(f"Confusion Matrix: {pretty_title}")
    plt.show()

In [None]:
def evaluate_regressor(model, x_test, y_test, binary=True):
    """Print MAE, MSE, optionally MSLE, and R2 for ``model`` on a test set.

    MAE, MSE and R2 are computed on the model's raw predictions so that the
    reported numbers describe the model itself. Only MSLE, which is not
    defined for negative values, is computed on the absolute value of the
    predictions.

    Args:
        model: Object exposing ``predict``; it is called on ``x_test``.
        x_test: Inputs forwarded to ``model.predict``.
        y_test: True target values.
        binary: When truthy, MSLE is computed and printed as well. The name
            is kept for backward compatibility with existing callers.
    """
    y_pred = model.predict(x_test)

    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    msle = None
    if binary:
        # MSLE takes logarithms, so negative predictions must be made
        # non-negative first; this must not leak into the other metrics.
        msle = mean_squared_log_error(y_test, np.abs(y_pred))
    r2 = r2_score(y_test, y_pred)

    print("")
    print(f"MAE:    {mae}")
    print(f"MSE:    {mse}")
    if msle is not None:
        print(f"MSLE:   {msle}")
    print(f"R2:     {r2}")
    print("")
    print("Remember: R2 score is in [-inf, 1]. R2<0 --> BAD MODEL.")

In [None]:
def get_categorical_features(df: pd.DataFrame):
    """Return the labels of the columns of ``df`` whose dtype is ``object``.

    Args:
        df: DataFrame to scan.

    Returns:
        The ``columns`` index of the ``object``-dtype subset of ``df``.
    """
    object_columns = df.select_dtypes(include=['object'])
    return object_columns.columns

In [None]:
def get_k_correlated(df: pd.DataFrame, y_name, k=5):
    """Return the names of the ``k`` columns most correlated with ``y_name``.

    Columns are ranked by the absolute value of their correlation with the
    target, strongest first. The target column itself is never returned, and
    columns whose correlation is undefined (NaN, e.g. constant columns) are
    left out. Non-numeric columns are ignored.

    Args:
        df: DataFrame containing the target and the candidate columns.
        y_name: Label of the target column; it must be numeric.
        k: Maximum number of column names to return. Values below 1 yield
            an empty list.

    Returns:
        A list of at most ``k`` column labels.

    Raises:
        KeyError: If ``y_name`` is not among the numeric columns of ``df``.
    """
    # numeric_only=True skips columns that cannot be correlated instead of
    # letting DataFrame.corr raise on them.
    target_corr = df.corr(numeric_only=True)[y_name]

    ranked = (
        target_corr
        # Remove the target by label: relying on it being sorted first breaks
        # when another column is perfectly correlated with it.
        .drop(labels=y_name)
        .dropna()
        .abs()
        # A stable sort keeps the ranking of tied columns deterministic.
        .sort_values(ascending=False, kind='stable')
    )
    return ranked.iloc[:max(k, 0)].index.tolist()

In [None]:
def clean_text(text: str):
    """Normalise ``text`` for simple bag-of-words style processing.

    The text is lower-cased, newlines/tabs/carriage returns become spaces,
    every character in ``string.punctuation`` is removed, and runs of
    whitespace are collapsed into single spaces (leading and trailing
    whitespace is dropped).

    Args:
        text: String to normalise.

    Returns:
        The normalised string.
    """
    strip_punctuation = str.maketrans('', '', string.punctuation)

    lowered = text.lower()
    for control_char in ('\n', '\t', '\r'):
        lowered = lowered.replace(control_char, ' ')

    without_punctuation = lowered.translate(strip_punctuation)
    # split() with no argument discards empty pieces, collapsing whitespace.
    words = without_punctuation.split()
    return ' '.join(words)

# Utils

In [None]:
"""
------ Cross-Validation
cv = cross_val_score(model, X, y, cv=5, scoring='accuracy')
print(f"\nMedia Accuracy 5-Fold CV: {cv.mean():.4f}")
#cv è il numero di fold, accuracy la metrica da valutare.
#Confronto: La CV è solitamente più rappresentativa della Confusion Matrix perché testa il modello su diverse porzioni del dataset, riducendo il rischio di overfitting su uno specifico split di test.


------ Train-Test split
X = dataset.drop('target', axis=1).drop(get_categorical_features(dataset), axis=1)
y = dataset['target']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=1/5, random_state=0
)


------ Scaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


------ Allenare Logistic Regression
log_reg = LogisticRegression(max_iter=1000,random_state=42)
log_reg.fit(X_train, y_train)


------- Allenare Decision Tree
dec_tree = DecisionTreeClassifier(random_state=42)  
dec_tree.fit(X_train, y_train)

"""

# Exam