# 

## Device

In [20]:
import torch
device = (
    "mps"
    if getattr(torch, "has_mps", False)
    else "cuda"
    if torch.cuda.is_available()
    else "cpu"
)
print(f"Using device: {device}")

Using device: cpu


https://github.com/jeffheaton/app_deep_learning/blob/main/t81_558_class_04_1_kfold.ipynb

https://scikit-learn.org/stable/modules/cross_validation.html#

In [1]:
import numpy as np
from sklearn.model_selection import KFold

X = ["a", "b", "c", "d"]
kf = KFold(n_splits=2)
for train, test in kf.split(X):
    print("%s %s" % (train, test))

[2 3] [0 1]
[0 1] [2 3]


## EarlyStopping

In [22]:
# Early stopping (see module 3.4)
class EarlyStopping:
    def __init__(self, patience=5, min_delta=0, restore_best_weights=True):
        self.patience = patience
        self.min_delta = min_delta
        self.restore_best_weights = restore_best_weights
        self.best_model = None
        self.best_loss = None
        self.counter = 0
        self.status = ""

    def __call__(self, model, val_loss):
        if self.best_loss is None:
            self.best_loss = val_loss
            self.best_model = copy.deepcopy(model.state_dict())
        elif self.best_loss - val_loss >= self.min_delta:
            self.best_model = copy.deepcopy(model.state_dict())
            self.best_loss = val_loss
            self.counter = 0
            self.status = f"Improvement found, counter reset to {self.counter}"
        else:
            self.counter += 1
            self.status = f"No improvement in the last {self.counter} epochs"
            if self.counter >= self.patience:
                self.status = f"Early stopping triggered after {self.counter} epochs."
                if self.restore_best_weights:
                    model.load_state_dict(self.best_model)
                return True
        return False

## K-Fold 

K-Fold je metoda rozdělení a následné použití trénovacích dat. 
K-Fold rozděluje data do k skupin. Pro každou skupinu je vytvořen model, který je na této skupině trénován.
Zbylé skupiny jsou pro daný model použity jako testovací množina.

## Iris dataset

In [2]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import datasets
from sklearn import svm

X, y = datasets.load_iris(return_X_y=True)
X.shape, y.shape

((150, 4), (150,))

In [3]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.4, random_state=0)

clf = svm.SVC(kernel='linear', C=1).fit(X_train, y_train)
clf.score(X_test, y_test)

0.9666666666666667

In [4]:
from sklearn.model_selection import cross_val_score
clf = svm.SVC(kernel='linear', C=1, random_state=42)
scores = cross_val_score(clf, X, y, cv=5)
scores

array([0.96666667, 1.        , 0.96666667, 0.96666667, 1.        ])

## Unknown Dataset

In [5]:
serverpath = "https://needtoknow.cz"
document = "/~profesor/data/jh-simple-dataset.csv"

In [6]:
import getpass
server = getpass.getpass()
serverpath = f"https://{server}.cz"

 ········


In [7]:
fullurl = serverpath + document

In [17]:
import pandas as pd

df = pd.read_csv(fullurl, na_values=["NA", "?"])
display(df)

Unnamed: 0,id,job,area,income,aspect,subscriptions,dist_healthy,save_rate,dist_unhealthy,age,pop_dense,retail_dense,crime,product
0,1,vv,c,50876.0,13.100000,1,9.017895,35,11.738935,49,0.885827,0.492126,0.071100,b
1,2,kd,c,60369.0,18.625000,2,7.766643,59,6.805396,51,0.874016,0.342520,0.400809,c
2,3,pe,c,55126.0,34.766667,1,3.632069,6,13.671772,44,0.944882,0.724409,0.207723,b
3,4,11,c,51690.0,15.808333,1,5.372942,16,4.333286,50,0.889764,0.444882,0.361216,b
4,5,kl,d,28347.0,40.941667,3,3.822477,20,5.967121,38,0.744094,0.661417,0.068033,a
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,1996,vv,c,51017.0,38.233333,1,5.454545,34,14.013489,41,0.881890,0.744094,0.104838,b
1996,1997,kl,d,26576.0,33.358333,2,3.632069,20,8.380497,38,0.944882,0.877953,0.063851,a
1997,1998,kl,d,28595.0,39.425000,3,7.168218,99,4.626950,36,0.759843,0.744094,0.098703,f
1998,1999,qp,c,67949.0,5.733333,0,8.936292,26,3.281439,46,0.909449,0.598425,0.117803,c


In [14]:
# Cross-Validate
kf = KFold(n_splits=5, shuffle=True, random_state=42)
kf

splitted = kf.split(df["product"])
for i in splitted:
    print(i)

(array([   0,    1,    2, ..., 1997, 1998, 1999]), array([  23,   29,   30,   32,   44,   45,   49,   56,   59,   63,   65,
         67,   69,   70,   73,   76,   78,   99,  100,  109,  111,  115,
        120,  123,  124,  128,  135,  162,  163,  168,  173,  175,  185,
        188,  194,  196,  203,  210,  211,  212,  218,  220,  231,  233,
        237,  239,  247,  251,  254,  256,  261,  266,  270,  275,  281,
        289,  297,  298,  300,  303,  305,  306,  307,  316,  322,  324,
        331,  342,  344,  350,  351,  352,  353,  354,  361,  366,  367,
        368,  374,  382,  383,  393,  394,  411,  414,  416,  422,  427,
        429,  432,  433,  438,  450,  453,  462,  464,  471,  478,  479,
        480,  482,  485,  494,  495,  507,  514,  519,  526,  527,  529,
        530,  534,  535,  538,  543,  544,  552,  554,  555,  570,  572,
        579,  581,  582,  583,  584,  585,  591,  599,  602,  607,  610,
        611,  613,  617,  618,  620,  628,  630,  637,  651,  654,  670,


## D

In [18]:
from scipy.stats import zscore

# Generate dummies for job
df = pd.concat([df,pd.get_dummies(df['job'],prefix="job",dtype=int)],axis=1)
df.drop('job', axis=1, inplace=True)

# Generate dummies for area
df = pd.concat([df,pd.get_dummies(df['area'],prefix="area",dtype=int)],axis=1)
df.drop('area', axis=1, inplace=True)

# Generate dummies for product
df = pd.concat([df,pd.get_dummies(df['product'],prefix="product",dtype=int)],axis=1)
df.drop('product', axis=1, inplace=True)

# Missing values for income
med = df['income'].median()
df['income'] = df['income'].fillna(med)

# Standardize ranges
df['income'] = zscore(df['income'])
df['aspect'] = zscore(df['aspect'])
df['save_rate'] = zscore(df['save_rate'])
df['subscriptions'] = zscore(df['subscriptions'])

In [24]:
import copy
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler

# Convert to PyTorch Tensors
x_columns = df.columns.drop(['age', 'id'])
x = torch.tensor(df[x_columns].values, dtype=torch.float32, device=device)
y = torch.tensor(df['age'].values, dtype=torch.float32, device=device).view(-1, 1)

# Set random seed for reproducibility
torch.manual_seed(42)

# Cross-Validate
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Early stopping parameters
patience = 10

fold = 0
for train_idx, test_idx in kf.split(x):
    fold += 1
    print(f"Fold #{fold}")

    x_train, x_test = x[train_idx], x[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]

    # PyTorch DataLoader
    train_dataset = TensorDataset(x_train, y_train)
    train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

    # Create the model and optimizer
    model = nn.Sequential(
        nn.Linear(x.shape[1], 20),
        nn.ReLU(),
        nn.Linear(20, 10),
        nn.ReLU(),
        nn.Linear(10, 1)
    )
    model = torch.compile(model,backend="aot_eager").to(device)


    optimizer = optim.Adam(model.parameters(), lr=0.001)
    loss_fn = nn.MSELoss()

    # Early Stopping variables
    best_loss = float('inf')
    early_stopping_counter = 0

    # Training loop
    EPOCHS = 500
    epoch = 0
    done = False
    es = EarlyStopping()

    while not done and epoch<EPOCHS:
        epoch += 1
        model.train()
        for x_batch, y_batch in train_loader:
            optimizer.zero_grad()
            output = model(x_batch)
            loss = loss_fn(output, y_batch)
            loss.backward()
            optimizer.step()

        # Validation
        model.eval()
        with torch.no_grad():
            val_output = model(x_test)
            val_loss = loss_fn(val_output, y_test)

        if es(model, val_loss):
            done = True

    print(f"Epoch {epoch}/{EPOCHS}, Validation Loss: "
      f"{val_loss.item()}, {es.status}")

# Final evaluation
model.eval()
with torch.no_grad():
    oos_pred = model(x_test)
score = torch.sqrt(loss_fn(oos_pred, y_test)).item()
print(f"Fold score (RMSE): {score}")

Fold #1
Epoch 157/500, Validation Loss: 0.7110837697982788, Early stopping triggered after 5 epochs.
Fold #2
Epoch 149/500, Validation Loss: 0.4980891942977905, Early stopping triggered after 5 epochs.
Fold #3
Epoch 151/500, Validation Loss: 0.7317754626274109, Early stopping triggered after 5 epochs.
Fold #4
Epoch 191/500, Validation Loss: 0.42949116230010986, Early stopping triggered after 5 epochs.
Fold #5
Epoch 139/500, Validation Loss: 1.2475146055221558, Early stopping triggered after 5 epochs.
Fold score (RMSE): 1.1104768514633179


**Příklad**

> Natrénujte neuronové sítě pro predikci hodnot a2 a a14. Jako nezávislé proměnné použijte 's3','a8','a9','a10','a11','a12','a13','a15'.
>
> Použijte https://archive.ics.uci.edu/dataset/27/credit+approval