In [1]:
# import libraries to generate data
from sklearn.datasets import make_classification
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [2]:
# Build a synthetic classification dataset: 10,000 samples with 20 features,
# 5 of them informative and 15 redundant
X, y = make_classification(
    n_samples=10000,
    n_features=20,
    n_informative=5,
    n_redundant=15,
    random_state=4,
)

# Hold out 30% of the samples for testing
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=4
)

# Report the shape of each split
print(*(split.shape for split in (X_train, X_test, y_train, y_test)))

In [3]:
# Candidate tree depths to try: 1 through 30 inclusive
values = list(range(1, 31))

In [4]:
# Accumulators for the accuracy scores on the train and test splits
train_scores, test_scores = [], []

In [5]:
# Fit one decision tree per depth in `values` and record its accuracy on the
# training split and on the held-out test split.

# Start from empty lists so that re-running this cell does not append a second
# set of scores (the plot needs len(scores) == len(values)).
train_scores = []
test_scores = []

for i in values:
    model = DecisionTreeClassifier(max_depth=i, random_state=4)

    print(f"[INFO] training model with max_depth = {i}")
    # fit the model
    model.fit(X_train, y_train)

    # accuracy on the data the tree was trained on
    t_y_hat = model.predict(X_train)
    train_acc = accuracy_score(y_train, t_y_hat)
    train_scores.append(train_acc)

    # accuracy on the held-out test split
    test_y_hat = model.predict(X_test)
    test_acc = accuracy_score(y_test, test_y_hat)
    test_scores.append(test_acc)

In [6]:
## Let's plot the results
import matplotlib.pyplot as plt

# Train vs. test accuracy as a function of the tree's max_depth
plt.plot(values, train_scores, '-o', label="train")
plt.plot(values, test_scores, '-o', label="test")
# Title and axis labels so the figure can be read on its own
plt.title('Decision Tree Accuracy vs. max_depth')
plt.xlabel('max_depth')
plt.ylabel('accuracy')
plt.legend()
plt.show()

In [7]:
import numpy as np

# Reproducible toy regression data: 100 sorted x-values in [0, 5) and a sine
# curve with Gaussian noise (standard deviation 0.5) as the target
np.random.seed(42)
X = 5 * np.random.rand(100, 1)
X.sort(axis=0)
y = np.sin(X).ravel() + 0.5 * np.random.randn(100)

In [8]:
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures

# Expand X with a degree-1 polynomial basis
poly = PolynomialFeatures(1)
X_poly = poly.fit_transform(X)

# Least-squares linear model on the expanded features
model = LinearRegression()
model.fit(X=X_poly, y=y)

LinearRegression()

In [9]:
import matplotlib.pyplot as plt

# Observed points in blue, the model's fitted curve in red
plt.scatter(X, y, color='blue')
plt.plot(X, model.predict(X_poly), color='red', lw=2)
plt.gca().set(
    title='Underfitting Linear Regression Model',
    xlabel='X',
    ylabel='y',
)
plt.show()

In [10]:
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures

# Expand X with a degree-5 polynomial basis
poly = PolynomialFeatures(5)
X_poly = poly.fit_transform(X)

# Least-squares linear model on the expanded features
model = LinearRegression()
model.fit(X=X_poly, y=y)

LinearRegression()

In [11]:
import matplotlib.pyplot as plt

# Observed points in blue, the model's fitted curve in red
plt.scatter(X, y, color='blue')
plt.plot(X, model.predict(X_poly), color='red', lw=2)
plt.gca().set(
    title='Improved Linear Regression Model',
    xlabel='X',
    ylabel='y',
)
plt.show()

In [12]:
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

from sklearn.model_selection import (
    train_test_split,
    TimeSeriesSplit,
    KFold,
    StratifiedKFold,
    GroupKFold,
    StratifiedGroupKFold,
)

In [13]:
## Source https://www.kaggle.com/code/robikscube/cross-validation-visualized-youtube-tutorial/notebook and Scikit-Learn documentation


from matplotlib.patches import Patch
# Colormaps shared by the plotting helpers: one for class/group labels,
# one for the train/test membership of each sample
cmap_data, cmap_cv = plt.cm.Paired, plt.cm.coolwarm

def visualize_groups(classes, groups, name):
    """Draw the group and class of every sample as two colored strips.

    Parameters
    ----------
    classes : sequence
        Per-sample values that color the "Data class" strip.
    groups : sequence
        Per-sample values that color the "Data group" strip; its length
        determines how many points are drawn in both strips.
    name : object
        Accepted but not used by this function.
    """
    fig, ax = plt.subplots()
    n_samples = len(groups)
    xs = range(n_samples)

    # (y position, color values) per strip: groups first, then classes
    for height, colors in ((0.5, groups), (3.5, classes)):
        ax.scatter(
            xs,
            [height] * n_samples,
            c=colors,
            marker="_",
            lw=50,
            cmap=cmap_data,
        )

    ax.set(
        ylim=[-1, 5],
        yticks=[0.5, 3.5],
        yticklabels=["Data\ngroup", "Data\nclass"],
        xlabel="Sample index",
    )


def plot_cv_indices(cv, X, y, group, ax, n_splits, lw=25):
    """Create a sample plot for indices of a cross-validation object.

    One row is drawn per split produced by ``cv.split``, coloring each sample
    by whether it falls in the training or the test fold. Two further rows,
    placed at the "class" and "group" ticks, show ``y`` and ``group``.

    Parameters
    ----------
    cv : object
        Splitter exposing ``split(X=..., y=..., groups=...)`` that yields
        ``(train_indices, test_indices)`` pairs.
    X : sized
        The samples; only ``len(X)`` is used for drawing.
    y : array-like
        Per-sample class labels, used to color the "class" row.
    group : array-like
        Per-sample group labels, used to color the "group" row.
    ax : axes-like
        Object providing ``scatter``, ``set`` and ``set_title``.
    n_splits : int
        Number of splits; sets the tick labels, the y-limits and the position
        of the class/group rows.
    lw : int, default=25
        Line width of the markers.

    Returns
    -------
    ax
        The same ``ax`` that was passed in.
    """
    n_samples = len(X)
    sample_idx = range(n_samples)

    # Generate the training/testing visualizations for each CV split
    for ii, (tr, tt) in enumerate(cv.split(X=X, y=y, groups=group)):
        # NaN marks samples in neither fold; test -> 1, train -> 0
        indices = np.full(n_samples, np.nan)
        indices[tt] = 1
        indices[tr] = 0

        ax.scatter(
            sample_idx,
            [ii + 0.5] * n_samples,
            c=indices,
            marker="_",
            lw=lw,
            cmap=cmap_cv,
            vmin=-0.2,
            vmax=1.2,
        )

    # Class and group rows are positioned from n_splits (not from the loop
    # variable) so they always line up with the "class"/"group" tick labels
    # and the function still works if the splitter yields no splits.
    ax.scatter(
        sample_idx, [n_splits + 0.5] * n_samples, c=y, marker="_", lw=lw, cmap=cmap_data
    )
    ax.scatter(
        sample_idx, [n_splits + 1.5] * n_samples, c=group, marker="_", lw=lw, cmap=cmap_data
    )

    # Formatting; the reversed ylim puts split 0 at the top of the axes
    yticklabels = list(range(n_splits)) + ["class", "group"]
    ax.set(
        yticks=np.arange(n_splits + 2) + 0.5,
        yticklabels=yticklabels,
        xlabel="Sample index",
        ylabel="CV iteration",
        ylim=[n_splits + 2.2, -0.2],
        # span the actual number of samples rather than a fixed 100
        xlim=[0, n_samples],
    )
    ax.set_title("{}".format(type(cv).__name__), fontsize=15)
    return ax


def plot_cv(cv, X, y, groups, n_splits=5):
    """Instantiate a splitter and show how it partitions the samples.

    Parameters
    ----------
    cv : callable
        Splitter class (or factory) called as ``cv(n_splits=n_splits)``.
    X, y, groups
        Data, class labels and group labels forwarded to ``plot_cv_indices``.
    n_splits : int, default=5
        Number of splits to create and draw.
    """
    splitter = cv(n_splits=n_splits)
    fig, ax = plt.subplots(figsize=(15, 5))
    plot_cv_indices(splitter, X, y, groups, ax, n_splits)

    # Legend patches sampled from the same colormap used for the fold rows
    legend_handles = [Patch(color=cmap_cv(0.8)), Patch(color=cmap_cv(0.02))]
    legend_labels = ["Testing set", "Training set"]
    ax.legend(legend_handles, legend_labels, loc=(1.02, 0.8))

    plt.tight_layout()
    # Leave room on the right for the legend placed outside the axes
    fig.subplots_adjust(right=0.7)
    plt.show()
    
def get_fake_X_y(n_points=100):
    """Generate a toy imbalanced dataset with class and group labels.

    Parameters
    ----------
    n_points : int, default=100
        Number of samples to generate.

    Returns
    -------
    X_ : ndarray of shape (n_points, 10)
        Random features drawn with ``np.random.randn``.
    y_ : ndarray of shape (n_points,)
        Class labels: the first ``n_points // 10`` samples are class 0 and
        the remaining samples are class 1.
    groups_ : ndarray of shape (n_points,)
        Group ids 0..9 assigned to contiguous blocks of samples
        (ten samples per group for the default ``n_points``).
    """
    X_ = np.random.randn(n_points, 10)

    # 10% / 90% class split; the majority class absorbs any rounding remainder
    n_minority = n_points // 10
    y_ = np.repeat([0, 1], [n_minority, n_points - n_minority])

    # Map sample i to group floor(10 * i / n_points): ten contiguous blocks
    groups_ = np.arange(n_points) * 10 // n_points
    return X_, y_, groups_

In [14]:
# Toy classification data: 100 samples with 10 standard-normal features and
# integer labels drawn from {0, 1, 2}
X = np.random.standard_normal((100, 10))
y = np.random.randint(low=0, high=3, size=100)

In [15]:
# number of cross-validation folds
k = 5

# per-fold accuracy scores are collected here
scores = []

# plain (non-stratified) K-fold splitter that shuffles before splitting
skf = KFold(n_splits=k, shuffle=True)

In [16]:
# Cross-validate a logistic regression with the splitter in `skf`.

# Reset the score list so that re-running this cell reports exactly k folds
# instead of appending to the scores from a previous run.
scores = []

# loop over each fold
for train_index, test_index in skf.split(X, y):
    # split the data into training and testing sets
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Fit a logistic regression on the training data. The `multi_class`
    # argument is deprecated in recent scikit-learn; with the lbfgs solver a
    # problem with more than two classes is fitted as multinomial by default.
    model = LogisticRegression(solver='lbfgs')
    model.fit(X_train, y_train)

    # make predictions on the testing data
    y_pred = model.predict(X_test)

    # compute the accuracy score of the model on the testing data
    score = accuracy_score(y_test, y_pred)

    # add the score to the list of cross-validation scores
    scores.append(score)

    # len(scores) is the 1-based index of the fold just evaluated
    print(f"Fold: {len(scores)}/{k}: Accuracy: {score:.3f}")

# compute the mean and standard deviation of the cross-validation scores
mean_score = np.mean(scores)
std_score = np.std(scores)

# print the mean and standard deviation of the cross-validation scores
print(f"Mean accuracy: {mean_score:.3f}, Standard deviation: {std_score:.3f}")

In [17]:
# Default-configured splitter instance; plot_cv below is given the class
# itself and builds its own instance
kf = KFold()

# Visualize how KFold partitions the fake dataset
X_, y_, groups_ = get_fake_X_y()
plot_cv(cv=KFold, X=X_, y=y_, groups=groups_)

In [18]:
# number of cross-validation folds
k = 5

# per-fold accuracy scores are collected here
scores = []

# stratified K-fold splitter that shuffles before splitting
skf = StratifiedKFold(n_splits=k, shuffle=True)

In [19]:
# Cross-validate a logistic regression with the splitter in `skf`.

# Reset the score list so that re-running this cell reports exactly k folds
# instead of appending to the scores from a previous run.
scores = []

# loop over each fold
for train_index, test_index in skf.split(X, y):
    # split the data into training and testing sets
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Fit a logistic regression on the training data. The `multi_class`
    # argument is deprecated in recent scikit-learn; with the lbfgs solver a
    # problem with more than two classes is fitted as multinomial by default.
    model = LogisticRegression(solver='lbfgs')
    model.fit(X_train, y_train)

    # make predictions on the testing data
    y_pred = model.predict(X_test)

    # compute the accuracy score of the model on the testing data
    score = accuracy_score(y_test, y_pred)

    # add the score to the list of cross-validation scores
    scores.append(score)

    # len(scores) is the 1-based index of the fold just evaluated
    print(f"Fold: {len(scores)}/{k}: Accuracy: {score:.3f}")

# compute the mean and standard deviation of the cross-validation scores
mean_score = np.mean(scores)
std_score = np.std(scores)

# print the mean and standard deviation of the cross-validation scores
print(f"Mean accuracy: {mean_score:.3f}, Standard deviation: {std_score:.3f}")

In [20]:
# Rebind `skf` to a default-configured StratifiedKFold; plot_cv below is
# given the class itself and builds its own instance
skf = StratifiedKFold()

# Visualize how StratifiedKFold partitions the fake dataset
X_, y_, groups_ = get_fake_X_y()
plot_cv(cv=StratifiedKFold, X=X_, y=y_, groups=groups_)

In [21]:
# number of cross-validation folds
k = 5

# group-aware K-fold splitter
skf = GroupKFold(n_splits=k)

# per-fold accuracy scores are collected here
scores = []

# group label of each sample: ids 0..9, ten consecutive samples per group
groups = np.repeat(np.arange(10), 10)
groups

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4,
       4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6,
       6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 8, 8,
       8, 8, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9])

In [22]:
# Cross-validate a logistic regression with the group-aware splitter in `skf`.

# Reset the score list so that re-running this cell reports exactly k folds
# instead of appending to the scores from a previous run.
scores = []

# loop over each fold; the split is made with respect to `groups`
for train_index, test_index in skf.split(X, y, groups=groups):
    # split the data into training and testing sets
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Fit a logistic regression on the training data. The `multi_class`
    # argument is deprecated in recent scikit-learn; with the lbfgs solver a
    # problem with more than two classes is fitted as multinomial by default.
    model = LogisticRegression(solver='lbfgs')
    model.fit(X_train, y_train)

    # make predictions on the testing data
    y_pred = model.predict(X_test)

    # compute the accuracy score of the model on the testing data
    score = accuracy_score(y_test, y_pred)

    # add the score to the list of cross-validation scores
    scores.append(score)

    # len(scores) is the 1-based index of the fold just evaluated
    print(f"Fold: {len(scores)}/{k}: Accuracy: {score:.3f}")

# compute the mean and standard deviation of the cross-validation scores
mean_score = np.mean(scores)
std_score = np.std(scores)

# print the mean and standard deviation of the cross-validation scores
print(f"Mean accuracy: {mean_score:.3f}, Standard deviation: {std_score:.3f}")

In [23]:
# Default-configured splitter instance; plot_cv below is given the class
# itself and builds its own instance
gskf = StratifiedGroupKFold()

X_, y_, groups_ = get_fake_X_y()
# Shuffle the class labels in place before plotting
np.random.shuffle(y_)
plot_cv(cv=StratifiedGroupKFold, X=X_, y=y_, groups=groups_)

In [24]:
# we will use tensorflow and work with the MNIST dataset
import wandb
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

In [25]:
# Load MNIST as (images, labels) pairs for the training and test splits
(x_train, y_train), (x_test, y_test) = keras.datasets.mnist.load_data()

In [26]:
## examine the data: shapes of the image and label arrays for both splits
tuple(arr.shape for arr in (x_train, y_train, x_test, y_test))

((60000, 28, 28), (60000,), (10000, 28, 28), (10000,))

In [27]:
# Display the raw pixel array of the first training image
x_train[0]

array([[  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0],
       [  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0],
       [  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0],
       [  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0],
       [  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0],
       [  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   3,
         18,  18,  18, 126, 136, 175,  26, 166, 255, 247, 127,   0,   0,
          0,   0],
       [  

In [28]:
## Let's look at some of the images
import random
import matplotlib.pyplot as plt

# 5x5 grid of panels, each showing one randomly chosen training image
fig, axes = plt.subplots(nrows=5, ncols=5, figsize=(10, 10))
axes = axes.flatten()
for ax in axes:
    img = random.choice(x_train)
    ax.imshow(img, cmap='gray')
    # hide ticks and frame; only the digit should be visible
    ax.set_axis_off()

In [29]:
# Add a trailing channel axis, convert to float32 and scale pixels by 1/255.
# NOTE: this cell overwrites its own inputs, so running it twice rescales the
# images a second time and re-encodes the already one-hot labels.
x_train = x_train.reshape((-1, 28, 28, 1)).astype("float32") / 255.0
x_test = x_test.reshape((-1, 28, 28, 1)).astype("float32") / 255.0

# One-hot encode the labels over 10 classes
y_train = keras.utils.to_categorical(y_train, num_classes=10)
y_test = keras.utils.to_categorical(y_test, num_classes=10)

In [30]:
def create_model(input_shape=(28, 28, 1), num_classes=10, dropout_rate=0.5):
    """Build a small, uncompiled convolutional classifier.

    Architecture: two Conv2D (3x3, ReLU) + MaxPooling2D (2x2) stages with 32
    and 64 filters, then Flatten, Dropout and a softmax Dense output layer.

    Parameters
    ----------
    input_shape : tuple, default=(28, 28, 1)
        Shape of one input sample, without the batch dimension.
    num_classes : int, default=10
        Number of units in the softmax output layer.
    dropout_rate : float, default=0.5
        Rate passed to the Dropout layer before the output layer.

    Returns
    -------
    keras.Sequential
        The assembled model; the caller is responsible for compiling it.
    """
    # keras.Input(shape=...) declares the input in place of
    # InputLayer(input_shape=...), whose `input_shape` argument is deprecated
    # in Keras 3.
    return keras.Sequential(
        [
            keras.Input(shape=input_shape),
            layers.Conv2D(32, kernel_size=(3, 3), activation="relu"),
            layers.MaxPooling2D(pool_size=(2, 2)),
            layers.Conv2D(64, kernel_size=(3, 3), activation="relu"),
            layers.MaxPooling2D(pool_size=(2, 2)),
            layers.Flatten(),
            layers.Dropout(dropout_rate),
            layers.Dense(num_classes, activation="softmax"),
        ]
    )

In [31]:
#### Print the model summary
# Build a fresh, uncompiled model and print its summary
model = create_model()
model.summary()

In [32]:
# Training configuration for the first run (also passed to wandb as the run config)
hyperparams = dict(learning_rate=0.001, batch_size=128, epochs=10)

In [33]:
# log results in wandb
import os

# wandb looks up the notebook name in the WANDB_NOTEBOOK_NAME *environment
# variable*; assigning a Python variable alone has no effect on wandb.
# NOTE(review): confirm this value matches the actual notebook name/path.
WANDB_NOTEBOOK_NAME = "mnist-classification"
os.environ["WANDB_NOTEBOOK_NAME"] = WANDB_NOTEBOOK_NAME

# start a run and record the hyperparameters as its config
wandb.init(project="mnist-classification", config=hyperparams)

# fresh model for this run
model = create_model()

optimizer = tf.keras.optimizers.Adam(learning_rate=hyperparams["learning_rate"])

model.compile(
    optimizer=optimizer, loss="categorical_crossentropy", metrics=["accuracy"]
)

# wandb's Keras callback, passed to model.fit; model saving is turned off
callbacks = [
    wandb.keras.WandbCallback(save_model=False)
]

In [34]:
# Train with the configured batch size and epoch count; the test split is
# used as validation data
history = model.fit(
    x=x_train,
    y=y_train,
    batch_size=hyperparams["batch_size"],
    epochs=hyperparams["epochs"],
    validation_data=(x_test, y_test),
    callbacks=callbacks,
)

# Log the validation loss and accuracy of the last epoch
wandb.log(
    {metric: history.history[metric][-1] for metric in ("val_loss", "val_accuracy")}
)

In [35]:
# Training configuration for the second run: same as before, but 20 epochs
hyperparams = dict(learning_rate=0.001, batch_size=128, epochs=20)

In [36]:
# log results in wandb
import os

# wandb looks up the notebook name in the WANDB_NOTEBOOK_NAME *environment
# variable*; assigning a Python variable alone has no effect on wandb.
# NOTE(review): confirm this value matches the actual notebook name/path.
WANDB_NOTEBOOK_NAME = "mnist-classification"
os.environ["WANDB_NOTEBOOK_NAME"] = WANDB_NOTEBOOK_NAME

# start a run and record the hyperparameters as its config
wandb.init(project="mnist-classification", config=hyperparams)

# fresh model for this run
model = create_model()

optimizer = tf.keras.optimizers.Adam(learning_rate=hyperparams["learning_rate"])

model.compile(
    optimizer=optimizer, loss="categorical_crossentropy", metrics=["accuracy"]
)

# wandb's Keras callback, passed to model.fit; model saving is turned off
callbacks = [
    wandb.keras.WandbCallback(save_model=False)
]