In [1]:
from sklearn.datasets import load_breast_cancer # The data set used
from sklearn.preprocessing import StandardScaler # Normalizes inputs
from sklearn.preprocessing import PolynomialFeatures # Combine features of observable domain and construct new domain
from sklearn.ensemble import RandomForestClassifier # Creates an aggregation of decision trees
from sklearn.feature_selection import RFECV # Recursive Feature Elimination with Cross Validation (eliminate feature noise)
from sklearn.model_selection import train_test_split # Splits test data and training data to avoid overfitting

In [2]:
# Fetch scikit-learn's bundled breast cancer dataset
cancer = load_breast_cancer()

# Standardize every feature column: learn the per-column statistics first,
# then apply them to the same matrix
x_scaled = StandardScaler().fit(cancer.data).transform(cancer.data)

print("Original data (rows, features):", x_scaled.shape)

Original data (rows, features): (569, 30)


In [3]:
%%time
# Expand the scaled features into every polynomial term up to degree 2
# (constant column, the original features, their squares and pairwise products)
poly = PolynomialFeatures(degree = 2)
x_poly = poly.fit(x_scaled).transform(x_scaled)

print("All polynomial featues (order 2):", x_poly.shape)

All polynomial featues (order 2): (569, 496)
Wall time: 6.98 ms


In [4]:
%%time
# Base estimator that ranks the features: a small random forest
# (10 trees, depth capped at 7, fixed seed so the run can be replicated)
rfc = RandomForestClassifier(n_estimators = 10, max_depth = 7, random_state = 1)

# Recursive feature elimination driven by that forest, scored with 5-fold
# cross-validation and run on all available cores
rfecv = RFECV(estimator = rfc, cv = 5, n_jobs = -1)

# NOTE(review): the selector is fitted on every row, before any train/test split
# is made, so rows later used for testing influence which features are kept.
rfecv.fit(x_poly, cancer.target)
x_poly_top = rfecv.transform(x_poly) # keep only the selected columns

print("Best polynomial features: ", x_poly_top.shape)

Best polynomial features:  (569, 278)
Wall time: 36 s


In [5]:
%%time
# Hold out a test set from the selected polynomial features (seed 42 for reproducibility)
x_train, x_test, y_train, y_test = train_test_split(
    x_poly_top, cancer.target, random_state = 42
)

# Fresh forest with the same hyper-parameters as the one used for feature selection
rfc = RandomForestClassifier(n_estimators = 10, max_depth = 7, random_state = 1)

# Fit on the training rows, then measure accuracy on the held-out rows
rfc.fit(x_train, y_train)
acc = rfc.score(x_test, y_test)

print("Test accuracy: {:.0f}%".format(100. * acc))

Test accuracy: 94%
Wall time: 42.9 ms


In [6]:
# Split the original 30-feature data (not the selected polynomial features) into
# train/test sets with a random_state of 42 for reproducibility.
# This runs first so the sizes computed below describe the data the network is
# actually trained on, instead of whatever x_train an earlier cell left behind.
# NOTE(review): this feeds cancer.data to the network, not x_scaled - confirm
# that skipping standardization here is intended.
x_train, x_test, y_train, y_test = train_test_split(cancer.data, cancer.target, random_state = 42)

batch_size = 32 # How many observations at a time for the neural network
in_dim = cancer.data.shape[1] # Size of the input layer (one input per original feature)
hidden1 = x_poly_top.shape[1] # First hidden layer width = number of selected polynomial features
hidden2 = 20 # Width of the second hidden layer
out_dim = 1 # One binary output

batches_in_data = x_train.shape[0] / batch_size # Batches needed to cover the training rows once (float)
epochs = int(5000 / batches_in_data) # Passes over the training rows that add up to about 5000 batches
learning_rate = 1e-4 # Step size handed to the optimizer

cancer.data.shape

(569, 30)

In [7]:
import numpy as np
import torch
from torch.autograd import Variable

In [8]:
# Describe the network as an ordered list of layers, then chain them in a Sequential container
network_layers = [
    # Dense layer from the raw inputs to hidden1 units (stands in for polynomial feature generation)
    torch.nn.Linear(in_dim, hidden1),
    # ReLU activation
    torch.nn.ReLU(),
    # Dense layer that condenses hidden1 units down to hidden2 units
    torch.nn.Linear(hidden1, hidden2),
    # Leaky ReLU activation
    torch.nn.LeakyReLU(),
    # Dropout against over-fitting: randomly silences 25% of the activations during training
    # so the network cannot lean too heavily on any single neuron
    torch.nn.Dropout(p = 0.25),
    # Dense output layer producing a single value per observation
    torch.nn.Linear(hidden2, out_dim),
    # Sigmoid squashes that value into (0, 1) for the binary decision
    torch.nn.Sigmoid(),
]
model_t = torch.nn.Sequential(*network_layers)
# Loss function: measures how much error the previous forward pass made
# Optimizer: applies the back-propagated gradients to the weights

In [9]:
# Echo the model so the notebook renders its layer-by-layer structure
model_t

Sequential(
  (0): Linear(in_features=30, out_features=278, bias=True)
  (1): ReLU()
  (2): Linear(in_features=278, out_features=20, bias=True)
  (3): LeakyReLU(negative_slope=0.01)
  (4): Dropout(p=0.25, inplace=False)
  (5): Linear(in_features=20, out_features=1, bias=True)
  (6): Sigmoid()
)

In [10]:
from torch import device, cuda
from torchsummary import summary

# Pick the GPU when PyTorch reports a usable CUDA device, otherwise stay on the CPU,
# and place the model there (a no-op when it is already on that device)
summary_device = device('cuda' if cuda.is_available() else 'cpu')
model_t = model_t.to(summary_device)

# Print a per-layer summary (output shapes and parameter counts) for one in_dim-wide input row
summary(model_t, input_size = (1, in_dim))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Linear-1               [-1, 1, 278]           8,618
              ReLU-2               [-1, 1, 278]               0
            Linear-3                [-1, 1, 20]           5,580
         LeakyReLU-4                [-1, 1, 20]               0
           Dropout-5                [-1, 1, 20]               0
            Linear-6                 [-1, 1, 1]              21
           Sigmoid-7                 [-1, 1, 1]               0
Total params: 14,219
Trainable params: 14,219
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 0.00
Forward/backward pass size (MB): 0.00
Params size (MB): 0.05
Estimated Total Size (MB): 0.06
----------------------------------------------------------------


In [27]:
show_every = 250 # How often (in epochs) progress information is displayed


def _binary_accuracy(probabilities, labels):
    """Return the fraction of rows whose thresholded probability equals its label.

    probabilities -- tensor of model outputs, one value per row
    labels        -- 1-D numpy array of 0/1 targets in the same row order
    """
    # Vectorized 0.5 threshold; avoids converting one-element arrays to int row by row
    predicted = (probabilities.detach().cpu().numpy().ravel() > 0.5).astype(int)
    return (predicted == labels).mean()


def train(n_epochs = 5000):
    """Train the global ``model_t`` with full-batch gradient steps, printing progress.

    Reads the notebook globals ``model_t``, ``loss_fn``, ``optimizer``,
    ``x_train_torch``/``y_train_torch``, ``x_test_torch``, ``y_train``, ``y_test``
    and ``show_every``; they must exist before this is called.

    n_epochs -- number of optimization steps to run (defaults to the original 5000)

    Every ``show_every`` epochs the train and test accuracy are printed. Those
    accuracies are computed with the model switched to evaluation mode and with
    gradient tracking off, so Dropout does not randomly perturb the reported
    numbers; the model's previous mode is restored before the weight update.
    """
    for epoch in range(n_epochs):
        # Forward pass on the whole training set and loss against the targets
        y_pred = model_t(x_train_torch)
        loss = loss_fn(y_pred, y_train_torch)

        if not epoch % show_every: # periodic progress report
            was_training = model_t.training
            model_t.eval() # disable Dropout while measuring accuracy
            try:
                with torch.no_grad(): # evaluation needs no autograd graph
                    test_accuracy = _binary_accuracy(model_t(x_test_torch), y_test)
                    train_accuracy = _binary_accuracy(model_t(x_train_torch), y_train)
            finally:
                model_t.train(was_training) # put the model back the way it was
            print("Batch: %04d | Training Loss: %6.2f | Train Accuracy: %.4f | Test Accuracy: %.4f"%(epoch, loss.item(), train_accuracy, test_accuracy)) # Print out results

        # Remove gradients left over from the previous pass
        optimizer.zero_grad()

        # Backward pass
        loss.backward()

        # Update weights
        optimizer.step()

In [30]:
%%time

# Choose the compute device once: the GPU when CUDA is usable, the CPU otherwise
train_device = device('cuda' if cuda.is_available() else 'cpu')

# Convert the numpy splits to float tensors on that device; the targets are
# reshaped into a single column so they line up with the model's one-value-per-row output
x_train_torch = torch.from_numpy(x_train).float().to(train_device)
y_train_torch = torch.from_numpy(y_train.reshape(-1, 1)).float().to(train_device)
x_test_torch = torch.from_numpy(x_test).float().to(train_device)
y_test_torch = torch.from_numpy(y_test.reshape(-1, 1)).float().to(train_device)

# Summed squared error between predictions and targets
loss_fn = torch.nn.MSELoss(reduction = 'sum')
# Adam optimizer over all model weights, using the configured learning rate
optimizer = torch.optim.Adam(model_t.parameters(), lr = learning_rate)
# Run the training loop
train()

Batch: 0000 | Training Loss:   3.27 | Train Accuracy: 0.9906 | Test Accuracy: 0.9510
Batch: 0250 | Training Loss:   2.88 | Train Accuracy: 0.9953 | Test Accuracy: 0.9441
Batch: 0500 | Training Loss:   2.82 | Train Accuracy: 0.9953 | Test Accuracy: 0.9441
Batch: 0750 | Training Loss:   2.87 | Train Accuracy: 0.9930 | Test Accuracy: 0.9510
Batch: 1000 | Training Loss:   2.15 | Train Accuracy: 0.9977 | Test Accuracy: 0.9301
Batch: 1250 | Training Loss:   2.09 | Train Accuracy: 0.9953 | Test Accuracy: 0.9371
Batch: 1500 | Training Loss:   1.94 | Train Accuracy: 0.9977 | Test Accuracy: 0.9301
Batch: 1750 | Training Loss:   2.04 | Train Accuracy: 0.9977 | Test Accuracy: 0.9441
Batch: 2000 | Training Loss:   1.90 | Train Accuracy: 0.9953 | Test Accuracy: 0.9510
Batch: 2250 | Training Loss:   2.19 | Train Accuracy: 0.9953 | Test Accuracy: 0.9650
Batch: 2500 | Training Loss:   2.04 | Train Accuracy: 0.9977 | Test Accuracy: 0.9371
Batch: 2750 | Training Loss:   1.70 | Train Accuracy: 0.9953 | Te