# Build the Speech Model

Now that we have created the spectrogram images its time to build the computer vision model. If you are following along with the learning path then you already created a computer vision model in the second module in this path. We will be using the [torchvision]() package to build our vision model. Lets import the packages we need to build the model.

In [1]:
from torch.utils.data import DataLoader
from torchvision import datasets, transforms
import torch
import torchaudio
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import matplotlib.pyplot as plt
from torch.utils.data import Dataset, DataLoader
from torchvision import datasets, models, transforms
import pandas as pd
import os



## Load Spectorgram Images into DataLoader for Training

Here we provide the path to our image data and use the [ImageFolder]() helper to load the images into tensors. The labels are created based on the name of the folders.

In [2]:
data_path = './.data/spectrograms' #looking in subfolder train

yes_no_dataset = datasets.ImageFolder(
    root=data_path,
    transform=transforms.Compose([transforms.Resize((201,81)),
                                  transforms.ToTensor()
                                  ])
)
print(yes_no_dataset)
print(yes_no_dataset[5][0].size())

Dataset ImageFolder
    Number of datapoints: 7985
    Root location: ./.data/spectrograms
    StandardTransform
Transform: Compose(
               Resize(size=(201, 81), interpolation=PIL.Image.BILINEAR)
               ToTensor()
           )
torch.Size([3, 201, 81])


## Split the Data
- Split the data to use 80% to train the model and 20% to test.

In [3]:
#split data to test and train
#use 80% to train
train_size = int(0.8 * len(yes_no_dataset))
test_size = len(yes_no_dataset) - train_size
yes_no_train_dataset, yes_no_test_dataset = torch.utils.data.random_split(yes_no_dataset, [train_size, test_size])

print(len(yes_no_train_dataset))
print(len(yes_no_test_dataset))

6388
1597


- Load the data into the DataLoader

In [4]:
train_dataloader = torch.utils.data.DataLoader(
    yes_no_train_dataset,
    batch_size=15,
    num_workers=2,
    shuffle=True
)

test_dataloader = torch.utils.data.DataLoader(
    yes_no_test_dataset,
    batch_size=15,
    num_workers=2,
    shuffle=True
)

- Lets take a look at what our tensor looks like

In [5]:
train_dataloader.dataset[0][0][0][0]

tensor([0.5608, 0.4824, 0.5098, 0.4353, 0.3490, 0.4824, 0.5843, 0.4392, 0.4667,
        0.3608, 0.4431, 0.5059, 0.4667, 0.4824, 0.4745, 0.4118, 0.4431, 0.2039,
        0.4510, 0.3765, 0.3804, 0.4118, 0.4745, 0.4706, 0.4392, 0.5020, 0.5020,
        0.5137, 0.5059, 0.3922, 0.3059, 0.3804, 0.4196, 0.3647, 0.3647, 0.4392,
        0.3451, 0.3804, 0.3608, 0.4078, 0.6392, 0.6980, 0.7529, 0.5333, 0.7137,
        0.8000, 0.5882, 0.6196, 0.7373, 0.6863, 0.6863, 0.6118, 0.5922, 0.4863,
        0.5725, 0.5608, 0.5098, 0.4745, 0.4745, 0.4431, 0.4706, 0.4000, 0.4039,
        0.4314, 0.4275, 0.3333, 0.1569, 0.4353, 0.4824, 0.4588, 0.4118, 0.3961,
        0.4314, 0.4118, 0.2667, 0.3686, 0.4980, 0.4941, 0.4275, 0.4863, 0.5922])

- Get GPU for training, else use CPU if GPU is not available

In [6]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print('Using {} device'.format(device))


Using cuda device


## Create Neural Netowrk
- Create the Convolutional Neural Network and set the device.

In [7]:
class CNNet(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(3, 32, kernel_size=5)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=5)
        self.conv2_drop = nn.Dropout2d()
        self.flatten = nn.Flatten()
        self.fc1 = nn.Linear(51136, 50)
        self.fc2 = nn.Linear(50, 2)


    def forward(self, x):
        x = F.relu(F.max_pool2d(self.conv1(x), 2))
        x = F.relu(F.max_pool2d(self.conv2_drop(self.conv2(x)), 2))
        #x = x.view(x.size(0), -1)
        x = self.flatten(x)
        x = F.relu(self.fc1(x))
        x = F.dropout(x, training=self.training)
        x = F.relu(self.fc2(x))
        return F.log_softmax(x,dim=1)
    
model = CNNet().to(device)

print(model)

CNNet(
  (conv1): Conv2d(3, 32, kernel_size=(5, 5), stride=(1, 1))
  (conv2): Conv2d(32, 64, kernel_size=(5, 5), stride=(1, 1))
  (conv2_drop): Dropout2d(p=0.5, inplace=False)
  (flatten): Flatten()
  (fc1): Linear(in_features=51136, out_features=50, bias=True)
  (fc2): Linear(in_features=50, out_features=2, bias=True)
)


## Create Train and Test Functions
- Here we will set the cost function, learning_rate, and optimizer. Then set up the train and test functions that we will call next.

In [8]:
# cost function used to determine best parameters
cost = torch.nn.CrossEntropyLoss()

# used to create optimal parameters
learning_rate = 0.0001
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# Create the training function

def train(dataloader, model, loss, optimizer):
    model.train()
    size = len(dataloader.dataset)
    for batch, (X, Y) in enumerate(dataloader):
        X, Y = X.to(device), Y.to(device)
        optimizer.zero_grad()
        pred = model(X)
        loss = cost(pred, Y)
        loss.backward()
        optimizer.step()

        if batch % 100 == 0:
            loss, current = loss.item(), batch * len(X)
            print(f'loss: {loss:>7f}  [{current:>5d}/{size:>5d}]')


# Create the validation/test function

def test(dataloader, model):
    size = len(dataloader.dataset)
    model.eval()
    test_loss, correct = 0, 0

    with torch.no_grad():
        for batch, (X, Y) in enumerate(dataloader):
            X, Y = X.to(device), Y.to(device)
            pred = model(X)

            test_loss += cost(pred, Y).item()
            correct += (pred.argmax(1)==Y).type(torch.float).sum().item()

    test_loss /= size
    correct /= size

    print(f'\nTest Error:\nacc: {(100*correct):>0.1f}%, avg loss: {test_loss:>8f}\n')

## Train the Model
- Now lets set the number of epochs and call our `train` and `test` functions for each epoch.

In [9]:
epochs = 15

for t in range(epochs):
    print(f'Epoch {t+1}\n-------------------------------')
    train(train_dataloader, model, cost, optimizer)
    test(test_dataloader, model)
print('Done!')

Epoch 1
-------------------------------
loss: 0.683419  [    0/ 6388]
loss: 0.695789  [ 1500/ 6388]
loss: 0.692987  [ 3000/ 6388]
loss: 0.666870  [ 4500/ 6388]
loss: 0.692348  [ 6000/ 6388]

Test Error:
acc: 49.7%, avg loss: 0.046441

Epoch 2
-------------------------------
loss: 0.693147  [    0/ 6388]
loss: 0.693147  [ 1500/ 6388]
loss: 0.693147  [ 3000/ 6388]
loss: 0.693147  [ 4500/ 6388]
loss: 0.723022  [ 6000/ 6388]

Test Error:
acc: 49.7%, avg loss: 0.046441

Epoch 3
-------------------------------
loss: 0.693147  [    0/ 6388]
loss: 0.666938  [ 1500/ 6388]
loss: 0.773418  [ 3000/ 6388]
loss: 0.666485  [ 4500/ 6388]
loss: 0.685241  [ 6000/ 6388]

Test Error:
acc: 49.7%, avg loss: 0.046441

Epoch 4
-------------------------------
loss: 0.695083  [    0/ 6388]
loss: 0.706812  [ 1500/ 6388]
loss: 0.620915  [ 3000/ 6388]
loss: 0.488353  [ 4500/ 6388]
loss: 0.471648  [ 6000/ 6388]

Test Error:
acc: 86.9%, avg loss: 0.022696

Epoch 5
-------------------------------
loss: 0.271708  [   

 ## Test the Model
 
Awesome! You should have got somewhere between a 93%-95% accuracy by the 15th epoch. Here we grab a batch from our test data and see how the model performs on the predicted vs the actual result. 

In [10]:
model.eval()
test_loss, correct = 0, 0

with torch.no_grad():
    for batch, (X, Y) in enumerate(test_dataloader):
        X, Y = X.to(device), Y.to(device)
        pred = model(X)
        print("Predicted:")
        print(f"{pred.argmax(1)}")
        print("Actual:")
        print(f"{Y}")
        break

Predicted:
tensor([0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0], device='cuda:0')
Actual:
tensor([1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0], device='cuda:0')
