In [38]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler
from torch.autograd import Variable
import numpy as np
import torchvision
from torchvision import datasets, models, transforms
from torch.utils.data import DataLoader
import matplotlib.pyplot as plt
import CactusDataset
from sklearn.metrics import roc_curve, auc


In [14]:
#use_gpu = torch.cuda.is_available()
#if use_gpu:
#    print("Using CUDA")

In [39]:
# Locations of the labelled training images and of the CSV that indexes them.
data_dir = '../../../data/train/train'
csv_file = '../../../data/train.csv'
# Keys used by the per-split dictionaries below.
TRAIN = 'train'
VAL = 'val'
TEST = 'test'

# Image transforms: only convert each image to a tensor.
data_transforms = transforms.Compose([
    transforms.ToTensor(),
])

# Seed torch's global RNG so the random split below is reproducible.
torch.manual_seed(42)
image_dataset = CactusDataset.CactusDataset(csv_file, data_dir, data_transforms)
# Split into train / eval / test: 70% / 20% / whatever remains.
train_size = int(0.7 * len(image_dataset))
eval_size = int(0.2 * len(image_dataset))
test_size = len(image_dataset) - train_size - eval_size
train_dataset, eval_dataset, test_dataset = torch.utils.data.random_split(image_dataset, [train_size, eval_size, test_size])

image_datasets = {TRAIN: train_dataset, VAL: eval_dataset, TEST: test_dataset}

dataset_sizes = {x: len(image_datasets[x]) for x in [TRAIN, VAL, TEST]}

# Train and val are served as one batch holding the whole split; test uses batches of 64.
batch_sizes = {TRAIN: dataset_sizes[TRAIN], VAL: dataset_sizes[VAL], TEST: 64}

dataloaders = {
    TRAIN: DataLoader(image_datasets[TRAIN], batch_size=batch_sizes[TRAIN],
                      shuffle=True, num_workers=0),
    VAL: DataLoader(image_datasets[VAL], batch_size=batch_sizes[VAL],
                    shuffle=False, num_workers=0),
    TEST: DataLoader(image_datasets[TEST], batch_size=batch_sizes[TEST],
                     shuffle=False, num_workers=0),
}


print(dataset_sizes)
class_names = {0: 'No Cactus', 1: 'Cactus'}

# Peek at the first training sample.
inputs, classes = next(iter(dataloaders[TRAIN]))
print(inputs[0])

# Length of one flattened image; assumes 32x32 images with 3 channels -- TODO confirm.
n_features = 32 * 32 * 3

print(n_features)

# Print the number of samples of each class in each split.
for x in [TRAIN, VAL, TEST]:
    print("Number of {} images: {}".format(x, dataset_sizes[x]))
    # Read the labels in a single pass over the split: every dataset access
    # loads and transforms the image, so iterating once per class is wasteful.
    # Assumes labels are integer-like (int or 0-dim tensor) -- TODO confirm.
    split_labels = [int(sample[1]) for sample in image_datasets[x]]
    for i in range(2):
        print("Number of {} images of class {}: {}".format(x, class_names[i], split_labels.count(i)))
    print()




In [40]:
# Create model
# f = wx + b, sigmoid at the end
class LogisticRegression(nn.Module):
    """Binary logistic regression: one linear unit followed by a sigmoid."""

    def __init__(self, n_input_features):
        """Create the linear layer mapping `n_input_features` inputs to one output."""
        super().__init__()
        self.linear = nn.Linear(in_features=n_input_features, out_features=1)

    def forward(self, x):
        """Return sigmoid(w.x + b) for each row of `x`; the last dimension has size 1."""
        logits = self.linear(x)
        return logits.sigmoid()

In [56]:
def train_model(model, criterion, optimizer, model_name, X_train, y_train, X_val, y_val, transforms=None, num_epochs=1000):
    """Train `model` with full-batch gradient steps, scoring it on the validation set every epoch.

    Each epoch runs one forward/backward pass over all of `X_train` (there is
    no mini-batching) and then evaluates the updated model on `X_val`.

    Args:
        model: module mapping a feature tensor to one probability per row.
        criterion: loss, called as `criterion(outputs, y_train.float())`.
        optimizer: optimizer holding the model's parameters.
        model_name: unused; kept so existing call sites keep working.
        X_train, y_train: training features and binary (0/1) labels.
        X_val, y_val: validation features and binary (0/1) labels.
        transforms: optional callable applied to the original `X_train` at the
            start of every epoch (e.g. a random augmentation).
        num_epochs: number of epochs to run.

    Returns:
        A 9-tuple `(accs_train, accs_val, aucs, f1_scores, best_fpr, best_tpr,
        best_auc, best_f1, epoch_best)`. The first four are per-epoch lists;
        `best_fpr` / `best_tpr` are the ROC curve of the epoch with the highest
        validation AUC, and `epoch_best` is that epoch's index.
    """
    epoch_best = 0
    f1_scores = []
    accs_train = []
    accs_val = []
    aucs = []
    best_fpr = []
    best_tpr = []
    best_auc = 0
    best_f1 = 0

    # Loop-invariant: flatten the validation labels once and keep a 1-D numpy
    # copy, which is the input form roc_curve expects.
    y_val = y_val.view(-1)
    y_val_np = y_val.detach().cpu().numpy()

    # (original author's note: 40k epochs were run)
    for epoch in range(num_epochs):
        # Always transform the ORIGINAL inputs; rebinding X_train would make
        # the transform accumulate from one epoch to the next.
        X_epoch = transforms(X_train) if transforms is not None else X_train

        # Forward pass; flatten so the outputs match the shape of y_train.
        outputs = model(X_epoch).view(-1)

        # Training accuracy, measured before this epoch's update.
        preds = outputs.round()
        acc_train = (preds == y_train).sum() / y_train.shape[0]
        accs_train.append(acc_train)

        loss = criterion(outputs, y_train.float())

        # Clear gradients BEFORE backward so stale gradients left on the
        # parameters by earlier code cannot leak into the first update.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        with torch.no_grad():
            val_scores = model(X_val).view(-1)
            y_predicted_cls = val_scores.round()   # round to the nearest class

            # Confusion counts needed for precision / recall / F1.
            tp = (y_val * y_predicted_cls).sum()
            fp = ((1 - y_val) * y_predicted_cls).sum()
            fn = (y_val * (1 - y_predicted_cls)).sum()
            # Guard the 0/0 cases (e.g. no positive predictions early in
            # training), which would otherwise put NaN into the history.
            zero = torch.zeros_like(tp)
            precision = tp / (tp + fp) if (tp + fp) > 0 else zero
            recall = tp / (tp + fn) if (tp + fn) > 0 else zero
            f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else zero
            f1_scores.append(f1)

            # Validation accuracy.
            acc = (y_predicted_cls == y_val).sum() / y_val.shape[0]
            accs_val.append(acc)

            # Validation ROC / AUC on the raw (un-rounded) scores.
            fpr, tpr, _ = roc_curve(y_val_np, val_scores.cpu().numpy())
            roc_auc = auc(fpr, tpr)
            aucs.append(roc_auc)

            if roc_auc > best_auc:
                best_auc = roc_auc
                best_fpr = fpr
                best_tpr = tpr
                epoch_best = epoch
            if f1 > best_f1:
                best_f1 = f1

            if epoch % 100 == 0:
                print(f'Epoch: {epoch}, Loss: {loss.item()}, Accuracy on val: {acc}, F1 score on val: {f1}, AUC on val: {roc_auc}')
    return accs_train, accs_val, aucs, f1_scores, best_fpr, best_tpr, best_auc, best_f1, epoch_best

## LR without PCA

In [42]:
# Load the dataset: flatten every image and gather each split into a single
# in-memory torch tensor of shape (n_images, n_features).
X_train, y_train = [], []
for images, labels in dataloaders[TRAIN]:
    images = images.view(-1, n_features)
    X_train += [images]
    y_train += [labels]
X_train, y_train = torch.cat(X_train), torch.cat(y_train)

X_val, y_val = [], []
for images, labels in dataloaders[VAL]:
    images = images.view(-1, n_features)
    X_val += [images]
    y_val += [labels]
X_val, y_val = torch.cat(X_val), torch.cat(y_val)

# Sanity-check the shapes and look at the first flattened training image.
print(X_train.size())
print(y_train.size())
print(X_val.size())
print(y_val.size())
print(X_train[0])
# From here on the feature count is taken from the data itself.
n_features = X_train.size(1)

In [46]:
# Logistic regression over the raw flattened pixels.
model = LogisticRegression(n_input_features=n_features)
# Binary cross-entropy on the sigmoid outputs, optimized with momentum SGD.
criterion = nn.BCELoss()
learning_rate = 0.001
optimizer = optim.SGD(params=model.parameters(), lr=learning_rate, momentum=0.9)
# NOTE: this scheduler is created, but nothing visible in this notebook calls its step().
exp_lr_scheduler = lr_scheduler.StepLR(optimizer=optimizer, step_size=7, gamma=0.1)

In [47]:
# Train on the raw pixels for 5000 epochs and collect the per-epoch metrics.
(accs_train, accs_val, aucs, f1_scores,
 best_fpr, best_tpr, best_auc, best_f1, epoch_best) = train_model(
    model, criterion, optimizer, 'LR', X_train, y_train, X_val, y_val, num_epochs=5000)
print(f'Best AUC: {best_auc}, Best F1: {best_f1}, Best epoch: {epoch_best}')
# Train vs. validation accuracy per epoch; axes and title are labelled so the
# figure can be told apart from the other experiments' plots.
plt.plot(accs_train, label='Train')
plt.plot(accs_val, label='Val')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.title('LR without PCA')
plt.legend()
plt.show()

## LR with PCA

In [48]:
from sklearn.decomposition import PCA

In [68]:
# Load the dataset again: flatten every image and gather each split into a
# single in-memory torch tensor of shape (n_images, n_features).
X_train, y_train = [], []
for images, labels in dataloaders[TRAIN]:
    images = images.view(-1, n_features)
    X_train += [images]
    y_train += [labels]
X_train, y_train = torch.cat(X_train), torch.cat(y_train)

X_val, y_val = [], []
for images, labels in dataloaders[VAL]:
    images = images.view(-1, n_features)
    X_val += [images]
    y_val += [labels]
X_val, y_val = torch.cat(X_val), torch.cat(y_val)

In [69]:
# Keep only the n_components highest-variance principal components.
# The PCA is fitted on the training split and then applied to validation.
n_components = 612
pca = PCA(n_components=n_components)
# Project each split and convert the resulting numpy array back to a tensor.
X_train_pca = torch.tensor(pca.fit_transform(X_train.detach().numpy()))
X_val_pca = torch.tensor(pca.transform(X_val.detach().numpy()))


In [70]:
# Logistic regression sized to the number of PCA features.
model = LogisticRegression(n_input_features=X_train_pca.size(1))
# Binary cross-entropy on the sigmoid outputs, optimized with momentum SGD.
criterion = nn.BCELoss()
learning_rate = 0.001
optimizer = optim.SGD(params=model.parameters(), lr=learning_rate, momentum=0.9)
# NOTE: this scheduler is created, but nothing visible in this notebook calls its step().
exp_lr_scheduler = lr_scheduler.StepLR(optimizer=optimizer, step_size=7, gamma=0.1)

In [71]:
# Train on the PCA features for 5000 epochs and collect the per-epoch metrics.
(accs_train, accs_val, aucs, f1_scores,
 best_fpr, best_tpr, best_auc, best_f1, epoch_best) = train_model(
    model, criterion, optimizer, 'LR', X_train_pca, y_train, X_val_pca, y_val, num_epochs=5000)
print(f'Best AUC: {best_auc}, Best F1: {best_f1}, Best epoch: {epoch_best}')
# Train vs. validation accuracy per epoch; axes and title are labelled so the
# figure can be told apart from the other experiments' plots.
plt.plot(accs_train, label='Train')
plt.plot(accs_val, label='Val')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.title('LR with PCA')
plt.legend()
plt.show()


## LR with data standardization and PCA

In [72]:
from sklearn.decomposition import PCA

In [73]:
# Flatten every image and gather each split into a single in-memory torch
# tensor of shape (n_images, n_features).
X_train, y_train = [], []
for images, labels in dataloaders[TRAIN]:
    images = images.view(-1, n_features)
    X_train += [images]
    y_train += [labels]
X_train, y_train = torch.cat(X_train), torch.cat(y_train)

X_val, y_val = [], []
for images, labels in dataloaders[VAL]:
    images = images.view(-1, n_features)
    X_val += [images]
    y_val += [labels]
X_val, y_val = torch.cat(X_val), torch.cat(y_val)

In [74]:
# Keep only the n_components highest-variance principal components; the PCA is
# fitted on the training split and then applied to validation.
n_components = 612
pca = PCA(n_components=n_components)
X_train_pca = pca.fit_transform(X_train.detach().numpy())
X_val_pca = pca.transform(X_val.detach().numpy())
# Report the projected shape, then convert both splits back to tensors.
print(X_train_pca.shape)
X_train_pca, X_val_pca = torch.tensor(X_train_pca), torch.tensor(X_val_pca)

# One scalar mean and one scalar std over ALL entries of the training
# projection (not per-feature statistics).
mean, std = X_train_pca.mean().item(), X_train_pca.std().item()

print(mean)
print(std)

# Rescale both splits with the training statistics.
X_train_pca = X_train_pca.sub(mean).div(std)
X_val_pca = X_val_pca.sub(mean).div(std)


In [75]:

# Logistic regression sized to the number of standardized PCA features.
model = LogisticRegression(n_input_features=X_train_pca.size(1))
# Binary cross-entropy on the sigmoid outputs, optimized with momentum SGD.
criterion = nn.BCELoss()
learning_rate = 0.001
optimizer = optim.SGD(params=model.parameters(), lr=learning_rate, momentum=0.9)
# NOTE: this scheduler is created, but nothing visible in this notebook calls its step().
exp_lr_scheduler = lr_scheduler.StepLR(optimizer=optimizer, step_size=7, gamma=0.1)

In [76]:

# Train on the standardized PCA features for 5000 epochs and collect the per-epoch metrics.
(accs_train, accs_val, aucs, f1_scores,
 best_fpr, best_tpr, best_auc, best_f1, epoch_best) = train_model(
    model, criterion, optimizer, 'LR', X_train_pca, y_train, X_val_pca, y_val, num_epochs=5000)
print(f'Best AUC: {best_auc}, Best F1: {best_f1}, Best epoch: {epoch_best}')
# Train vs. validation accuracy per epoch; axes and title are labelled so the
# figure can be told apart from the other experiments' plots.
plt.plot(accs_train, label='Train')
plt.plot(accs_val, label='Val')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.title('LR with data standardization and PCA')
plt.legend()
plt.show()


In [103]:
# Write the most recently trained model to disk (per the original note, so
# that its size can be inspected).
# Passing the module object itself to torch.save pickles the whole module, not
# only its state_dict, so loading it back needs the LogisticRegression class.
torch.save(model, '../../../data/LRmodel.pth')

In [119]:
from torch.profiler import profile, record_function, ProfilerActivity
# Profile a single forward pass over the validation features on CPU.
# Autograd is disabled so the profile reflects inference only, without the
# time and memory spent recording a graph for backpropagation.
with profile(activities=[ProfilerActivity.CPU], profile_memory=True, record_shapes=True) as prof:
    with record_function("model_inference"):
        with torch.no_grad():
            model(X_val_pca)

# Show the ten most expensive operators by total CPU time.
print(prof.key_averages().table(sort_by="cpu_time_total", row_limit=10))

## LR with PCA, normalization and histogram equalization

In [77]:
from sklearn.decomposition import PCA

In [78]:
# Rebuild the dataset with histogram equalization applied to every image.
data_dir = '../../../data/train/train'
csv_file = '../../../data/train.csv'
TRAIN, VAL, TEST = 'train', 'val', 'test'

# p=1 makes the "random" equalization fire on every image; it runs before the
# conversion to a tensor.
data_transforms = transforms.Compose([transforms.RandomEqualize(p=1), transforms.ToTensor()])

# Same seed as the first split so the random partition is drawn the same way.
torch.manual_seed(42)
image_dataset = CactusDataset.CactusDataset(csv_file, data_dir, data_transforms)
# 70% train, 20% eval, remainder test.
train_size = int(0.7 * len(image_dataset))
eval_size = int(0.2 * len(image_dataset))
test_size = len(image_dataset) - (train_size + eval_size)
train_dataset, eval_dataset, test_dataset = torch.utils.data.random_split(
    image_dataset, lengths=[train_size, eval_size, test_size])


In [79]:
# Flatten the equalized images; each split is read through a loader whose
# batch size equals the split size.
dataloader = DataLoader(dataset=train_dataset, batch_size=train_size, shuffle=True, num_workers=0)
X_train, y_train = [], []
for images, labels in dataloader:
    images = images.view(-1, n_features)
    X_train += [images]
    y_train += [labels]
X_train, y_train = torch.cat(X_train), torch.cat(y_train)

dataloader = DataLoader(dataset=eval_dataset, batch_size=eval_size, shuffle=False, num_workers=0)
X_val, y_val = [], []
for images, labels in dataloader:
    images = images.view(-1, n_features)
    X_val += [images]
    y_val += [labels]
X_val, y_val = torch.cat(X_val), torch.cat(y_val)

In [80]:
# Fraction of the total variance the retained components must explain.
variance_threshold = 0.99

# scikit-learn works on numpy arrays; convert each split once.
X_train_np = X_train.detach().numpy()
X_val_np = X_val.detach().numpy()

# Fit a full PCA on the training split to see how the variance accumulates.
pca = PCA(n_components=X_train_np.shape[1])
pca.fit(X_train_np)
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)

# argmax returns the zero-based index of the first component at which the
# threshold is exceeded; +1 turns that index into a number of components.
n_components = int(np.argmax(cumulative_variance > variance_threshold)) + 1

# Plot cumulative explained variance against the component COUNT (1-based).
component_counts = np.arange(1, len(cumulative_variance) + 1)
plt.plot(component_counts, cumulative_variance)
plt.xlabel('number of components')
plt.ylabel('cumulative explained variance')
plt.scatter(n_components, variance_threshold, color='r')
plt.text(n_components, variance_threshold, '({}, {})'.format(n_components, variance_threshold), color='black')

plt.show()

print('Number of components for 99% variance: ', n_components)

# Keep only the n_components highest-variance components: refit on the
# training split and apply the same projection to validation.
pca = PCA(n_components=n_components)
X_train_pca = pca.fit_transform(X_train_np)
X_val_pca = pca.transform(X_val_np)
# Report the projected shape, then convert back to tensors.
print(X_train_pca.shape)
X_train_pca = torch.tensor(X_train_pca)
X_val_pca = torch.tensor(X_val_pca)

# One scalar mean and one scalar std over all entries of the training projection.
mean = X_train_pca.mean().item()
std = X_train_pca.std().item()

print(mean)
print(std)

# Rescale both splits with the training statistics.
X_train_pca = (X_train_pca - mean) / std
X_val_pca = (X_val_pca - mean) / std


In [81]:
# Logistic regression sized to the number of retained PCA features.
model = LogisticRegression(n_input_features=X_train_pca.size(1))
# Binary cross-entropy on the sigmoid outputs, optimized with momentum SGD.
criterion = nn.BCELoss()
learning_rate = 0.001
optimizer = optim.SGD(params=model.parameters(), lr=learning_rate, momentum=0.9)
# NOTE: this scheduler is created, but nothing visible in this notebook calls its step().
exp_lr_scheduler = lr_scheduler.StepLR(optimizer=optimizer, step_size=7, gamma=0.1)

In [82]:
# Train on the equalized, PCA-reduced features for 5000 epochs and collect the per-epoch metrics.
(accs_train, accs_val, aucs, f1_scores,
 best_fpr, best_tpr, best_auc, best_f1, epoch_best) = train_model(
    model, criterion, optimizer, 'LR', X_train_pca, y_train, X_val_pca, y_val, num_epochs=5000)
print(f'Best AUC: {best_auc}, Best F1: {best_f1}, Best epoch: {epoch_best}')
# Train vs. validation accuracy per epoch; axes and title are labelled so the
# figure can be told apart from the other experiments' plots.
plt.plot(accs_train, label='Train')
plt.plot(accs_val, label='Val')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.title('LR with PCA, normalization and histogram equalization')
plt.legend()
plt.show()


## LR with oversampling

In [83]:
from sklearn.decomposition import PCA

In [84]:
# Rebuild the dataset with random augmentation and class oversampling.
data_dir = '../../../data/train/train'
csv_file = '../../../data/train.csv'
TRAIN = 'train'
VAL = 'val'
TEST = 'test'

# Set the transforms for the images: random horizontal flip and a random
# rotation (argument 30), followed by conversion to a tensor.
# NOTE(review): these transforms are attached to the one dataset that all three
# splits are drawn from, so they appear to apply to val/test samples as well
# -- confirm this is intended.
data_transforms = transforms.Compose([
    transforms.RandomHorizontalFlip(),
    transforms.RandomRotation(30),
    transforms.ToTensor(),
])


# Seed torch's global RNG before the random split below.
torch.manual_seed(42)
image_dataset = CactusDataset.CactusDataset(csv_file, data_dir, data_transforms)
# Class distribution before oversampling.
print(image_dataset.get_class_distribution())


# NOTE(review): oversample() runs BEFORE random_split. If it duplicates samples,
# copies of the same image could land in both the training and the val/test
# splits -- verify against CactusDataset.oversample.
image_dataset.oversample()
print(len(image_dataset))

# Class distribution after oversampling.
print(image_dataset.get_class_distribution())

# Split into train / eval / test: 70% / 20% / whatever remains.
train_size = int(0.7 * len(image_dataset))
eval_size = int(0.2 * len(image_dataset))
test_size = len(image_dataset) - train_size - eval_size
train_dataset, eval_dataset, test_dataset = torch.utils.data.random_split(image_dataset, [train_size, eval_size, test_size])

# Length of one flattened image; assumes 32x32 images with 3 channels -- TODO confirm.
n_features = 32 * 32 * 3

In [85]:
# Flatten the oversampled images; each split is read through a loader whose
# batch size equals the split size.
dataloader = DataLoader(dataset=train_dataset, batch_size=train_size, shuffle=True, num_workers=0)
X_train, y_train = [], []
for images, labels in dataloader:
    images = images.view(-1, n_features)
    X_train += [images]
    y_train += [labels]
X_train, y_train = torch.cat(X_train), torch.cat(y_train)

dataloader = DataLoader(dataset=eval_dataset, batch_size=eval_size, shuffle=False, num_workers=0)
X_val, y_val = [], []
for images, labels in dataloader:
    images = images.view(-1, n_features)
    X_val += [images]
    y_val += [labels]
X_val, y_val = torch.cat(X_val), torch.cat(y_val)
# Take the feature count from the data itself.
n_features = X_train.size(1)

In [86]:
# Keep only the n_components highest-variance principal components; the PCA is
# fitted on the training split and then applied to validation.
n_components = 612
pca = PCA(n_components=n_components)
# Project each split and convert the resulting numpy array back to a tensor.
X_train_pca = torch.tensor(pca.fit_transform(X_train.detach().numpy()))
X_val_pca = torch.tensor(pca.transform(X_val.detach().numpy()))

# Standardize with one scalar mean and one scalar std taken over all entries
# of the training projection.
mean, std = X_train_pca.mean().item(), X_train_pca.std().item()
X_train_pca = X_train_pca.sub(mean).div(std)
X_val_pca = X_val_pca.sub(mean).div(std)

In [88]:
# Logistic regression sized to the number of retained PCA components.
model = LogisticRegression(n_input_features=n_components)
# Binary cross-entropy on the sigmoid outputs, optimized with momentum SGD.
criterion = nn.BCELoss()
learning_rate = 0.001
optimizer = optim.SGD(params=model.parameters(), lr=learning_rate, momentum=0.9)
# NOTE: this scheduler is created, but nothing visible in this notebook calls its step().
exp_lr_scheduler = lr_scheduler.StepLR(optimizer=optimizer, step_size=7, gamma=0.1)

In [89]:
# Train on the oversampled, PCA-reduced features for 5000 epochs and collect the per-epoch metrics.
(accs_train, accs_val, aucs, f1_scores,
 best_fpr, best_tpr, best_auc, best_f1, epoch_best) = train_model(
    model, criterion, optimizer, 'LR', X_train_pca, y_train, X_val_pca, y_val, num_epochs=5000)
print(f'Best AUC: {best_auc}, Best F1: {best_f1}, Best epoch: {epoch_best}')
# Train vs. validation accuracy per epoch; axes and title are labelled so the
# figure can be told apart from the other experiments' plots.
plt.plot(accs_train, label='Train')
plt.plot(accs_val, label='Val')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.title('LR with oversampling')
plt.legend()
plt.show()

## LR with PCA, standardization and extra training data (images from data/test added to the training set)

In [143]:
import torch.utils
import torch.utils.data


# Locations of the labelled training images and of the CSV that indexes them.
data_dir = '../../../data/train/train'
csv_file = '../../../data/train.csv'
# Keys used by the per-split dictionaries below; TRAIN2 is the extra training data.
TRAIN = 'train'
TRAIN2 = 'train2'
VAL = 'val'
TEST = 'test'

# Image transforms: only convert each image to a tensor.
data_transforms = transforms.Compose([
    transforms.ToTensor(),
])


# Seed torch's global RNG so the random split below is reproducible.
torch.manual_seed(42)
image_dataset = CactusDataset.CactusDataset(csv_file, data_dir, data_transforms)
# Split into train / eval / test: 70% / 20% / whatever remains.
train_size = int(0.7 * len(image_dataset))
eval_size = int(0.2 * len(image_dataset))
test_size = len(image_dataset) - train_size - eval_size
train_dataset, eval_dataset, test_dataset = torch.utils.data.random_split(image_dataset, [train_size, eval_size, test_size])

image_datasets = {TRAIN: train_dataset, VAL: eval_dataset, TEST: test_dataset}

# Second dataset, read from the data/test folder, used as additional training
# data. NOTE: data_dir and csv_file are rebound here and keep pointing at the
# test paths after this cell.
data_dir = '../../../data/test/test'
csv_file = '../../../data/test.csv'
new_dataset = CactusDataset.CactusDataset(csv_file, data_dir, data_transforms)

# Size and class distribution of the extra dataset.
print(len(new_dataset))
print(new_dataset.get_class_distribution())

dataset_sizes = {x: len(image_datasets[x]) for x in [TRAIN, VAL, TEST]}

# Train and val are served as one batch holding the whole split; test uses batches of 64.
batch_sizes = {TRAIN: dataset_sizes[TRAIN], VAL: dataset_sizes[VAL], TEST: 64}

dataloaders = {
    TRAIN: DataLoader(image_datasets[TRAIN], batch_size=batch_sizes[TRAIN],
                      shuffle=True, num_workers=0),
    # The extra dataset is also served as one batch holding everything.
    TRAIN2: DataLoader(new_dataset, batch_size=len(new_dataset), shuffle=False, num_workers=0),
    VAL: DataLoader(image_datasets[VAL], batch_size=batch_sizes[VAL],
                    shuffle=False, num_workers=0),
    TEST: DataLoader(image_datasets[TEST], batch_size=batch_sizes[TEST], shuffle=False, num_workers=0),
}


print(dataset_sizes)
class_names = {0: 'No Cactus', 1: 'Cactus'}


# Length of one flattened image; assumes 32x32 images with 3 channels -- TODO confirm.
n_features = 32 * 32 * 3

print(n_features)

# Print the number of samples of each class in each split.
for x in [TRAIN, VAL, TEST]:
    print("Number of {} images: {}".format(x, dataset_sizes[x]))
    # Read the labels in a single pass over the split: every dataset access
    # loads and transforms the image, so iterating once per class is wasteful.
    # Assumes labels are integer-like (int or 0-dim tensor) -- TODO confirm.
    split_labels = [int(sample[1]) for sample in image_datasets[x]]
    for i in range(2):
        print("Number of {} images of class {}: {}".format(x, class_names[i], split_labels.count(i)))
    print()

In [144]:
# Flatten every image; the training tensor is the original training split
# followed by the extra TRAIN2 dataset.
X_train, y_train = [], []
for images, labels in dataloaders[TRAIN]:
    images = images.view(-1, n_features)
    X_train += [images]
    y_train += [labels]
for images, labels in dataloaders[TRAIN2]:
    images = images.view(-1, n_features)
    X_train += [images]
    y_train += [labels]
X_train, y_train = torch.cat(X_train), torch.cat(y_train)

X_val, y_val = [], []
for images, labels in dataloaders[VAL]:
    images = images.view(-1, n_features)
    X_val += [images]
    y_val += [labels]
X_val, y_val = torch.cat(X_val), torch.cat(y_val)

In [145]:
# Keep only the n_components highest-variance principal components; the PCA is
# fitted on the (extended) training set and then applied to validation.
n_components = 612
pca = PCA(n_components=n_components)
X_train_pca = pca.fit_transform(X_train.detach().numpy())
X_val_pca = pca.transform(X_val.detach().numpy())
# Report the projected shape, then convert both splits back to tensors.
print(X_train_pca.shape)
X_train_pca, X_val_pca = torch.tensor(X_train_pca), torch.tensor(X_val_pca)

# One scalar mean and one scalar std over ALL entries of the training
# projection (not per-feature statistics).
mean, std = X_train_pca.mean().item(), X_train_pca.std().item()

print(mean)
print(std)

# Rescale both splits with the training statistics.
X_train_pca = X_train_pca.sub(mean).div(std)
X_val_pca = X_val_pca.sub(mean).div(std)


In [146]:

# Logistic regression sized to the number of standardized PCA features.
model = LogisticRegression(n_input_features=X_train_pca.size(1))
# Binary cross-entropy on the sigmoid outputs, optimized with momentum SGD.
criterion = nn.BCELoss()
learning_rate = 0.001
optimizer = optim.SGD(params=model.parameters(), lr=learning_rate, momentum=0.9)
# NOTE: this scheduler is created, but nothing visible in this notebook calls its step().
exp_lr_scheduler = lr_scheduler.StepLR(optimizer=optimizer, step_size=7, gamma=0.1)

In [149]:

# Train on the extended, PCA-reduced training set for 5000 epochs and collect the per-epoch metrics.
(accs_train, accs_val, aucs, f1_scores,
 best_fpr, best_tpr, best_auc, best_f1, epoch_best) = train_model(
    model, criterion, optimizer, 'LR', X_train_pca, y_train, X_val_pca, y_val, num_epochs=5000)
print(f'Best AUC: {best_auc}, Best F1: {best_f1}, Best epoch: {epoch_best}')
# Train vs. validation accuracy per epoch; axes and title are labelled so the
# figure can be told apart from the other experiments' plots.
plt.plot(accs_train, label='Train')
plt.plot(accs_val, label='Val')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.title('LR with PCA, standardization and extra training data')
plt.legend()
plt.show()
