## TODO
* Train/validation/test split of the data
* Import and transform data
* Create model
* Visualize results

In [65]:
%%capture
!pip install torch
!pip install torchvision

In [96]:
import torch
import torchvision
import torch.utils.data
from torch import nn
import torchvision.transforms as transforms
from torchvision import datasets
from torch.utils.data.sampler import SubsetRandomSampler
from torch.utils.data import DataLoader, Dataset
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tqdm
import glob
import random
import os

In [71]:
# Report how many entries each folder under "data" contains.
for folder in os.listdir('data'):
    folder_path = os.path.join("data", folder)
    image_count = len(os.listdir(folder_path))
    print(f'{folder} - {image_count} images')

house_data - 5249 images
street_data - 19658 images


In [73]:
# Seed every random number generator used in this notebook (Python, NumPy,
# PyTorch CPU and CUDA) so that splitting and training are repeatable.
seed_val = 1903
for seed_fn in (
    random.seed,
    np.random.seed,
    torch.manual_seed,
    torch.cuda.manual_seed_all,
):
    seed_fn(seed_val)

In [75]:
# Estimate per-channel mean and standard deviation (on a 0-1 scale) from a
# random sample of images; the results are passed to transforms.Normalize.
sample_size = 1000

# glob returns paths in filesystem order, which differs between machines;
# sorting first makes the seeded random sample reproducible.
image_paths = sorted(glob.glob("data/**/*.jpg", recursive=True))
if not image_paths:
    raise FileNotFoundError('No .jpg images found under "data"')
# random.sample raises ValueError when asked for more items than exist,
# so cap the request at the number of available images.
sampled_paths = random.sample(image_paths, min(sample_size, len(image_paths)))

means = np.zeros(3, dtype=np.float32)
stds = np.zeros(3, dtype=np.float32)
total_images = 0
for f in tqdm.tqdm(sampled_paths):
    img = plt.imread(f)
    # Reduce over height and width, leaving one value per channel.
    means += img.mean(axis=(0, 1))
    # Per-image std, averaged below: an approximation of the dataset-wide std.
    stds += img.std(axis=(0, 1))
    total_images += 1
# Average over images and rescale to 0-1 (assumes imread yields 0-255 pixel
# values for these JPEGs, matching the range produced by transforms.ToTensor).
means = means / (total_images * 255.)
stds = stds / (total_images * 255.)
print("Total images: ", total_images)
print("Means: ", means)
print("Stds: ", stds)

100%|██████████| 1000/1000 [00:08<00:00, 115.44it/s]

Total images:  1000
Means:  [0.50170106 0.5009038  0.4761459 ]
Stds:  [0.19799496 0.19974951 0.22386898]





In [76]:
# Preprocessing applied to every image: convert to a tensor, then standardize
# each channel with the estimated means and stds.
preprocessing_steps = [
    transforms.ToTensor(),
    transforms.Normalize(means, stds),
]
transform = transforms.Compose(preprocessing_steps)

In [80]:
# Each sub-folder of "data" becomes one class; images are preprocessed with `transform`.
dataset = datasets.ImageFolder(root='data', transform=transform)

In [93]:
def split_dataset(dataset_size, test_size, val_size, shuffle=True, random_state=1903):
    """Partition the indices ``0 .. dataset_size-1`` into train, test and validation lists.

    Args:
        dataset_size: Total number of samples to split.
        test_size: Fraction of the samples assigned to the test split.
        val_size: Fraction of the samples assigned to the validation split.
        shuffle: If True, the indices are shuffled before splitting; if False
            the splits are contiguous ranges in the original order.
        random_state: Seed for the shuffle. The same seed always yields the
            same split; ``None`` seeds from OS entropy.

    Returns:
        A tuple ``(train_indices, test_indices, val_indices)`` of integer lists
        that are disjoint and together cover every index exactly once.

    Raises:
        ValueError: If a fraction is negative or the two fractions sum to more than 1.
    """
    if test_size < 0 or val_size < 0 or test_size + val_size > 1:
        raise ValueError(
            "test_size and val_size must be non-negative and sum to at most 1, "
            f"got test_size={test_size}, val_size={val_size}"
        )

    indices = list(range(dataset_size))
    if shuffle:
        # A dedicated generator keeps the split independent of whatever has
        # consumed the global NumPy random state beforehand.
        rng = np.random.RandomState(random_state)
        rng.shuffle(indices)

    # Layout after the optional shuffle: [ train | test | validation ]
    train_end = int(np.floor(dataset_size * (1 - (test_size + val_size))))
    test_end = int(np.floor(dataset_size * (1 - val_size)))

    train_indices = indices[:train_end]
    test_indices = indices[train_end:test_end]
    val_indices = indices[test_end:]
    return train_indices, test_indices, val_indices

In [94]:
# 80 % of the samples for training, 10 % for testing, 10 % for validation.
train_indices, test_indices, val_indices = split_dataset(
    dataset_size=len(dataset),
    test_size=0.1,
    val_size=0.1,
)

In [98]:
batch_size = 32

# All three loaders share the same ImageFolder dataset; each sampler restricts
# its loader to one index subset and draws from it in random order.
train_sampler = SubsetRandomSampler(train_indices)
valid_sampler = SubsetRandomSampler(val_indices)
test_sampler = SubsetRandomSampler(test_indices)

# Training drops the final short batch so every step sees a full batch.
train_dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, sampler=train_sampler, drop_last=True)
# Evaluation must cover every held-out sample, so the last partial batch is
# kept; dropping it would silently exclude samples from the reported metrics.
val_dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, sampler=valid_sampler, drop_last=False)
test_dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, sampler=test_sampler, drop_last=False)

In [100]:
# The loader holds a reference to the full ImageFolder (all 24907 images);
# the restriction to validation indices lives in its sampler, not the dataset.
val_dataloader.dataset

Dataset ImageFolder
    Number of datapoints: 24907
    Root location: data
    StandardTransform
Transform: Compose(
               ToTensor()
               Normalize(mean=[0.50170106 0.5009038  0.4761459 ], std=[0.19799496 0.19974951 0.22386898])
           )