In [18]:
%load_ext autoreload
%autoreload 2
import torch
from PIL import Image
from torchvision import transforms
import os
import numpy as np
import pickle


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# THE CELL BELOW CONTAINS THE FILENAMES TO CHANGE
This notebook follows this PyTorch tutorial: https://pytorch.org/hub/pytorch_vision_resnet/

In [23]:
# Root of the project's data folder. Set the IITUDND_DATA_ROOT environment
# variable to run this notebook on another machine; the default keeps the
# original location. A single trailing '/' is enforced because the cells
# below build paths by plain string concatenation.
DATA_ROOT = os.environ.get('IITUDND_DATA_ROOT', '/Users/ianmagnusson/IITUDND/data').rstrip('/') + '/'

# Labeled images, organised in one sub-directory per date.
LABELED_DIR = DATA_ROOT + 'CrisisMMD_v1.0/data_image/hurricane_harvey/'
# Unlabeled images, all in one flat directory.
UNLABELED_DIR = DATA_ROOT + 'retrieved_data/images/harvey_images_complete/'
# Destination for the extracted feature archives and image-size pickles.
NPY_OUTPUT_DIR = DATA_ROOT + 'extracted_features/resnet/harvey/'

In [3]:
# Build the feature extractor: a pretrained ResNet-50 fetched through torch hub.
model = torch.hub.load('pytorch/vision', 'resnet50', pretrained=True)

# Swap the final fully-connected layer for a pass-through, so a forward pass
# returns the activations that would otherwise feed that layer.
model.fc = torch.nn.Identity()

# Put the network in inference mode (eval() returns the module itself).
model = model.eval()

Using cache found in /Users/ianmagnusson/.cache/torch/hub/pytorch_vision_master


In [11]:
# Image preprocessing applied to every PIL image before it is fed to the model.
# Per-channel mean / std handed to Normalize.
_channel_mean = [0.485, 0.456, 0.406]
_channel_std = [0.229, 0.224, 0.225]

_preprocess_steps = [
    transforms.Resize(256),        # resize to 256
    transforms.CenterCrop(224),    # then take the central 224 crop
    transforms.ToTensor(),         # PIL image -> tensor
    transforms.Normalize(mean=_channel_mean, std=_channel_std),
]
preprocess = transforms.Compose(_preprocess_steps)

# First get labeled data

In [None]:
# Extract one feature vector per labeled image.
# LABELED_DIR is walked as <LABELED_DIR>/<date_dir>/<image file>.
date_dirs = os.listdir(LABELED_DIR)

count = 0
features_vectors = {}  # {filename: np.array produced by the model}
image_sizes = {}       # {filename: PIL `.size` of the original image}

# Move the model to the GPU once, up front, instead of on every iteration.
use_cuda = torch.cuda.is_available()
if use_cuda:
    model.to('cuda')

for date_dir in date_dirs:
    filepath = os.path.join(LABELED_DIR, date_dir)
    # Skip stray files (e.g. .DS_Store) sitting next to the date directories;
    # os.listdir would raise NotADirectoryError on them.
    if not os.path.isdir(filepath):
        continue

    for filename in os.listdir(filepath):
        image_path = os.path.join(filepath, filename)
        if not os.path.isfile(image_path):
            continue

        # The context manager closes the file handle once the pixels are read.
        with Image.open(image_path) as raw_image:
            image_sizes[filename] = raw_image.size

            # The model needs exactly three colour channels. convert('RGB')
            # drops the alpha channel of RGBA images, expands paletted ('P')
            # images, and also covers grayscale / CMYK / LA inputs.
            # NOTE: paletted images with transparency still raise a PIL warning.
            if raw_image.mode == 'RGB':
                rgb_image = raw_image
            else:
                rgb_image = raw_image.convert('RGB')

            input_tensor = preprocess(rgb_image)

        # create a mini-batch as expected by the model
        input_batch = input_tensor.unsqueeze(0)
        if use_cuda:
            input_batch = input_batch.to('cuda')

        with torch.no_grad():
            # .cpu() is required before .numpy() when the model runs on the GPU
            features_vectors[filename] = model(input_batch)[0].cpu().numpy()

        # lightweight progress report
        count += 1
        if count % 200 == 199:
            print(count)

In [None]:
# Persist the labeled features as an .npz archive holding one array per
# image filename, i.e. a dictionary {filename : np.array}.
labeled_npz_path = NPY_OUTPUT_DIR + 'labeled.npz'
with open(labeled_npz_path, 'wb+') as npz_file:
    np.savez(npz_file, **features_vectors)
    

In [None]:
# Pickle the {filename: image size} dictionary next to the feature archive.
image_sizes_path = NPY_OUTPUT_DIR + 'image_sizes.p'
with open(image_sizes_path, 'wb') as sizes_file:
    pickle.dump(image_sizes, sizes_file)

In [21]:
# Example: read the pickled image sizes back into `b`.
# Only unpickle files produced by this notebook; pickle is unsafe on untrusted data.
sizes_pickle_path = NPY_OUTPUT_DIR + 'image_sizes.p'
with open(sizes_pickle_path, 'rb') as sizes_file:
    b = pickle.load(sizes_file)
    
    
    

FileNotFoundError: [Errno 2] No such file or directory: '/Users/ianmagnusson/IITUDND/data/extracted_features/resnet/harvey/filename.p'

# Next get unlabeled data

In [29]:
# Extract one feature vector per unlabeled image.
# UNLABELED_DIR is a flat directory of image files.
count = 0
features_vectors = {}  # {filename: np.array produced by the model}
image_sizes = {}       # {filename: PIL `.size` of the original image}

# Move the model to the GPU once, up front, instead of on every iteration.
use_cuda = torch.cuda.is_available()
if use_cuda:
    model.to('cuda')

for filename in os.listdir(UNLABELED_DIR):
    image_path = os.path.join(UNLABELED_DIR, filename)
    # Skip anything that is not a regular file (e.g. nested directories).
    if not os.path.isfile(image_path):
        continue

    # The context manager closes the file handle once the pixels are read.
    with Image.open(image_path) as raw_image:
        image_sizes[filename] = raw_image.size

        # The model needs exactly three colour channels. convert('RGB')
        # drops the alpha channel of RGBA images, expands paletted ('P')
        # images, and also covers grayscale / CMYK / LA inputs.
        # NOTE: paletted images with transparency still raise a PIL warning.
        if raw_image.mode == 'RGB':
            rgb_image = raw_image
        else:
            rgb_image = raw_image.convert('RGB')

        input_tensor = preprocess(rgb_image)

    # create a mini-batch as expected by the model
    input_batch = input_tensor.unsqueeze(0)
    if use_cuda:
        input_batch = input_batch.to('cuda')

    with torch.no_grad():
        # .cpu() is required before .numpy(): calling .numpy() directly on a
        # CUDA tensor raises, which broke this loop whenever a GPU was used.
        features_vectors[filename] = model(input_batch)[0].cpu().numpy()

    # lightweight progress report
    count += 1
    if count % 200 == 199:
        print(count)
        

199
399
599
799


  "Palette images with Transparency expressed in bytes should be "


KeyboardInterrupt: 

In [None]:
# Save the unlabeled features as a dictionary {filename : np.array}.
# They go to their own archive: writing to 'labeled.npz' here would silently
# overwrite the labeled features saved earlier in the notebook.
with open(NPY_OUTPUT_DIR + 'unlabeled.npz', 'wb+') as fout:
    np.savez(fout, **features_vectors)
    

In [None]:
with open(NPY_OUTPUT_DIR + 'image_sizes.p', 'wb') as fout:
    pickle.dump(image_sizes, fout)

In [None]:
# load example
with open(NPY_OUTPUT_DIR + 'image_sizes.p', 'rb') as fin:
    b = pickle.load(fin)