# Image Filtering
Running this notebook on a GPU runtime is essential: the CLIP model is moved to CUDA.

## 1. Install packages

In [1]:
#! pip install ftfy regex tqdm
#! pip install git+https://github.com/openai/CLIP.git

## 2. Importing packages

In [2]:
import clip
import os
import numpy as np
import torch
print("Torch version:", torch.__version__)
from glob import glob

Torch version: 1.10.0+cu111


## 3. Load the data

In [3]:
# Mount Google Drive inside the Colab VM at /content/drive so the project
# folder used in the next cell is reachable.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
# Make the project folder on Drive the working directory. The relative paths
# used later in the notebook (the image folder and the batch*.npy files) are
# resolved against it.
os.chdir('/content/drive/My Drive/CS Courses/CS156/Assignments/Final')

In [5]:
# glob() returns paths in arbitrary, filesystem-dependent order. The batches
# further down are taken by slicing this list (e.g. image_files[6500:]), so the
# list is sorted to keep batch membership stable across kernel restarts and
# Drive remounts.
image_files = sorted(glob("cityscapes_resized_1024/*"))
num_files = len(image_files)
print(f"There are {num_files} images")

There are 7952 images


## 4. Load the model

In [6]:
# Show the model names the installed clip package reports as available;
# the next cell loads one of them ("ViT-B/32").
clip.available_models()

['RN50', 'RN101', 'RN50x4', 'RN50x16', 'ViT-B/32', 'ViT-B/16']

In [7]:
# Load the ViT-B/32 CLIP checkpoint together with its matching image transform,
# then move the network to the GPU and call .eval() on it.
model, preprocess = clip.load("ViT-B/32")
model.cuda().eval()

# Architecture facts read straight off the loaded model
input_resolution = model.visual.input_resolution
context_length = model.context_length
vocab_size = model.vocab_size

# Total number of scalar weights, summed over every parameter tensor
total_parameters = sum(p.numel() for p in model.parameters())

for label, value in (
    ("Model parameters:", f"{total_parameters:,}"),
    ("Input resolution:", input_resolution),
    ("Context length:", context_length),
    ("Vocab size:", vocab_size),
):
    print(label, value)

Model parameters: 151,277,313
Input resolution: 224
Context length: 77
Vocab size: 49408


## 5. Image Preprocessing
*We process the images in batches of roughly 1,000–1,500 at a time; the run shown below handles the last batch (index 6500 onward).*

In [8]:
import skimage
import IPython.display
import matplotlib.pyplot as plt
from PIL import Image

from collections import OrderedDict

%matplotlib inline
%config InlineBackend.figure_format = 'retina'

In [9]:
# Containers for this batch: the untouched PIL images and their preprocessed versions
original_images = []
images = []

# Walk over the current batch (everything from index 6500 onwards)
for i, filename in enumerate(image_files[6500:]):

    # Read the file and convert it to RGB mode
    image = Image.open(filename).convert("RGB")

    # Keep the raw image alongside the output of CLIP's preprocess transform
    original_images.append(image)
    images.append(preprocess(image))

    # Only report progress on every 500th index
    if i % 500 != 0:
        continue
    print(f"Finished preprocessing {i} images")

Finished preprocessing 0 images
Finished preprocessing 500 images
Finished preprocessing 1000 images


## 6. Building features

We normalize the images, tokenize the text input, and run the forward pass of the model to get the image and text features.

In [10]:
# Tokenize the text prompt and stack the preprocessed images into a single
# batch; both inputs are moved to the GPU.
text_tokens = clip.tokenize("cityscape").cuda()
image_input = torch.tensor(np.stack(images)).cuda()

# Forward passes only, so gradient tracking is switched off. Both outputs are
# cast with .float().
with torch.no_grad():
    image_features, text_features = (
        model.encode_image(image_input).float(),
        model.encode_text(text_tokens).float(),
    )

## 7. Calculate cosine similarity between images and word
As noted above, the word is 'cityscape'.

In [11]:
# Scale every embedding to unit length (in place) so that the dot product
# below is the cosine similarity.
image_features.div_(image_features.norm(dim=-1, keepdim=True))
text_features.div_(text_features.norm(dim=-1, keepdim=True))

# Text embedding times transposed image embeddings, computed on the CPU with NumPy
similarity = np.matmul(
    text_features.cpu().numpy(),
    image_features.cpu().numpy().T,
)

## 8. Determining the cut-off threshold
*Here I manually experiment with the similarity threshold to identify irrelevant images*

In [23]:
# Boolean mask over the batch: True where the similarity is below the cut-off
sim_thresh = np.asarray(similarity).ravel() < 0.215
# Alternative mask used to inspect only the borderline band between 0.21 and 0.215:
#sim_thresh = np.logical_and(np.array(similarity).flatten() > 0.21,np.array(similarity).flatten() < 0.215)

# Map each flagged position back to its path; the batch starts at index 6500
not_cityscapes = [image_files[6500 + i] for i in np.flatnonzero(sim_thresh)]
len(not_cityscapes)

324

In [22]:
# Draw every flagged image as a thumbnail on a 10-column grid.
# The row count follows the number of images: a fixed 7x10 grid makes
# plt.subplot raise ValueError as soon as more than 70 images are flagged.
n_cols = 10
n_rows = max(1, -(-len(not_cityscapes) // n_cols))  # ceiling division, at least one row

# Keep the 30x30 canvas for short grids and grow the height for taller ones.
# (No tight_layout() here: subplots_adjust below sets the spacing explicitly.)
plt.figure(figsize=(30, max(30, 3 * n_rows)))
for idx, file in enumerate(not_cityscapes):
  plt.subplot(n_rows, n_cols, idx+1)
  # Close the file handle as soon as the image has been handed to matplotlib
  with Image.open(file) as img:
    plt.imshow(img)
  plt.axis("off")

plt.subplots_adjust(wspace=0, hspace=0)
plt.show()


Output hidden; open in https://colab.research.google.com to view.

In [24]:
# Persist this batch's list of flagged file paths so the batches can be combined later
np.save(file="batch6.npy", arr=not_cityscapes)

In [None]:
#np.load("batch3.npy")

---

# Get all images to be removed

In [28]:
# Read the saved per-batch lists (batch1.npy through batch6.npy) and join them
# into one array of file paths.
batch_arrays = []
for batch_no in range(1, 7):
    batch_arrays.append(np.load(f"batch{batch_no}.npy"))
images_to_remove = np.concatenate(batch_arrays)
len(images_to_remove)

2030

### Randomly display some images to be removed

In [33]:
# Show up to 30 randomly chosen images from the removal list on a 6x5 grid.
# Sampling is done without replacement so the same image never appears twice,
# and the sample size is capped at the list length so short (or empty) lists
# do not raise.
sample_size = min(30, len(images_to_remove))
sampled_files = np.random.choice(images_to_remove, sample_size, replace=False)

# (No tight_layout() here: subplots_adjust below sets the spacing explicitly.)
plt.figure(figsize=(30,30))
for idx, file in enumerate(sampled_files):
  plt.subplot(6,5,idx+1)
  # Close the file handle as soon as the image has been handed to matplotlib
  with Image.open(file) as img:
    plt.imshow(img)
  plt.axis("off")

plt.subplots_adjust(wspace=0, hspace=0)
plt.show()


Output hidden; open in https://colab.research.google.com to view.

### Move them out of the image folder into a trash-like folder (`salon_des_refuses`)

In [35]:
import shutil

# Move every rejected image out of the dataset folder into "salon_des_refuses".
for src_file in images_to_remove:
  # Already moved by an earlier run of this cell: skip it so the cell can be
  # re-run without raising FileNotFoundError.
  if not os.path.exists(src_file):
    continue
  # Replace only the first occurrence so a file name that happens to contain
  # the folder name is left intact.
  dst_file = src_file.replace("cityscapes_resized_1024", "salon_des_refuses", 1)
  # Make sure the destination folder exists before the move
  os.makedirs(os.path.dirname(dst_file) or ".", exist_ok=True)
  shutil.move(src_file, dst_file)

In [36]:
# Re-scan the image folder and report how many files it contains now.
# Note: this rebinds image_files, replacing the listing made earlier.
image_files = glob("cityscapes_resized_1024/*")
num_files = len(image_files)
print(f"There are {num_files} images")

There are 5922 images


---

## Randomly display some of the photos left

In [38]:
# Show up to 30 randomly chosen images from the remaining files on a 6x5 grid.
# Sampling is done without replacement so the same image never appears twice,
# and the sample size is capped at the list length so short (or empty) lists
# do not raise.
sample_size = min(30, len(image_files))
sampled_files = np.random.choice(image_files, sample_size, replace=False)

# (No tight_layout() here: subplots_adjust below sets the spacing explicitly.)
plt.figure(figsize=(30,30))
for idx, file in enumerate(sampled_files):
  plt.subplot(6,5,idx+1)
  # Close the file handle as soon as the image has been handed to matplotlib
  with Image.open(file) as img:
    plt.imshow(img)
  plt.axis("off")

plt.subplots_adjust(wspace=0, hspace=0)
plt.show()


Output hidden; open in https://colab.research.google.com to view.

### Turns out there are duplicates!

In [43]:
# Count the files in the image folder whose path contains the "(1)" marker
len([path for path in glob("cityscapes_resized_1024/*") if '(1)' in path])

130

In [44]:
# Collect the paths that contain the "(1)" marker
duplicates = list(filter(lambda path: '(1)' in path, glob("cityscapes_resized_1024/*")))

In [46]:
# Move every file flagged as a duplicate out of the dataset folder into
# "salon_des_refuses".
for src_file in duplicates:
  # Already moved by an earlier run of this cell: skip it so the cell can be
  # re-run without raising FileNotFoundError.
  if not os.path.exists(src_file):
    continue
  # Replace only the first occurrence so a file name that happens to contain
  # the folder name is left intact.
  dst_file = src_file.replace("cityscapes_resized_1024", "salon_des_refuses", 1)
  # Make sure the destination folder exists before the move
  os.makedirs(os.path.dirname(dst_file) or ".", exist_ok=True)
  shutil.move(src_file, dst_file)

In [47]:
# Re-scan the image folder and report how many files it contains now.
# Note: this rebinds image_files, replacing the previous listing.
image_files = glob("cityscapes_resized_1024/*")
num_files = len(image_files)
print(f"There are {num_files} images")

There are 5756 images


---

Now I do some manual deletion