# Dataset Class for the Cub dataset
The data we will work on should be in the `preprocessing` directory, and should include the following:
1. `./images` directory, with all bird images divided into the 200 categories
2. `./text` directory, with all the bird images captions divided into the 200 categories
3. `./images.csv` - csv file containing the path, image_id, class id, and train/test affiliation of each image
4. `./classes.csv` - csv file containing a mapping between the class id and the class name

In [2]:
import os

import pandas as pd
from skimage import io
from PIL import Image
import pickle
import torch
import torchvision
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms

In [31]:
import sys

In [32]:
sys.path.append("../")

In [33]:
from dataset.birds_dataset import BirdsDataset

In [4]:
base_dataset_path = "/home/user_2/AttnGAN/datasets"

dataset_interim_path = os.path.join(base_dataset_path, "interim")
dataset_preprocess_path = os.path.join(base_dataset_path, "preprocessing")

## Organizing unzipped files
All unzipped files are in the `interim` data folder and should be organized and moved to the `preprocessing` folder.
Follow the following 4 steps to get the right order of the `preprocessing folder`

### 0 
Unzip the `bird.zip` and `CUB_200_2011.tgz` to the `interim` directory and put everything in the base directory if you havn't already done so.

### 1
Move manually the `./images` with all the images folder to `preprocessing` folder

### 2
Move manually the `./text` folder with all the text captions for each image to the `preprocessing` folder

### 3
Saving the images as .csv file

In [5]:
images_df = pd.read_csv(os.path.join(dataset_interim_path, "images.txt"), sep=" ", names=["image_id", "path"])

In [6]:
images_df.head()

Unnamed: 0,image_id,path
0,1,001.Black_footed_Albatross/Black_Footed_Albatr...
1,2,001.Black_footed_Albatross/Black_Footed_Albatr...
2,3,001.Black_footed_Albatross/Black_Footed_Albatr...
3,4,001.Black_footed_Albatross/Black_Footed_Albatr...
4,5,001.Black_footed_Albatross/Black_Footed_Albatr...


In [7]:
# Create another column to hold the class id
images_df["class_id"] = images_df.apply(lambda p : int(p.path[:3]), axis=1)

# remove the .jpg from the end of the path
images_df["path"] = images_df.apply(lambda p : p.path[:-4], axis=1)

In [8]:
# Create another column to hold the train/test affiliation of the image
# 1 - The image is in the training set.
# 0 - the image is in the test set.
train_test_split = pd.read_csv(os.path.join(dataset_interim_path, "train_test_split.txt"), sep=" ", names=["img_id", "is_train"])
images_df["is_train"] = train_test_split.is_train

In [9]:
images_df.head()

Unnamed: 0,image_id,path,class_id,is_train
0,1,001.Black_footed_Albatross/Black_Footed_Albatr...,1,0
1,2,001.Black_footed_Albatross/Black_Footed_Albatr...,1,1
2,3,001.Black_footed_Albatross/Black_Footed_Albatr...,1,0
3,4,001.Black_footed_Albatross/Black_Footed_Albatr...,1,1
4,5,001.Black_footed_Albatross/Black_Footed_Albatr...,1,1


In [10]:
# Save the final unified csv file
images_df.to_csv(os.path.join(dataset_preprocess_path, "images.csv"))

#### 4
Saving the class name to class index mapping as csv file

In [11]:
classes_df = pd.read_csv(os.path.join(dataset_interim_path, "classes.txt"), sep=" ", names=["class_id", "class_name"])
classes_df.to_csv(os.path.join(dataset_preprocess_path, "classes.csv"))

### Some more metadata
- [ ] What is it used for?

In [94]:
with open (os.path.join(dataset_interim_path, "train", "filenames.pickle"), "rb") as f:
    train_filenames = pickle.load(f)

with open (os.path.join(dataset_interim_path, "train", "class_info.pickle"), "rb") as f1:
    train_class_info = pickle.load(f1, encoding="latin1")

with open (os.path.join(dataset_interim_path, "test", "filenames.pickle"), "rb") as f:
    test_filenames = pickle.load(f)

with open (os.path.join(dataset_interim_path, "test", "class_info.pickle"), "rb") as f:
    test_class_info = pickle.load(f)

In [111]:
# To use a pre-trained model:
# with open(os.path.join(dataset_interim_path, "captions.pickle"), "rb") as f:
#     captions = pickle.load(f)

## Dataset class to help load the data
Base path should be after the initial one-time ordering of the data folder.  
Ordered data that should be taken by this class should be in the `preprocessing` folder.

In [12]:
classes_df = pd.read_csv(os.path.join(dataset_preprocess_path, "classes.csv")).drop("Unnamed: 0", axis=1)

In [13]:
classes_df.head()

Unnamed: 0,class_id,class_name
0,1,001.Black_footed_Albatross
1,2,002.Laysan_Albatross
2,3,003.Sooty_Albatross
3,4,004.Groove_billed_Ani
4,5,005.Crested_Auklet


In [14]:
images_df = pd.read_csv(os.path.join(dataset_preprocess_path, "images.csv")).drop("Unnamed: 0", axis=1)

In [15]:
images_df.head()

Unnamed: 0,image_id,path,class_id,is_train
0,1,001.Black_footed_Albatross/Black_Footed_Albatr...,1,0
1,2,001.Black_footed_Albatross/Black_Footed_Albatr...,1,1
2,3,001.Black_footed_Albatross/Black_Footed_Albatr...,1,0
3,4,001.Black_footed_Albatross/Black_Footed_Albatr...,1,1
4,5,001.Black_footed_Albatross/Black_Footed_Albatr...,1,1


In [16]:
total_sampels = len(images_df)
num_train = images_df.is_train.sum()
num_test = total_sampels - num_train


In [17]:
print("There are {:,} training samples, and {:,} test samples".format(num_train, num_test))

There are 5,994 training samples, and 5,794 test samples


In [18]:
5994  + 5794

11788

In [19]:
train_df = images_df[images_df.is_train == 1].drop("is_train", axis=1)

In [20]:
train_df.head()

Unnamed: 0,image_id,path,class_id
1,2,001.Black_footed_Albatross/Black_Footed_Albatr...,1
3,4,001.Black_footed_Albatross/Black_Footed_Albatr...,1
4,5,001.Black_footed_Albatross/Black_Footed_Albatr...,1
6,7,001.Black_footed_Albatross/Black_Footed_Albatr...,1
7,8,001.Black_footed_Albatross/Black_Footed_Albatr...,1


In [21]:
image_transforms = transforms.Compose([transforms.Resize((224,224)), transforms.ToTensor()])

In [34]:
bird_dataset = BirdsDataset(dataset_preprocess_path, split="train", image_transform=image_transforms)

In [66]:
bird_dataset.vocabulary.itos[3]

'<pad>'

In [35]:
dataloader = DataLoader(bird_dataset, shuffle=True, batch_size=4)

In [37]:
batch = next(iter(dataloader))

In [38]:
batch

{'caption': tensor([[  1,  10, 213,   6,  13,  83,   7,   9,   5,  18, 164,   5,   8,   4,
           30,  14,   2,   2,   2,   2],
         [  1,  10,   6,   8,   4,  30,  71,  20,   4,  11,  24,   5,   4,  11,
           37,   2,   2,   2,   2,   2],
         [  1,  10,   6,  13,  49,   7,  33,   5,   8,   4,  43,  30,  14,   2,
            2,   2,   2,   2,   2,   2],
         [  1,   4,  16,  50,  49,   6,   7, 114,  49,  78,   5,   4,  45,   9,
           20,   2,   2,   2,   2,   2]]),
 'caption_length': tensor([15, 14, 12, 14]),
 'class_id': tensor([176,  45,  15, 153]),
 'image': tensor([[[[0.3843, 0.3961, 0.4000,  ..., 0.3216, 0.3294, 0.3412],
           [0.4000, 0.3961, 0.4000,  ..., 0.3137, 0.3333, 0.3294],
           [0.3882, 0.3882, 0.4039,  ..., 0.3216, 0.3176, 0.3137],
           ...,
           [0.4039, 0.5255, 0.6235,  ..., 0.2824, 0.2902, 0.2902],
           [0.4549, 0.5686, 0.5765,  ..., 0.2784, 0.2980, 0.2863],
           [0.4196, 0.5137, 0.5216,  ..., 0.3608, 0.309

In [39]:
print(batch["image"].shape)
print(batch["image_id"])
print(batch["class_id"])
print(batch["caption"][0])

torch.Size([4, 3, 224, 224])
tensor([10350,  2535,   790,  8970])
tensor([176,  45,  15, 153])
tensor([  1,  10, 213,   6,  13,  83,   7,   9,   5,  18, 164,   5,   8,   4,
         30,  14,   2,   2,   2,   2])


## Clean the bad text sentences
Run all these to clean bad lines when done creatign the preprocessing directory.  
similary, can just load the bad_files.pkl.gz and use it with the last cell

In [9]:
import os

In [15]:
import re

In [47]:
import joblib 

In [7]:
import sys
sys.path.insert(0, "../../")
from dataset.preprocessing import clean_text_data

In [39]:
collapse_unknown_chars_regex = re.compile(r'[��]+')
# collapse_unknown_chars_regex.findall("this��bird��has��a��yellow-green��belly,��a��black��crown��and��a��short,��pointy��bill.")

['��', '��', '��', '��', '��', '��', '��', '��', '��', '��', '��', '��', '��']

In [48]:
base_path = "/home/user_2/AttnGAN/datasets/cub200-2011/preprocessing/text"
collapse_unknown_chars_regex = re.compile(r'[��]+')
bad_files = []
for dirname in os.listdir(base_path):
    for filename in os.listdir(os.path.join(base_path, dirname)):
        full_filepath = os.path.join(base_path, dirname, filename)
        with open(full_filepath, 'r') as f:
            if any([collapse_unknown_chars_regex.findall(line) for line in f.readlines()]):
                bad_files.append(full_filepath)
joblib.dump(bad_files, "./list_of_bad_filenames.pkl.gz")

['./list_of_bad_filenames.pkl.gz']

In [51]:
len(bad_files)

56

In [52]:
collapse_unknown_chars_regex = re.compile(r'[��]+')
for bad_file in bad_files:
    new_lines = []
    with open(bad_file, "r") as f:
        bad_lines = f.readlines()
        for bad_line in bad_lines:
            new_lines.append(collapse_unknown_chars_regex.sub(' ', bad_line))
    os.remove(bad_file)
    with open(bad_file, "w") as f2:
        for good_line in new_lines:
            f2.write(good_line)

In [54]:
for file in bad_files:
    print(file)

/home/user_2/AttnGAN/datasets/cub200-2011/preprocessing/text/177.Prothonotary_Warbler/Prothonotary_Warbler_0083_173929.txt
/home/user_2/AttnGAN/datasets/cub200-2011/preprocessing/text/177.Prothonotary_Warbler/Prothonotary_Warbler_0106_174221.txt
/home/user_2/AttnGAN/datasets/cub200-2011/preprocessing/text/055.Evening_Grosbeak/Evening_Grosbeak_0022_37761.txt
/home/user_2/AttnGAN/datasets/cub200-2011/preprocessing/text/067.Anna_Hummingbird/Anna_Hummingbird_0070_56085.txt
/home/user_2/AttnGAN/datasets/cub200-2011/preprocessing/text/038.Great_Crested_Flycatcher/Great_Crested_Flycatcher_0016_29406.txt
/home/user_2/AttnGAN/datasets/cub200-2011/preprocessing/text/187.American_Three_toed_Woodpecker/American_Three_Toed_Woodpecker_0014_179882.txt
/home/user_2/AttnGAN/datasets/cub200-2011/preprocessing/text/009.Brewer_Blackbird/Brewer_Blackbird_0079_2343.txt
/home/user_2/AttnGAN/datasets/cub200-2011/preprocessing/text/009.Brewer_Blackbird/Brewer_Blackbird_0115_2279.txt
/home/user_2/AttnGAN/datase

### Unrelated

In [52]:
t = torch.randn([2,10])

In [57]:
t.shape[0]

2

In [51]:
torch.cat?

In [61]:
t.numpy()

array([[ 1.6461657 , -0.5135788 , -0.8496051 , -0.49303126, -0.26290402,
         1.7020756 ,  1.3532641 , -1.3082935 ,  0.01862434, -0.01803868],
       [ 0.6297136 , -1.4580991 , -0.11121239, -0.51820064,  1.6179426 ,
        -0.09860552, -1.5733247 , -0.8887586 , -0.5602951 , -0.09386935]],
      dtype=float32)