# Research dataset
STAIR Captionsのデータセットとオリジナル（MS COCO）のデータセットを合わせたいので、そのための調査スクリプト

In [93]:
%matplotlib inline
import os
import pickle
import json
from pycocotools.coco import COCO
import numpy as np
import skimage.io as io
import matplotlib.pyplot as plt
import pylab
pylab.rcParams['figure.figsize'] = (8.0, 10.0)

In [2]:
TRAIN_IMAGES_DIRECTORY = "/home/nakachi/data/stair/train2014"
TRAIN_ANNOTATIONS_PATH = "/home/nakachi/data/stair/stair_captions_v1.2_train_tokenized.json"

# Index the STAIR training annotation file.
coco = COCO(TRAIN_ANNOTATIONS_PATH)

# Fetch every annotation attached to a single sample image id.
img_ids = 384553
annotation_ids = coco.getAnnIds(imgIds=img_ids)
print(annotation_ids)
annotations = coco.loadAnns(annotation_ids)

# Print the category name of each annotation.
# NOTE(review): this reads "category_id", which caption-style annotation
# records presumably do not carry (elsewhere in this notebook the caption
# file reports an empty category table) -- confirm before relying on it.
for i, annotation in enumerate(annotations):
    entity_id = annotation["category_id"]
    entity = coco.loadCats(entity_id)[0]["name"]
    print("{}: {}".format(i, entity))


# image_meta = coco.loadImgs(annotations[i]["image_id"])[0]
# image_path = os.path.join(TRAIN_IMAGES_DIRECTORY, image_meta["file_name"])

# I = io.imread(image_path)
# plt.imshow(I)
# coco.showAnns(annotations)
# plt.show()

loading annotations into memory...
Done (t=2.30s)
creating index...
index created!
[]


---

STAIRとオリジナルデータセットのアノテーションの差分を調べる。
差分があった場合、STAIRに合わせたデータセットをつくる

In [3]:
# Load the STAIR caption annotations and the COCO instance annotations
# (both are the 2014 validation files) so they can be compared.
STAIR_VAL_ANNOTATIONS_PATH = "/home/nakachi/data/stair/stair_captions_v1.2_val_tokenized.json"
INSTA_VAL_ANNOTATIONS_PATH = "/home/nakachi/data/coco2014/annotations/instances_val2014.json"
stair_val_coco = COCO(STAIR_VAL_ANNOTATIONS_PATH)
insta_val_coco = COCO(INSTA_VAL_ANNOTATIONS_PATH)

loading annotations into memory...
Done (t=0.68s)
creating index...
index created!
loading annotations into memory...
Done (t=4.88s)
creating index...
index created!


In [None]:
# Display the category table of the instance annotation file.
insta_val_coco.cats

In [4]:
# Compare how many images each annotation file indexes.
print('stair val img length:', len(stair_val_coco.imgs))
print('insta val img length:', len(insta_val_coco.imgs))

stair val img length: 40504
insta val img length: 40504


In [6]:
# Compare how many annotation records each file holds.
print('stair val anns length:', len(stair_val_coco.anns))
print('insta val anns length:', len(insta_val_coco.anns))

stair val anns length: 202520
insta val anns length: 291875


上記を見て分かるように、imgの数は合っているが、
アノテーションの数が違う。

stairは１画像に対して、5captionあるから合っている。

instaがわからんので調べる

In [7]:
# Count the instance annotations attached to each image id.
images_anns = {}
for value in insta_val_coco.anns.values():
    image_id = value['image_id']
    # First sighting of an image starts its counter at zero.
    images_anns.setdefault(image_id, 0)
    images_anns[image_id] += 1

In [None]:
# Display the per-image annotation counts.
images_anns

annotationがない画像もあるやんけ〜〜〜〜

しかも1画像あたりに対して、めっちゃannotationがあるのもある...

ないやつはシカトでOKだけど、あるやつどうしようかな

いや待て、これはinstaだからか！？
ここはカテゴリだけ、使えればよくて本来のcaptionの方を調べる

In [None]:
# Display the raw STAIR annotation records.
stair_val_coco.anns

---

captionの概観を掴む

In [12]:
# Load the STAIR captions next to the COCO caption annotations
# (both are the 2014 validation files) for a caption-to-caption comparison.
STAIR_VAL_ANNOTATIONS_PATH = "/home/nakachi/data/stair/stair_captions_v1.2_val_tokenized.json"
CAPTION_VAL_ANNOTATIONS_PATH = "/home/nakachi/data/coco2014/annotations/captions_val2014.json"
stair_val_coco = COCO(STAIR_VAL_ANNOTATIONS_PATH)
caption_val_coco = COCO(CAPTION_VAL_ANNOTATIONS_PATH)

loading annotations into memory...
Done (t=0.72s)
creating index...
index created!
loading annotations into memory...
Done (t=0.33s)
creating index...
index created!


In [13]:
# Display the category table of the COCO caption annotation file.
caption_val_coco.cats

{}

In [14]:
# Compare the number of caption records in the STAIR and COCO caption files.
print('stair val anns length:', len(stair_val_coco.anns))
print('caption val anns length:', len(caption_val_coco.anns))

stair val anns length: 202520
caption val anns length: 202654


In [15]:
# Number of caption annotations per image.
images_anns = {}
for value in caption_val_coco.anns.values():
    image_id = value['image_id']
    # First sighting of an image starts its counter at zero.
    images_anns.setdefault(image_id, 0)
    images_anns[image_id] += 1

In [17]:
# At least 5 captions per image are required, so report every image that
# has fewer than that.
for k in images_anns:
    v = images_anns[k]
    if v >= 5:
        continue
    print(k, v)

---

カテゴリ別でデータセットを分けたい。
そのために以下のような辞書を作る
```
cat_img_dic = {id: {'name': 'person', 'image_ids': [...]},
               id: {'name': 'person', 'image_ids': [...]},
               ...
              }

```
また、最初は1 imageにつき1 categoryにしようと思ったが、
別にそうしないことにした。（たくさんあった方がいい）

ただし、image_idsはユニークをとる

In [122]:
# Load the COCO 2014 validation instance annotations.
INSTA_VAL_ANNOTATIONS_PATH = "/home/nakachi/data/coco2014/annotations/instances_val2014.json"
insta_val_coco = COCO(INSTA_VAL_ANNOTATIONS_PATH)

loading annotations into memory...
Done (t=5.48s)
creating index...
index created!


In [124]:
# Load the COCO 2014 training instance annotations.
INSTA_TRAIN_ANNOTATIONS_PATH = "/home/nakachi/data/coco2014/annotations/instances_train2014.json"
insta_train_coco = COCO(INSTA_TRAIN_ANNOTATIONS_PATH)

loading annotations into memory...
Done (t=11.33s)
creating index...
index created!


In [125]:
# Number of categories defined in the training instance annotations.
len(insta_train_coco.cats)

80

In [54]:
# Start with one entry per category: its name plus an empty image-id list.
cat_img_dic = {
    category['id']: {'name': category['name'], 'image_ids': []}
    for category in insta_val_coco.cats.values()
}

In [55]:
# Next, append each annotation's image id to the entry of its category.
# NOTE: a single image can be annotated with several categories.
for annotation in insta_val_coco.anns.values():
    cat_img_dic[annotation['category_id']]['image_ids'].append(annotation['image_id'])

In [56]:
# De-duplicate the image ids of every category; the entry under key 1 is
# printed before and after as a sanity check.
person_entry = cat_img_dic[1]
print('減る前の確認. personの数は= ', len(person_entry['image_ids']))

for entry in cat_img_dic.values():
    entry['image_ids'] = list(set(entry['image_ids']))

print('減ったか確認. personの数は= ', len(person_entry['image_ids']))

減る前の確認. personの数は=  88153
減ったか確認. personの数は=  21634


In [None]:
# Done -- now report how many images each category ended up with.
for category_id in cat_img_dic:
    entry = cat_img_dic[category_id]
    image_count = len(entry['image_ids'])
    print('カテゴリ名 = ', entry['name'])
    print('imageの数 = ', image_count)
    print('--------------------')

---

実際に分けていく。valとtrainで分ける。
1. cat_img_dicを作成する
1. caption.jsonを読み込んで、対応するimageの名前とcaptionに対応した辞書を作る
1. annotationをappendする

```
category_split
├── person_en
│   ├── train_filenames.pickle
│   ├── val_filenames.pickle
│   └── text
│       └── COCO_VAL_0000.txt
└── person_ja
    ├── train_filenames.pickle
    ├── val_filenames.pickle
    └── text
        └── COCO_VAL_0000.txt
```


In [135]:
def make_cat_images_dic(insta_path):
    '''
    Build a mapping from category id to the category name and its image ids.

    Args:
        insta_path: Path of an instance annotation JSON file; it is passed
            straight to ``COCO(...)``.

    Returns:
        dict: ``{category_id: {'name': category_name, 'image_ids': [...]}}``.
        Every ``image_ids`` list contains each image id that has at least
        one annotation of that category, without duplicates (the order is
        whatever the underlying set yields). Categories without any
        annotation keep an empty list.

    Raises:
        KeyError: If an annotation refers to a category id that is not
            present in the category table.
    '''
    insta_coco = COCO(insta_path)

    # Collect ids into sets right away: one image may carry many annotations
    # of the same category, and a set avoids storing those repeats first.
    image_ids_by_category = {cat['id']: set() for cat in insta_coco.cats.values()}

    # NOTE: a single image can be annotated with several categories.
    for annotation in insta_coco.anns.values():
        image_ids_by_category[annotation['category_id']].add(annotation['image_id'])

    return {
        cat['id']: {
            'name': cat['name'],
            'image_ids': list(image_ids_by_category[cat['id']]),
        }
        for cat in insta_coco.cats.values()
    }

In [136]:
def make_image_captions_dic(caption_path):
    '''
    Build a mapping from image id to the image name and its captions.

    Args:
        caption_path: Path of a UTF-8 JSON file with an ``images`` list
            (items carrying ``id`` and ``file_name``) and an ``annotations``
            list (items carrying ``image_id`` plus ``tokenized_caption``
            and/or ``caption``).

    Returns:
        dict: ``{image_id: {'name': name, 'captions': [...]}}`` where
        ``name`` is the file name without a trailing ``.jpg`` and
        ``captions`` holds the caption texts in file order. When an
        annotation has both fields, ``tokenized_caption`` is used.
        Annotations with neither field are skipped.

    Raises:
        KeyError: If a captioned annotation refers to an image id that is
            not listed under ``images``.
    '''
    with open(caption_path, encoding='utf-8') as f:
        json_data = json.load(f)

    image_captions = {}
    for image in json_data["images"]:
        name = image["file_name"]
        # Strip only the trailing extension; str.replace would also remove
        # a ".jpg" that happens to appear in the middle of the name.
        if name.endswith('.jpg'):
            name = name[:-len('.jpg')]
        image_captions[image["id"]] = {'name': name, 'captions': []}

    for annotation in json_data['annotations']:
        if 'tokenized_caption' in annotation:
            caption = annotation['tokenized_caption']
        elif 'caption' in annotation:
            caption = annotation['caption']
        else:
            # Nothing to record for annotations that carry no caption text.
            continue
        image_captions[annotation['image_id']]['captions'].append(caption)

    return image_captions

In [137]:
def make_text_data(image_ids, image_captions_dic, text_dir_path):
    '''
    Write one UTF-8 text file per image id into ``text_dir_path``.

    Each file is named ``<name>.txt`` after the image's ``'name'`` entry and
    contains that image's captions joined by newlines.
    '''
    for current_id in image_ids:
        entry = image_captions_dic[current_id]
        body = "\n".join(entry['captions'])
        target_path = os.path.join(text_dir_path, entry['name'] + '.txt')
        with open(target_path, 'w', encoding='utf-8') as out_file:
            out_file.write(body)

In [138]:
def make_filenames_pickle(image_ids, image_captions_dic, data_dir_path, train_or_val):
    '''
    Pickle the list of image names for ``image_ids``.

    The names are looked up in ``image_captions_dic`` in the order given and
    dumped to ``<data_dir_path>/<train_or_val>_filenames.pickle``.
    '''
    names = [image_captions_dic[image_id]['name'] for image_id in image_ids]

    pickle_path = os.path.join(data_dir_path, train_or_val + '_filenames.pickle')
    with open(pickle_path, 'wb') as out_file:
        pickle.dump(names, out_file)

### validation data

In [139]:
# Root directory that receives the per-category output directories.
SPLIT_DIR_PATH = '/home/nakachi/data/category_split'

In [140]:
# Annotation files for the validation split.
# NOTE(review): the STAIR file is read from /data/Users/nakachi/stair here,
# while other cells read it from /home/nakachi/data/stair -- confirm which
# location is the intended one.
val_insta_path = '/home/nakachi/data/coco2014/annotations/instances_val2014.json'
val_en_caption_path = '/home/nakachi/data/coco2014/annotations/captions_val2014.json'
val_ja_caption_path = '/data/Users/nakachi/stair/stair_captions_v1.2_val_tokenized.json'

# Category -> image ids, plus image id -> captions for English and Japanese.
val_cat_images_dic = make_cat_images_dic(val_insta_path)
val_en_image_captions_dic = make_image_captions_dic(val_en_caption_path)
val_ja_image_captions_dic = make_image_captions_dic(val_ja_caption_path)

loading annotations into memory...
Done (t=7.36s)
creating index...
index created!


In [141]:
# For every category, write the caption text files and the 'val' filename
# pickle -- first into "<name>_en", then into "<name>_ja".
for cat_images in val_cat_images_dic.values():
    category_image_ids = cat_images['image_ids']
    language_sources = (
        ('_en', val_en_image_captions_dic),  # en: original captions
        ('_ja', val_ja_image_captions_dic),  # ja: STAIR captions
    )
    for suffix, captions_dic in language_sources:
        data_dir = os.path.join(SPLIT_DIR_PATH, cat_images['name'] + suffix)
        text_dir = os.path.join(data_dir, 'text')
        os.makedirs(text_dir, exist_ok=True)
        make_text_data(category_image_ids, captions_dic, text_dir)
        make_filenames_pickle(category_image_ids, captions_dic, data_dir, 'val')

### train data

In [142]:
# Annotation files for the training split.
# NOTE(review): the STAIR file is read from /data/Users/nakachi/stair here,
# while other cells read it from /home/nakachi/data/stair -- confirm which
# location is the intended one.
train_insta_path = '/home/nakachi/data/coco2014/annotations/instances_train2014.json'
train_en_caption_path = '/home/nakachi/data/coco2014/annotations/captions_train2014.json'
train_ja_caption_path = '/data/Users/nakachi/stair/stair_captions_v1.2_train_tokenized.json'

# Category -> image ids, plus image id -> captions for English and Japanese.
train_cat_images_dic = make_cat_images_dic(train_insta_path)
train_en_image_captions_dic = make_image_captions_dic(train_en_caption_path)
train_ja_image_captions_dic = make_image_captions_dic(train_ja_caption_path)

loading annotations into memory...
Done (t=10.85s)
creating index...
index created!


In [143]:
# For every category, write the caption text files and the 'train' filename
# pickle -- first into "<name>_en", then into "<name>_ja".
for cat_images in train_cat_images_dic.values():
    category_image_ids = cat_images['image_ids']
    language_sources = (
        ('_en', train_en_image_captions_dic),  # en: original captions
        ('_ja', train_ja_image_captions_dic),  # ja: STAIR captions
    )
    for suffix, captions_dic in language_sources:
        data_dir = os.path.join(SPLIT_DIR_PATH, cat_images['name'] + suffix)
        text_dir = os.path.join(data_dir, 'text')
        os.makedirs(text_dir, exist_ok=True)
        make_text_data(category_image_ids, captions_dic, text_dir)
        make_filenames_pickle(category_image_ids, captions_dic, data_dir, 'train')

---

できた！！！
次はデータの統計を掴む

In [148]:
import pandas as pd

In [155]:
# Build [category name, image count] rows for each split so they can be
# loaded into pandas.
train_list = [
    [entry['name'], len(entry['image_ids'])]
    for entry in train_cat_images_dic.values()
]
val_list = [
    [entry['name'], len(entry['image_ids'])]
    for entry in val_cat_images_dic.values()
]

train_df = pd.DataFrame(train_list)
train_df.columns = ['name', 'number of image']

val_df = pd.DataFrame(val_list)
val_df.columns = ['name', 'number of image']

In [159]:
# Training split: categories ordered from most to fewest images.
train_df_s = train_df.sort_values('number of image', ascending=False)
print(train_df_s)

             name  number of image
0          person            45174
56          chair             8950
2             car             8606
60   dining table             8378
41            cup             6518
..            ...              ...
76       scissors              673
21           bear              668
12  parking meter              481
70        toaster              151
78     hair drier              128

[80 rows x 2 columns]


In [160]:
# Validation split: categories ordered from most to fewest images.
val_df_s = val_df.sort_values('number of image', ascending=False)
print(val_df_s)

             name  number of image
0          person            21634
56          chair             4404
2             car             4180
60   dining table             3960
41            cup             3061
..            ...              ...
21           bear              341
76       scissors              302
12  parking meter              261
70        toaster               74
78     hair drier               70

[80 rows x 2 columns]
