<a href="https://colab.research.google.com/github/shubhomb/greenstand_data_analysis/blob/master/imnet/imnet_visualizations.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Authentication and Dependencies

In [None]:

# how to access GDrive https://colab.research.google.com/notebooks/io.ipynb#scrollTo=RWSJpsyKqHjH
from google.colab import files, drive
import os
# Mount Google Drive and remember where "My Drive" lives.
# The Drive root is derived from the mount point rather than from os.getcwd(), so it
# stays correct even if the working directory is not /content, and it agrees with the
# absolute /content/drive/My Drive paths used for the data folders in this notebook.
mount_point = '/content/drive'
drive.mount(mount_point)
gdir = os.path.join(mount_point, "My Drive")


Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/drive


### Imports

In [None]:
import os
import numpy as np
from PIL import Image, ImageDraw
from xml.etree import ElementTree
import matplotlib.pyplot as plt
import pandas as pd

### ImageNet Data Directory and Bounding Box Parsing

In [None]:
# Data roots on the mounted Drive.
path = "/content/drive/My Drive/data/imnet"
test_path = "/content/drive/My Drive/data/test_greenstand_samples"
# Images and bounding-box files live in two sibling folders under `path`.
img_dir, bb_dir = (
    os.path.join(path, subfolder)
    for subfolder in ("original_images", "bounding_boxes")
)


# Class name -> synset ID string.
# Each key is also used as the class's sub-folder name under `img_dir` and as the
# substring that assigns a bounding-box file path to a class, so keys must match the
# on-disk names exactly (including case, e.g. "Logwood").
# NOTE(review): the values look like ImageNet/WordNet noun synset IDs — confirm.
synsets = {
    "judas": "n12513613",
    "palm": "n12582231",
    "pine": "n11608250",
    "china tree": "n12741792",
    "fig": "n12401684",
    "cabbage": "n12478768",
    "cacao": "n12201580",
    "kapok": "n12190410",
    "iron": "n12317296",
    "linden": "n12202936",
    "pepper": "n12765115",
    "rain": "n11759853",
    "dita": "n11770256",
    "alder": "n12284262",
    "silk": "n11759404",
    "coral": "n12527738",
    "huisache": "n11757851",
    "fringe": "n12302071",
    "dogwood": "n12946849",
    "cork": "n12713866",
    "ginkgo": "n11664418",
    "golden shower": "n12492106",
    "balata": "n12774299",
    "baobab": "n12189987",
    "sorrel": "n12242409",
    "Japanese pagoda": "n12570394",
    "Kentucky coffee": "n12496427",
    "Logwood": "n12496949"
}

# Collect image file paths per class. Every class name in `synsets` must exist as a
# sub-folder of `img_dir`; os.listdir raises if one is missing.
classes = list(synsets.keys())
imgs = []        # flat list of all image paths, grouped in class order
class_imgs = {}  # class name -> sorted list of that class's image paths
for cls_name in classes:
  cls_dir = os.path.join(img_dir, cls_name)
  # os.path.splitext returns the extension WITH its leading dot (".tar"), so the
  # comparison has to include the dot. Comparing against "tar" never matched, which
  # let tar archives through into the image list.
  cls_paths = [
      os.path.join(cls_dir, fname)
      for fname in sorted(os.listdir(cls_dir))
      if os.path.splitext(fname)[1].lower() != ".tar"
  ]
  class_imgs[cls_name] = cls_paths
  imgs += cls_paths
# Build {annotation file path: (xmin, ymin, xmax, ymax)} from every .xml file found
# anywhere under `bb_dir`. Only the first <object> element of each file is read.
bb_dict = {}
box_tags = ("xmin", "ymin", "xmax", "ymax")
for folder, _subdirs, filenames in os.walk(bb_dir):
  for filename in filenames:
    if os.path.splitext(filename)[1] != ".xml":
      continue
    xml_path = os.path.join(folder, filename)
    bndbox = ElementTree.parse(xml_path).getroot().find("object").find("bndbox")
    # Coordinates are stored as text in the XML; convert each to int.
    bb_dict[xml_path] = tuple(int(bndbox.find(tag).text) for tag in box_tags)



In [None]:
# Per class: [number of bounding-box files, number of images].
counts_by_class = {}
totimgs = 0
totboxed = 0
for k in synsets:
  # A bounding-box file is attributed to class k when k occurs anywhere in its path.
  n_boxed = sum(1 for c in bb_dict if k in c)
  n_imgs = len(class_imgs[k])
  totboxed += n_boxed
  totimgs += n_imgs
  counts_by_class[k] = [n_boxed, n_imgs]
print ("Total number of images: ", totimgs)
print (len(bb_dict), " bounding boxed images")

# Sanity checks: every box file was attributed to exactly one class, and the per-class
# image lists add up to the flat `imgs` list.
assert len(bb_dict) == totboxed
assert len(imgs) == totimgs

# One row per class, columns in the order the counts were stored above.
df_counts = pd.DataFrame(counts_by_class).T
df_counts.columns = ["box labeled", "total"]

## See image/label frequencies

In [None]:
# Show the per-class counts table ("box labeled" vs "total") as the cell output.
df_counts

In [None]:
# Grouped bar chart: for every class, the box-labeled count sits next to the total count.
fig, ax = plt.subplots(figsize=(20,10))
width = 0.35
x = np.arange(len(df_counts.index))
half_width = width / 2
rects1 = ax.bar(x - half_width, df_counts["box labeled"], width, label='Box Labeled')
rects2 = ax.bar(x + half_width, df_counts["total"], width, label='Total')


# Axis label, title, and one rotated tick label per class.
ax.set(ylabel='Frequency', title='Count ImageNet Data')
ax.set_xticks(x)
ax.set_xticklabels(df_counts.index, rotation=45)
ax.legend()

def autolabel(rects, axes=None):
    """Attach a text label above each bar in *rects*, displaying its height.

    rects: iterable of bar patches (for example the container returned by
        ``ax.bar``); each item must provide ``get_height()``, ``get_x()`` and
        ``get_width()``.
    axes: object whose ``annotate`` method draws the labels. When omitted, the
        notebook-level ``ax`` is used, which keeps existing one-argument calls
        working unchanged.
    """
    # Only fall back to the global `ax` when no target was passed explicitly.
    target = ax if axes is None else axes
    for rect in rects:
        height = rect.get_height()
        target.annotate('{}'.format(height),
                        # anchor at the horizontal centre of the bar's top edge
                        xy=(rect.get_x() + rect.get_width() / 2, height),
                        xytext=(0, 3),  # 3 points vertical offset
                        textcoords="offset points",
                        ha='center', va='bottom')

# Write the height above every bar in both groups, then tighten the layout and render.
for bar_group in (rects1, rects2):
    autolabel(bar_group)

fig.tight_layout()
plt.show()

In [None]:
import cv2
# Read every image path with OpenCV and resize it to 64x64.
# NOTE: `imgs` is rebound from the list of paths to the list of resized images, so
# re-running this cell requires re-running the cell that builds the path list first.
resized = []
for img_path in imgs:
  resized.append(cv2.resize(cv2.imread(img_path), (64, 64)))
imgs = resized

In [None]:
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA

# t-SNE reducer configured for a 2-component embedding; random_state is fixed at 0.
tsne = TSNE(n_components=2, random_state=0)
