<a href="https://colab.research.google.com/github/Mozuha/Oxford-Flower-102-Prediction/blob/master/oxflower_split_images.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
import shutil
import tarfile

import pandas as pd
import requests
import scipy
from PIL import Image
from scipy import io
from sklearn.model_selection import train_test_split

In [None]:
# Root folder on the mounted Google Drive that holds every downloaded and
# generated file used by this notebook.
DataPath = '/content/drive/My Drive/data'
LabelsPath = os.path.join(DataPath, 'labels.csv')

# Create the data folder on first run; an existing one is left untouched.
# os.mkdir (not makedirs) is deliberate: a missing parent still raises.
try:
  os.mkdir(DataPath)
except FileExistsError:
  pass

### Get images and labels from URL

In [None]:
# Download the image archive into DataPath and unpack it there.
tgz_path = os.path.join(DataPath, '102flowers.tgz')
url = 'http://www.robots.ox.ac.uk/~vgg/data/flowers/102/102flowers.tgz'

# Stream the response so the archive is written chunk by chunk instead of
# being held in memory as a whole; the timeout stops a stalled connection
# from hanging the cell forever.
with requests.get(url, stream=True, timeout=60) as r_image:
  # Fail loudly on an HTTP error instead of saving an error page as the archive.
  r_image.raise_for_status()
  with open(tgz_path, 'wb') as f:
    for chunk in r_image.iter_content(chunk_size=1024 * 1024):
      f.write(chunk)

# The context manager closes the archive handle once extraction is done.
with tarfile.open(tgz_path, 'r') as tar:
  if hasattr(tarfile, 'data_filter'):
    # The archive comes from the network: where the running Python supports
    # extraction filters, reject members that would escape DataPath.
    tar.extractall(DataPath, filter='data')
  else:
    tar.extractall(DataPath)

In [None]:
# Download the MATLAB file with the per-image class labels into DataPath.
mat_path = os.path.join(DataPath, 'imagelabels.mat')
label_url = 'http://www.robots.ox.ac.uk/~vgg/data/flowers/102/imagelabels.mat'
# The timeout stops a stalled connection from hanging the cell forever.
r_label = requests.get(label_url, timeout=60)
# Fail loudly on an HTTP error instead of saving an error page as the .mat file.
r_label.raise_for_status()
with open(mat_path, 'wb') as f:
  f.write(r_label.content)

### Associate image, label index, and label name

In [None]:
# Build one table with a row per image: file name, label index, label name.
matdata = io.loadmat(mat_path)
labels = matdata['labels'][0]

# Image files are numbered from 1, zero-padded to five digits, in label order.
images = [f'image_{number:05}.jpg' for number in range(1, len(labels) + 1)]
image_label_df = pd.DataFrame({'image': images, 'label': labels})

# label_names.csv must already be present in DataPath; it is joined on the
# 'label' key to attach a name to every label index.
label_names_path = os.path.join(DataPath, 'label_names.csv')
label_names = pd.read_csv(label_names_path, index_col=0)
df = image_label_df.merge(label_names, how='left', on='label')

# Persist the combined table so later cells can reload it.
csv_path = os.path.join(DataPath, 'image_label_name.csv')
df.to_csv(csv_path)

### Split images into train data and test data

In [None]:
# Destination folders (inside DataPath) for the train and test image sets.
X_train_path, X_test_path = (
  os.path.join(DataPath, folder) for folder in ('X_train', 'X_test')
)

In [None]:
# Split the extracted images 80/20 into X_train_path and X_test_path.
labels = pd.read_csv(csv_path, index_col=0)
jpg_path = os.path.join(DataPath, 'jpg')

# os.listdir() returns entries in arbitrary order. Sorting makes the split
# reproducible for the fixed random_state and lines the zero-padded file
# names up with the rows of `labels`, which were written in image-number
# order (assumes jpg/ holds exactly the images listed in the CSV).
image_files = sorted(os.listdir(jpg_path))
X_train, X_test, Y_train, Y_test = train_test_split(image_files, labels['name'], test_size=0.2, random_state=0)

os.makedirs(X_train_path, exist_ok=True)
os.makedirs(X_test_path, exist_ok=True)

# Every file lands in exactly one of the two lists, so copy each list straight
# to its folder. A byte-for-byte copy avoids re-encoding (and degrading) the
# JPEGs and leaves no image file handles open.
for f in X_train:
  shutil.copyfile(os.path.join(jpg_path, f), os.path.join(X_train_path, f))
for f in X_test:
  shutil.copyfile(os.path.join(jpg_path, f), os.path.join(X_test_path, f))

### Split images into each category

In [None]:
# Move every training image into a sub-folder named after its flower category.

# Map image file name -> category name once, instead of filtering the
# DataFrame per file and parsing the array's string representation (which
# mangled names containing apostrophes such as "colt's foot").
name_by_image = dict(zip(df['image'], df['name']))

for f in os.listdir(X_train_path):
  src_path = os.path.join(X_train_path, f)
  # Skip category folders left by an earlier run, and files without a label
  # row (those previously resolved to an empty category and were deleted).
  if not os.path.isfile(src_path) or f not in name_by_image:
    continue
  category = str(name_by_image[f])
  category_path = os.path.join(X_train_path, category)
  os.makedirs(category_path, exist_ok=True)
  # Move the file as-is: no second JPEG re-encode and no open image handle.
  os.replace(src_path, os.path.join(category_path, f))

In [None]:
# Move every test image into a sub-folder named after its flower category.

# Map image file name -> category name once, instead of filtering the
# DataFrame per file and parsing the array's string representation (which
# mangled names containing apostrophes such as "colt's foot").
name_by_image = dict(zip(df['image'], df['name']))

for f in os.listdir(X_test_path):
  src_path = os.path.join(X_test_path, f)
  # Skip category folders left by an earlier run, and files without a label
  # row (those previously resolved to an empty category and were deleted).
  if not os.path.isfile(src_path) or f not in name_by_image:
    continue
  category = str(name_by_image[f])
  category_path = os.path.join(X_test_path, category)
  os.makedirs(category_path, exist_ok=True)
  # Move the file as-is: no second JPEG re-encode and no open image handle.
  os.replace(src_path, os.path.join(category_path, f))