In [1]:
import numpy as np
import os
import pandas as pd

from matplotlib import image as mpimg
from matplotlib import pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from PIL import Image, ImageOps
from sklearn.datasets import fetch_openml


def rgb2gray(rgb):
    """Reduce the trailing channel axis of ``rgb`` to one weighted-sum value.

    Only the first three entries of the last axis are used (extra channels
    are ignored). They are cast to float and combined with the fixed weights
    0.2989, 0.5870 and 0.1140, in that order.

    Returns an array with the same leading dimensions as ``rgb`` and the
    last axis removed.
    """
    channel_weights = [0.2989, 0.5870, 0.1140]
    first_three = rgb[..., :3].astype(float)
    return np.dot(first_three, channel_weights)

### MNIST Preprocessing

In [2]:
# Download MNIST from OpenML.
# `as_frame=False` forces plain NumPy arrays: recent scikit-learn releases
# otherwise return a DataFrame with its own column names, and re-wrapping that
# in `pd.DataFrame(X, columns=...)` with different names would select by name
# and yield NaN columns instead of relabelling the data.
# `version=1` pins the dataset so re-runs fetch the same data.
mnist = fetch_openml('mnist_784', version=1, as_frame=False)

# Divide by 255 so that (assuming 8-bit intensities) features lie in [0, 1].
X = mnist.data / 255.0
y = mnist.target
print(X.shape, y.shape)

(70000, 784) (70000,)


In [3]:
# Inspect the MNIST targets; the recorded output shows them stored as strings.
print(y)

['5' '0' '4' ... '4' '5' '6']


In [4]:
# Wrap the feature matrix in a DataFrame with one named column per feature,
# then append the targets as the final `label` column.
n_features = X.shape[1]
feat_cols = list(map('pixel{}'.format, range(n_features)))
df = pd.DataFrame(data=X, columns=feat_cols)
df.insert(loc=len(df.columns), column='label', value=y)

In [5]:
N = 10000
SEED = 42  # fixed seed so the same rows are drawn on every run

# Draw N random rows and persist them. Without `random_state` every
# Restart & Run All would write a different subset to the CSV.
df_subset = df.sample(N, random_state=SEED)
df_subset.to_csv('mnist_subset.csv')

In [6]:
# Sanity check of the sampled subset: print its (rows, columns) shape, then
# display the first rows as the cell output.
print(df_subset.shape)
df_subset.head()

(10000, 785)


Unnamed: 0,pixel0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,pixel9,...,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783,label
3698,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
27686,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4
34201,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2
6155,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9
29915,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5


### Caltech-101 Preprocessing

First, we rank the image classes by number of examples and look at the 10 largest (the datasets below are built from the top 5 of these).

In [7]:
# Count the entries in every category directory and rank categories by size.
dataset_root = 'caltech_101/101_ObjectCategories'

categories = []

for category in os.listdir(dataset_root):
    category_dir = os.path.join(dataset_root, category)
    # Skip stray files in the dataset root (e.g. .DS_Store): calling
    # os.listdir on a regular file raises NotADirectoryError.
    if not os.path.isdir(category_dir):
        continue
    num_examples = len(os.listdir(category_dir))
    categories.append((category, num_examples))

# Largest category first. Ties are broken by name so the ranking does not
# depend on the arbitrary order in which os.listdir returns entries.
top_categories = sorted(categories, key=lambda pair: (-pair[1], pair[0]))

print('images num', sum(pair[1] for pair in top_categories[:10]))
print(top_categories[:10])

images num 3379
[('airplanes', 800), ('Motorbikes', 798), ('Faces_easy', 435), ('Faces', 435), ('watch', 239), ('Leopards', 200), ('bonsai', 128), ('car_side', 123), ('ketch', 114), ('chandelier', 107)]


#### Grey images

In [11]:
# Build a grayscale dataset: every image of the five largest categories is
# converted to grayscale, resized to `image_size` and flattened into one row.
image_size = (32, 32)

X = []
y = []

for category_name, _num_examples in top_categories[:5]:
    cur_dir = f'caltech_101/101_ObjectCategories/{category_name}'

    # Sorted so the row order is reproducible; os.listdir order is arbitrary.
    for filename in sorted(os.listdir(cur_dir)):
        # The context manager closes the underlying file as soon as the
        # converted copy exists, instead of leaving it to garbage collection.
        with Image.open(os.path.join(cur_dir, filename)) as img:
            gray = ImageOps.grayscale(img).resize(image_size)
        X.append(np.array(gray).flatten())
        y.append(category_name)

In [12]:
# One column per pixel of the resized image, plus the class label; write the
# table to disk and show its shape and first rows.
n_pixels = np.prod(image_size)
feat_cols = list(map('pixel{}'.format, range(n_pixels)))
df = pd.DataFrame(data=X, columns=feat_cols).assign(label=y)
df.to_csv('caltech_101.csv')
print(df.shape)
df.head()

(2707, 1025)


Unnamed: 0,pixel0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,pixel9,...,pixel1015,pixel1016,pixel1017,pixel1018,pixel1019,pixel1020,pixel1021,pixel1022,pixel1023,label
0,255,164,104,82,133,131,104,130,89,71,...,70,68,69,70,67,66,63,56,143,airplanes
1,255,209,167,167,169,176,169,173,178,172,...,84,86,85,84,84,86,92,97,93,airplanes
2,255,255,255,134,51,94,85,79,16,16,...,117,119,121,122,126,119,141,252,255,airplanes
3,123,137,140,142,143,142,144,144,144,145,...,166,92,87,89,85,93,129,135,97,airplanes
4,255,138,130,117,45,68,102,58,60,64,...,84,91,85,77,75,69,65,56,108,airplanes


#### Separate channels for multi-view visualization

In [9]:
# Build three parallel datasets ("views"), one per colour channel: X[i] holds
# the flattened i-th channel of every 3-D image, y holds the shared labels.
image_size = (32, 32)

X = [[] for _ in range(3)]
y = []

for category_name, _num_examples in top_categories[:5]:
    cur_dir = f'caltech_101/101_ObjectCategories/{category_name}'

    # Sorted so the row order is reproducible; os.listdir order is arbitrary.
    for filename in sorted(os.listdir(cur_dir)):
        # The context manager closes the source file once the resized copy
        # has been turned into an array.
        with Image.open(os.path.join(cur_dir, filename)) as src:
            img = np.array(src.resize(image_size))

        # Images without a channel axis (2-D arrays) are skipped.
        if img.ndim != 3:
            continue

        for i in range(3):
            X[i].append(img[:, :, i].flatten())
        y.append(category_name)

In [10]:
# Every view shares the same pixel column names, so build them once; the
# comprehension uses its own index name to avoid shadowing the view index.
feat_cols = [f'pixel{pixel_idx}' for pixel_idx in range(np.prod(image_size))]

for i in range(3):
    # One CSV per channel view, each carrying the same label column.
    df = pd.DataFrame(data=X[i], columns=feat_cols).assign(label=y)
    df.to_csv(f'caltech_101_view{i}.csv')

#### All the RGB channels in one dataset

In [None]:
# Build a single dataset in which each row is the full flattened
# (height, width, 3) array of one resized image.
image_size = (32, 32)

X = []
y = []

for category_name, _num_examples in top_categories[:5]:
    cur_dir = f'caltech_101/101_ObjectCategories/{category_name}'

    # Sorted so the row order is reproducible; os.listdir order is arbitrary.
    for filename in sorted(os.listdir(cur_dir)):
        # The context manager closes the source file once the resized copy
        # has been turned into an array.
        with Image.open(os.path.join(cur_dir, filename)) as src:
            img = np.array(src.resize(image_size))

        # Keep only arrays with exactly three channels: 2-D images are
        # skipped as before, and a different channel count would produce
        # rows of a different length than the rest of the table.
        if img.ndim != 3 or img.shape[2] != 3:
            continue

        X.append(img.flatten())
        y.append(category_name)

In [None]:
# Each row of X is a flattened (height, width, channels) array, so it holds
# one value per pixel *per channel*. Naming only np.prod(image_size) columns
# makes pd.DataFrame raise because the column count does not match the data.
# Assumes three channels per image (RGB) -- TODO confirm against the loader.
n_channels = 3
n_features = int(np.prod(image_size)) * n_channels

# Values are interleaved per pixel by the C-order flatten, so column index
# = (row * width + col) * n_channels + channel.
feat_cols = [f'pixel{i}' for i in range(n_features)]
df = pd.DataFrame(X, columns=feat_cols)
df['label'] = y
df.to_csv('caltech_101_colored.csv')