# Classify images using random forests

In [62]:
from pathlib import Path
from PIL import Image

X = []
y = []

dogs_and_cats_images_training_path = Path("DogsAndCatsImages") / "training"

for i in range(1, 51):
  with Image.open(str(dogs_and_cats_images_training_path / ('cat.'+str(i)+'.jpg'))) as img:
      X.append(img.histogram())
      y.append(1)

for i in range(1, 51):
  with Image.open(str(dogs_and_cats_images_training_path / ('dog.'+str(i)+'.jpg'))) as img:
      X.append(img.histogram())
      y.append(2)

print(len(X))
print(len(X[0]))

100
768


In [63]:
# Standardize the data
from sklearn.preprocessing import StandardScaler

X_scaled = StandardScaler().fit_transform(X)

In [64]:
# Cross-validate a random forest classifier
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
import numpy as np

print(np.mean(cross_val_score(RandomForestClassifier(max_depth=4, random_state=10), X_scaled, y, cv=5)))

0.53


In [65]:
# Reduce dimensionality
from sklearn.decomposition import PCA

pca = PCA()
pca.fit(X_scaled)
print(pca.explained_variance_ratio_.cumsum())

[0.22835857 0.40977661 0.5457806  0.62881169 0.68974109 0.71952338
 0.7487644  0.77102386 0.79117244 0.81021697 0.82731917 0.8415295
 0.85467005 0.86620064 0.87667168 0.88695468 0.89667946 0.90510932
 0.91266292 0.91993872 0.92557235 0.93056193 0.93541631 0.93968215
 0.94384522 0.94767596 0.95114665 0.9543019  0.95733993 0.96014738
 0.96280187 0.96531222 0.96763773 0.9697956  0.97174497 0.97362125
 0.97531968 0.97695009 0.9783795  0.97978388 0.98112954 0.98239436
 0.98358406 0.98472947 0.98574155 0.98669875 0.9876021  0.98844765
 0.98927199 0.99001883 0.9907463  0.99141086 0.99207148 0.99267825
 0.9932162  0.99370587 0.99417406 0.99463722 0.99509402 0.99550804
 0.99586707 0.99621507 0.99653902 0.99683135 0.99709952 0.99734696
 0.99758359 0.99780159 0.99800472 0.99819773 0.99837129 0.99853055
 0.99867511 0.99881564 0.99894124 0.99905426 0.99916193 0.99925687
 0.99934544 0.99942641 0.99950532 0.99957222 0.99963735 0.99969628
 0.99974734 0.99979406 0.99983534 0.99987181 0.99990567 0.99993

In [66]:
pca = PCA(n_components=15, random_state=10)
X_pca = pca.fit_transform(X_scaled)

In [67]:
print(len(X_pca))
print(len(X_pca[0]))

100
15


In [68]:
# Cross-validate on the reduced features
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
import numpy as np

print(np.mean(cross_val_score(RandomForestClassifier(max_depth=4, random_state=10), X_pca, y, cv=5)))

0.56


# Challenge: predict the images in `test/`