Team: Laurel Newman and Dhruv Sawhney

## Dog Breed Identification

In [None]:
# Put these at the top of every notebook, to get automatic reloading and inline plotting
# - autoreload in mode 2 re-imports edited modules before each cell runs, so
#   changes to library code are picked up without restarting the kernel
# - matplotlib inline renders figures directly below the cell that draws them
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [None]:
# This file contains all the main external libs we'll use
from fastai.imports import *

In [None]:
from fastai.transforms import *
from fastai.conv_learner import *
from fastai.model import *
from fastai.dataset import *
from fastai.sgdr import *
from fastai.plots import *

`PATH` is the path to your data—if you use the recommended setup approaches from the lesson, you won't need to change this. `sz` is the size that the images will be resized to in order to ensure that the training runs quickly. We'll be talking about this parameter a lot during the course. Leave it at `224` for now.

In [None]:
# Notebook-wide settings; every later cell reads these three names.
# Root of the competition data. Keep the trailing '/': file names are
# appended to it directly when paths are built.
PATH = "../../data/competitions/dog-breed-identification/"
# Size the images are resized to before they reach the network.
sz = 224
# Pretrained architecture used as the model backbone.
arch = resnet34

## First look inside data directory

In [None]:
# list the top-level entries of the competition data directory to confirm the
# archives were downloaded and extracted
os.listdir(PATH)

['test',
 'labels.csv.zip',
 'test.zip',
 'models',
 'sample_submission.csv.zip',
 'sample_submission.csv',
 'train',
 'labels.csv',
 'train.zip',
 'tmp']

## Training the Model

In [None]:
label_csv = f'{PATH}labels.csv'
# Count the labelled images: every line of labels.csv except the header row.
# The `with` block closes the file as soon as counting is done instead of
# leaving an open handle behind for the garbage collector.
with open(label_csv) as label_file:
    n = sum(1 for _ in label_file) - 1
# gets indices of a random 20% of the training data to use as validation data
# (the indices refer to rows of labels.csv)
val_idxs = get_cv_idxs(n)

In [None]:
# from_csv handles the internals of mapping the labels and indices to the jpg
# training images (note that it also considers our random validation images)
# We used https://medium.com/@hiromi_suenaga/deep-learning-2-part-1-lesson-2-eeae2edd2be4
# to understand the library
data = ImageClassifierData.from_csv(
    PATH,
    'train',
    label_csv,                       # same labels file that `n` / `val_idxs` were derived from
    test_name='test',                # also load the unlabeled images for the submission
    val_idxs=val_idxs,
    suffix='.jpg',
    tfms=tfms_from_model(arch, sz),  # transforms matched to the chosen architecture and size
)

In [None]:
# We tried five learning rates and settled on this one as working the best within
# a reasonably small number of epochs
# Our benchmark was two things: high accuracy (to prevent underfitting) and
# minimal change from training loss to validation loss (to prevent overfitting)
# NOTE(review): precompute=True presumably trains on cached activations of the
# pretrained layers, so only the newly added head is fitted -- confirm against
# the fastai documentation
learn = ConvLearner.pretrained(arch, data, precompute=True)
# positional arguments: learning rate 0.01, then 5 epochs
learn.fit(0.01, 5)

HBox(children=(IntProgress(value=0, description='Epoch', max=5), HTML(value='')))

epoch      trn_loss   val_loss   accuracy   
    0      2.144187   1.068618   0.76272   
    1      1.084871   0.695777   0.812622  
    2      0.767042   0.607577   0.82045   
    3      0.647296   0.566311   0.829256  
    4      0.575839   0.539291   0.830724  



[array([0.53929]), 0.8307240702168116]

## Predicting on test data and writing it out


In [None]:
# check the format of the data filenames (expected: 'test/<image id>.jpg');
# the ids for the submission file are derived from these names further down
data.test_ds.fnames[:10]

['test/a9a33f0585e5af26dffd6dc0a00ba88f.jpg',
 'test/2e1d705a46d57f75f2572a9e23b162d6.jpg',
 'test/332eff8d1e8f55a0e7335a1552f681ef.jpg',
 'test/74b133bbc2d4ece626394bdec13d702a.jpg',
 'test/b7e6c5746e5fc25b72ef66a61bee779a.jpg',
 'test/17f840f8bfc7ef60eec37f9ee52c9517.jpg',
 'test/da11b66ad92ddfc032e4ec4898fc4757.jpg',
 'test/c3bfcaf712a23d793c02737a9007a971.jpg',
 'test/4bed4a04a5044e14f3dc575968265fa0.jpg',
 'test/463584a0691d71068ed4f92632aef176.jpg']

Let's make predictions for that test data.  Remember that we're given log probabilities which we must 
exponentiate in order to yield true probabilities.

In [None]:
# Run the trained model over the test images.
# NOTE(review): `y` is not used in the visible part of the notebook; it is
# presumably a placeholder because test images have no labels -- confirm
log_preds, y = learn.predict_with_targs(is_test=True) # use test dataset rather than validation dataset
# the predictions are log probabilities, so exponentiate to get probabilities
probs = np.exp(log_preds)

In [None]:
# sanity check: expect one row per test image and one column per breed
probs.shape # (n_images, n_classes)

(10357, 120)

In [None]:
# check the labels for all the classes; these names are used below as the
# column headers of the submission DataFrame
# NOTE(review): assumes this order matches the column order of `probs` -- confirm
data.classes

['affenpinscher',
 'afghan_hound',
 'african_hunting_dog',
 'airedale',
 'american_staffordshire_terrier',
 'appenzeller',
 'australian_terrier',
 'basenji',
 'basset',
 'beagle',
 'bedlington_terrier',
 'bernese_mountain_dog',
 'black-and-tan_coonhound',
 'blenheim_spaniel',
 'bloodhound',
 'bluetick',
 'border_collie',
 'border_terrier',
 'borzoi',
 'boston_bull',
 'bouvier_des_flandres',
 'boxer',
 'brabancon_griffon',
 'briard',
 'brittany_spaniel',
 'bull_mastiff',
 'cairn',
 'cardigan',
 'chesapeake_bay_retriever',
 'chihuahua',
 'chow',
 'clumber',
 'cocker_spaniel',
 'collie',
 'curly-coated_retriever',
 'dandie_dinmont',
 'dhole',
 'dingo',
 'doberman',
 'english_foxhound',
 'english_setter',
 'english_springer',
 'entlebucher',
 'eskimo_dog',
 'flat-coated_retriever',
 'french_bulldog',
 'german_shepherd',
 'german_short-haired_pointer',
 'giant_schnauzer',
 'golden_retriever',
 'gordon_setter',
 'great_dane',
 'great_pyrenees',
 'greater_swiss_mountain_dog',
 'groenendael',


In [None]:
# peek at the probability matrix; NumPy abbreviates large arrays with "..."
probs

array([[0.00002, 0.00001, 0.00251, ..., 0.00005, 0.00005, 0.00001],
       [0.00039, 0.00131, 0.00001, ..., 0.01253, 0.00017, 0.00002],
       [0.00008, 0.00018, 0.     , ..., 0.00002, 0.00004, 0.00003],
       ...,
       [0.     , 0.     , 0.     , ..., 0.     , 0.     , 0.     ],
       [0.     , 0.00014, 0.     , ..., 0.00028, 0.00055, 0.00001],
       [0.00025, 0.00002, 0.05882, ..., 0.00086, 0.83517, 0.00008]], dtype=float32)

We create a Pandas DataFrame from the dog probabilities. We name the columns after each dog class as specified in the Kaggle sample submission CSV file.

In [None]:
# One row per test image and one column per breed, with the columns labelled
# by class name in a single constructor call.
df = pd.DataFrame(probs, columns=data.classes)

<class 'numpy.ndarray'>
<class 'pandas.core.frame.DataFrame'>
(10357, 120)
   affenpinscher  afghan_hound  african_hunting_dog  airedale  \
0       0.000013      0.000020             0.000006  0.002514   
1       0.000003      0.000390             0.001307  0.000009   
2       0.000118      0.000081             0.000184  0.000004   
3       0.000001      0.000048             0.000031  0.000028   
4       0.001611      0.001757             0.000010  0.000013   

   american_staffordshire_terrier   appenzeller  australian_terrier   basenji  \
0                    4.386695e-05  3.392660e-05            0.000005  0.000008   
1                    8.680340e-03  5.044078e-05            0.000039  0.000167   
2                    9.590101e-06  6.962547e-05            0.000008  0.000009   
3                    5.549327e-07  7.073843e-07            0.000003  0.000001   
4                    2.436390e-05  7.664322e-06            0.001042  0.000005   

         basset        beagle        ...       

We want a new first column named *id* that has the ID extracted from each test image's filename.

In [None]:
# Derive each image id from its file name: drop the directory part and the
# extension ('test/abc123.jpg' -> 'abc123'). Using os.path avoids hard-coding
# the length of the 'test/' prefix and of the '.jpg' suffix.
test_ids = [os.path.splitext(os.path.basename(fname))[0]
            for fname in data.test_ds.fnames]
# DataFrame.insert raises ValueError if the column already exists, which made
# this cell fail when it was run a second time. Overwrite the column in that
# case so the cell can be re-run safely.
if 'id' in df.columns:
    df['id'] = test_ids
else:
    df.insert(0, 'id', test_ids)

In [None]:
# check the output: `id` should be the first column, followed by one
# probability column per breed
df.head()

Unnamed: 0,id,affenpinscher,afghan_hound,african_hunting_dog,airedale,american_staffordshire_terrier,appenzeller,australian_terrier,basenji,basset,...,toy_poodle,toy_terrier,vizsla,walker_hound,weimaraner,welsh_springer_spaniel,west_highland_white_terrier,whippet,wire-haired_fox_terrier,yorkshire_terrier
0,a9a33f0585e5af26dffd6dc0a00ba88f,1.3e-05,2e-05,6e-06,0.002514,4.386695e-05,3.39266e-05,5e-06,8e-06,2.513368e-07,...,9.040908e-07,4.101499e-07,0.000105,3.339394e-06,5.4e-05,1e-06,3.372458e-06,5.4e-05,5.188238e-05,6e-06
1,2e1d705a46d57f75f2572a9e23b162d6,3e-06,0.00039,0.001307,9e-06,0.00868034,5.044078e-05,3.9e-05,0.000167,0.0002605283,...,1.864011e-05,0.001740458,0.000198,0.0009494134,0.041927,0.000301,0.003541379,0.012531,0.0001665042,2.2e-05
2,332eff8d1e8f55a0e7335a1552f681ef,0.000118,8.1e-05,0.000184,4e-06,9.590101e-06,6.962547e-05,8e-06,9e-06,0.002177278,...,3.12703e-05,1.126626e-05,0.000249,0.003660477,0.001071,0.000128,2.316688e-05,1.8e-05,3.881233e-05,2.9e-05
3,74b133bbc2d4ece626394bdec13d702a,1e-06,4.8e-05,3.1e-05,2.8e-05,5.549327e-07,7.073843e-07,3e-06,1e-06,9.66099e-07,...,1.577144e-06,1.959567e-06,0.000298,3.023067e-07,2.5e-05,0.000351,2.569704e-07,3e-06,8.560021e-07,4e-06
4,b7e6c5746e5fc25b72ef66a61bee779a,0.001611,0.001757,1e-05,1.3e-05,2.43639e-05,7.664322e-06,0.001042,5e-06,3.4673e-05,...,0.0003210986,2.747978e-05,3.1e-05,6.789387e-05,4.4e-05,9e-06,1.268868e-05,4.5e-05,3.465577e-05,0.00677


We write the submission into the *out* directory as a gzip-compressed CSV file named `dogBreedIdentification_simple.gz`.

In [None]:
# Output directory for submission files (created on first use).
SUBM = '../../out/'
os.makedirs(SUBM, exist_ok=True)
# Save the predictions as a gzip-compressed CSV; index=False leaves out the
# DataFrame's row index so the file starts with the `id` column.
df.to_csv(
    SUBM + 'dogBreedIdentification_simple.gz',
    index=False,
    compression='gzip',
)

![Submission](kaggle_submission.png)