# Data Exploration

In this notebook, we explore the image data from the Kaggle competition [State Farm Distracted Driver Detection](https://www.kaggle.com/c/state-farm-distracted-driver-detection/data).

The dataset contains three files:
- `imgs.zip`: zipped folder of all (train/test) images
- `sample_submission.csv`: a sample submission file in the correct format
- `driver_imgs_list.csv`: a list of training images, their subject (driver) id, and class id

We first unzip the file `imgs.zip` to obtain the raw image data.

In [1]:
import zipfile
import os
from tqdm import tqdm

# Extract the archive only when the image folders are not already on disk.
# The existence check comes first, so the archive is not opened at all when
# both folders are present (a re-run then works even if `imgs.zip` is gone).
if os.path.exists('train') and os.path.exists('test'):
    print('The dataset has been unzipped.')
else:
    with zipfile.ZipFile('imgs.zip', 'r') as f:
        # Iterate over the members directly instead of re-indexing
        # f.infolist() on every step; tqdm still reports per-file progress.
        for member in tqdm(f.infolist()):
            f.extract(member)

100%|██████████| 102162/102162 [10:26<00:00, 163.17it/s]


We can use `driver_imgs_list.csv` to check that all training files were unzipped.

In [23]:
import pandas as pd
import numpy as np

# Load the index of training images; the code below uses its `classname`
# and `img` columns.
imgs_list = pd.read_csv('driver_imgs_list.csv')

# Build every expected path in one vectorized step rather than a Python loop
# with two row lookups per image.
paths_list = ('train/' + imgs_list['classname'] + '/' + imgs_list['img']).tolist()

# Keep only the expected paths that do not exist on disk.
miss_files = [path for path in paths_list if not os.path.exists(path)]

if len(miss_files) == 0:
    print('No file is missing. There are {} training images.'.format(len(imgs_list)))
else:
    print('The following files are missing:\n')
    print(miss_files)

No file is missing. There are 22424 training images.


In [29]:
def get_paths(root):
    """Return the paths of all files found under `root`, recursively.

    The tree is traversed with `os.walk`, and each file name is joined to the
    directory that contains it. Directories themselves are not included.
    """
    return [
        os.path.join(folder, filename)
        for folder, _, filenames in os.walk(root)
        for filename in filenames
    ]

In [30]:
# Gather every file under the `test` folder and report how many were found.
test_paths = get_paths('test')
num_test_images = len(test_paths)
print('There are {} images in the test set.'.format(num_test_images))

There are 79726 images in the test set.


In [22]:
# Cross-check the CSV index against the files actually on disk. The file list
# is computed here rather than read from a `paths` variable assigned in a
# different cell, so the check also works on a fresh kernel.
train_paths = get_paths('train')
if len(imgs_list) == len(train_paths):
    print('There are {} training images.'.format(len(imgs_list)))
else:
    # Report the discrepancy instead of silently printing nothing.
    print('Mismatch: {} images listed in the CSV but {} files found under train/.'.format(
        len(imgs_list), len(train_paths)))

There are 22424 training images.


In [25]:
# Distinct driver ids found in the `subject` column (np.unique returns them sorted).
subj = np.unique(imgs_list['subject'])
print('There are {} drivers.'.format(subj.size))
print(subj)

There are 26 drivers.
['p002' 'p012' 'p014' 'p015' 'p016' 'p021' 'p022' 'p024' 'p026' 'p035'
 'p039' 'p041' 'p042' 'p045' 'p047' 'p049' 'p050' 'p051' 'p052' 'p056'
 'p061' 'p064' 'p066' 'p072' 'p075' 'p081']


In [36]:
# Number of images per driver, computed in a single pass. `value_counts`
# avoids re-filtering the whole frame once per driver, and `sort_index`
# keeps the keys in sorted order, matching iteration over np.unique.
subj_count = imgs_list['subject'].value_counts().sort_index().to_dict()

print(subj_count)

{'p012': 823, 'p026': 1196, 'p052': 740, 'p081': 823, 'p049': 1011, 'p047': 835, 'p072': 346, 'p016': 1078, 'p045': 724, 'p021': 1237, 'p064': 820, 'p051': 920, 'p066': 1034, 'p056': 794, 'p002': 725, 'p024': 1226, 'p014': 876, 'p022': 1233, 'p042': 591, 'p075': 814, 'p039': 651, 'p041': 605, 'p050': 790, 'p015': 875, 'p061': 809, 'p035': 848}


In [31]:
# Distinct class labels found in the `classname` column (np.unique returns them sorted).
cls_name = np.unique(imgs_list['classname'])
print('There are {} classes.'.format(cls_name.size))
print(cls_name)

There are 10 classes.
['c0' 'c1' 'c2' 'c3' 'c4' 'c5' 'c6' 'c7' 'c8' 'c9']


In [35]:
# Number of images per class, computed in a single pass. `value_counts`
# avoids re-filtering the whole frame once per class, and `sort_index`
# keeps the keys in sorted order, matching iteration over np.unique.
cls_count = imgs_list['classname'].value_counts().sort_index().to_dict()

print(cls_count)

{'c0': 2489, 'c5': 2312, 'c7': 2002, 'c3': 2346, 'c1': 2267, 'c8': 1911, 'c4': 2326, 'c6': 2325, 'c9': 2129, 'c2': 2317}


In [5]:
import matplotlib.pyplot as plt


Unnamed: 0,subject,classname,img
0,p002,c0,img_44733.jpg
1,p002,c0,img_72999.jpg
2,p002,c0,img_25094.jpg
3,p002,c0,img_69092.jpg
4,p002,c0,img_92629.jpg


In [6]:
# Attach a running integer id (0, 1, 2, ...) as an `id` column, then preview
# the first rows of the frame.
n_rows = len(imgs_list)
imgs_list['id'] = range(n_rows)
imgs_list.head()

Unnamed: 0,subject,classname,img,id
0,p002,c0,img_44733.jpg,0
1,p002,c0,img_72999.jpg,1
2,p002,c0,img_25094.jpg,2
3,p002,c0,img_69092.jpg,3
4,p002,c0,img_92629.jpg,4


In [7]:
# Sorted array of the distinct driver ids; displayed as the cell output.
subj = np.unique(imgs_list.loc[:, 'subject'])
subj

array(['p002', 'p012', 'p014', 'p015', 'p016', 'p021', 'p022', 'p024',
       'p026', 'p035', 'p039', 'p041', 'p042', 'p045', 'p047', 'p049',
       'p050', 'p051', 'p052', 'p056', 'p061', 'p064', 'p066', 'p072',
       'p075', 'p081'], dtype=object)

In [8]:
# Sorted array of the distinct class labels; displayed as the cell output.
cls = np.unique(imgs_list.loc[:, 'classname'])
cls

array(['c0', 'c1', 'c2', 'c3', 'c4', 'c5', 'c6', 'c7', 'c8', 'c9'], dtype=object)

In [9]:
# Number of rows in the training index.
imgs_list.shape[0]

22424

In [12]:
# Driver id recorded for the first row of the index.
imgs_list['subject'].iloc[0]

'p002'

In [13]:
# Expected on-disk path of every training image: train/<classname>/<img>.
# Each row contributes its own class folder and file name; indexing row 0
# inside a loop would instead repeat the first image's path for every row
# and make any later existence check meaningless.
paths_list = ('train/' + imgs_list['classname'] + '/' + imgs_list['img']).tolist()
paths_list[0]

'train/c0/img_44733.jpg'

In [14]:
# Print each expected path that is absent from disk; nothing is printed when
# every file exists.
for path in paths_list:
    if os.path.exists(path):
        continue
    print(path)

In [15]:
# Names of the immediate entries of the training folder; displayed as the
# cell output.
train_root = 'train'
paths = os.listdir(train_root)
paths

['c0', 'c1', 'c2', 'c3', 'c4', 'c5', 'c6', 'c7', 'c8', 'c9']

In [17]:
# Collect every file under `train` with the shared `get_paths` helper rather
# than repeating its os.walk loop inline, then show how many files were found.
paths = get_paths('train')
len(paths)

22424