# Camera Trap to Random Forests in Python

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import json
from datetime import datetime as dt

### CSV

Bring in the CSV data. The data is also available in JSON, which may actually be the most intuitive and efficient structure to unpack from in the long run.  

In [2]:
csv_df = pd.read_csv('snowcam_training.csv')

In [3]:
csv_df.head()

Unnamed: 0,filename,file_size,file_attributes,region_count,region_id,region_shape_attributes,region_attributes
0,invert_Hbwtr_w1_20180818_120445.JPG,177162,"{""attribute"":{}}",21,0,"{""name"":""rect"",""x"":1665,""y"":1571,""width"":119,""...","{""attribute"":{""leaf_green"":true}}"
1,invert_Hbwtr_w1_20180818_120445.JPG,177162,"{""attribute"":{}}",21,1,"{""name"":""rect"",""x"":1925,""y"":1532,""width"":84,""h...","{""attribute"":{""leaf_green"":true}}"
2,invert_Hbwtr_w1_20180818_120445.JPG,177162,"{""attribute"":{}}",21,2,"{""name"":""rect"",""x"":2252,""y"":1504,""width"":88,""h...","{""attribute"":{""leaf_green"":true}}"
3,invert_Hbwtr_w1_20180818_120445.JPG,177162,"{""attribute"":{}}",21,3,"{""name"":""rect"",""x"":2157,""y"":1486,""width"":88,""h...","{""attribute"":{""leaf_sub"":true}}"
4,invert_Hbwtr_w1_20180818_120445.JPG,177162,"{""attribute"":{}}",21,4,"{""name"":""rect"",""x"":1686,""y"":1433,""width"":49,""h...","{""attribute"":{""leaf_green"":true}}"


<hr>

### Wrangle the Data

unpack 'region_attributes' column data to produce **class** variable

In [4]:
# image_class is a list- built to accept the
# modified 'region_attribute' data  
image_class = []

# created by the for loop below
for i in range(len(csv_df['region_attributes'])):
    
    # convert the strings of json data into python dictionaries
    json_item = json.loads(csv_df['region_attributes'][i])
    
    # if data is not empty dictionary
    if json_item:
        
        # extract the name of the key, as a string in a list
        image_type = list(json_item['attribute'].keys())
        
        # if dictionary entry is not empty list
        if image_type:
            image_type = image_type[0]

        # leave None for empty values
        else:
            image_type = None
    else:
        image_type = None
        
    # add value to image_class
    image_class.append(image_type)

# make image_class a column of our dataframe
csv_df['class'] = image_class

unpack 'region_shape' column data to produce **name** , **x** coordinate, **y** coordinate, **width** and **height** data

In [5]:
# unpack 'region_shape_attributes' into string of desired data
# leaving 'None' for empty values

# create dictionary to accept region shape data
region_shapes = {
    'name': [],
    'x': [],
    'y': [],
    'width': [],
    'height': [],
    }

# loop thru region attribute data
for i in range(len(csv_df['region_shape_attributes'])):
    # convert json strings to python dictionaries
    json_item = json.loads(csv_df['region_shape_attributes'][i])
    
    # loop thru the dictionary keys
    for attr in region_shapes.keys():
        # if JSON item contains data
        if json_item:
            # pull data item
            region_attr = json_item[attr]
            # append to that attribute's list in dictionary
            region_shapes[attr].append(region_attr)
        else:
            # otherwise, append None to appropriate list
            region_shapes[attr].append(None)
    
# create columns of this data in dataframe
for variable in region_shapes.keys():
    csv_df[variable] = region_shapes[variable]

extract data frome the file name to produce **system**, **watershed**, **date**, and **pic_id** data

In [6]:
# dictionary of filename data
filename_data = {
    'system': [],
    'watershed': [],
    'date': [],
    'pic_id': [],
}

# loop through filename string
for i in range(len(csv_df['filename'])):
    # split contents by underscore
    file_items = csv_df['filename'][i].split('_')

    # add system column for 'hbwtr'
    filename_data['system'].append(file_items[1])
    # add watershed olumn, 'w1', 'w2', etc.
    filename_data['watershed'].append(file_items[2])
    # add date column
    filename_data['date'].append(
                                # modify integer date to date format, MM/DD/YYYY
                                dt.strptime(file_items[3], '%Y%m%d').strftime('%m/%d/%Y')
                                )
    # add picture id number
    filename_data['pic_id'].append(file_items[4])

# create columns of this data in dataframe
for variable in filename_data.keys():
    csv_df[variable] = filename_data[variable]

adding calculated variables, for now, **area** and **aspect ratio**

In [7]:
# create polygon_area column
calculations = {
    'area': [],
    'aspect_ratio': []
}

# loop through dataframe
for i in range(len(csv_df['filename'])):
    # params
    height = csv_df['height'][i]
    width = csv_df['width'][i]
    
    # calcs
    area =  height * width
    aspect_ratio = height/width
    
    calculations['area'].append(area)
    calculations['aspect_ratio'].append(aspect_ratio)
    
# create columns of this data in dataframe
for variable in calculations.keys():
    csv_df[variable] = calculations[variable]

In [8]:
# make list to reorder columns
cols = csv_df.columns.tolist()
new_cols = cols[:1] + cols[13:16] + cols[7:13] + cols[16:17] + cols[1:7] + cols[17:]

csv_df = csv_df[new_cols]

<div class="alert alert-info" role="alert">
  This dataframe now contains all original data, and many variables extracted or calculated from the original data contents. The rest of this notebook will manipulate this dataset further to feed it to a random forest model. Before this, the three cells directly below will contain commented code which will download three different versions of the CSV.
    <ol>
        <li> the full data, original and wrangled </li>
        <li> a slimmed dataset, containing all the original information, but with reducded redundancy</li>
        <li> a dataset removed of all categorical variables for easy use in the random forest model </li>
    </ol>
</div>

In [9]:
csv_df

Unnamed: 0,filename,system,watershed,date,class,name,x,y,width,height,pic_id,file_size,file_attributes,region_count,region_id,region_shape_attributes,region_attributes,area,aspect_ratio
0,invert_Hbwtr_w1_20180818_120445.JPG,Hbwtr,w1,08/18/2018,leaf_green,rect,1665.0,1571.0,119.0,46.0,120445.JPG,177162,"{""attribute"":{}}",21,0,"{""name"":""rect"",""x"":1665,""y"":1571,""width"":119,""...","{""attribute"":{""leaf_green"":true}}",5474.0,0.386555
1,invert_Hbwtr_w1_20180818_120445.JPG,Hbwtr,w1,08/18/2018,leaf_green,rect,1925.0,1532.0,84.0,63.0,120445.JPG,177162,"{""attribute"":{}}",21,1,"{""name"":""rect"",""x"":1925,""y"":1532,""width"":84,""h...","{""attribute"":{""leaf_green"":true}}",5292.0,0.750000
2,invert_Hbwtr_w1_20180818_120445.JPG,Hbwtr,w1,08/18/2018,leaf_green,rect,2252.0,1504.0,88.0,46.0,120445.JPG,177162,"{""attribute"":{}}",21,2,"{""name"":""rect"",""x"":2252,""y"":1504,""width"":88,""h...","{""attribute"":{""leaf_green"":true}}",4048.0,0.522727
3,invert_Hbwtr_w1_20180818_120445.JPG,Hbwtr,w1,08/18/2018,leaf_sub,rect,2157.0,1486.0,88.0,35.0,120445.JPG,177162,"{""attribute"":{}}",21,3,"{""name"":""rect"",""x"":2157,""y"":1486,""width"":88,""h...","{""attribute"":{""leaf_sub"":true}}",3080.0,0.397727
4,invert_Hbwtr_w1_20180818_120445.JPG,Hbwtr,w1,08/18/2018,leaf_green,rect,1686.0,1433.0,49.0,21.0,120445.JPG,177162,"{""attribute"":{}}",21,4,"{""name"":""rect"",""x"":1686,""y"":1433,""width"":49,""h...","{""attribute"":{""leaf_green"":true}}",1029.0,0.428571
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2325,invert_Hbwtr_w1_20181227_115803.JPG,Hbwtr,w1,12/27/2018,snow_o,rect,1778.0,1409.0,232.0,39.0,115803.JPG,146672,"{""attribute"":{}}",6,5,"{""name"":""rect"",""x"":1778,""y"":1409,""width"":232,""...","{""attribute"":{""snow_o"":true}}",9048.0,0.168103
2326,invert_Hbwtr_w1_20181228_120503.JPG,Hbwtr,w1,12/28/2018,,,,,,,120503.JPG,143378,"{""attribute"":{}}",0,0,{},{},,
2327,invert_Hbwtr_w1_20181229_120502.JPG,Hbwtr,w1,12/29/2018,,,,,,,120502.JPG,144076,"{""attribute"":{}}",0,0,{},{},,
2328,invert_Hbwtr_w1_20181230_120001.JPG,Hbwtr,w1,12/30/2018,,,,,,,120001.JPG,148215,"{""attribute"":{}}",0,0,{},{},,


In [10]:
# 1: Download Full Dataframe
csv_df.to_csv('camera_data_raw.csv')

In [11]:
# 2: Curated Dataframe (recommended)
csv_df = csv_df.drop(columns=['file_size', 'file_attributes', 'region_shape_attributes', 'region_attributes'])
csv_df.to_csv('camera_data.csv')

In [12]:
# 3: Random Forest prep dataframe
# drop all categorical data (except class, for now)
csv_df = csv_df.drop(
                columns=['filename', 'system', 'watershed', 
                         'date', 'name', 'pic_id',]
                )
# drop all None values
csv_df = csv_df.fillna(value=np.nan)
csv_df = csv_df.dropna()

<hr>

## Train Test Split

split up the data into training set and test set

In [13]:
from sklearn.model_selection import train_test_split

In [14]:
X = csv_df.drop('class',axis=1)
y = csv_df['class']

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30)

## Random Forest

In [19]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report,confusion_matrix

rfc = RandomForestClassifier(n_estimators=100)
rfc.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [17]:
rfc_pred = rfc.predict(X_test)

In [20]:
print(confusion_matrix(y_test,rfc_pred))

[[  5   1   0   0   0   0   5   1   0   0   0   1   3   0]
 [  1   2   0   0   0   0   2   3   0   0   0   2   2   0]
 [  0   0 106   5  31   2   4   1   0   0   0   4   1   0]
 [  0   0  17  27  19   0   0   0   0   1   0   0   0   0]
 [  0   0  33  12  79   1   7   0   0   0   1   0   0   0]
 [  0   0   1   0   2  13   8   0   0   0   0   4   0   0]
 [  0   1   4   0   7   1 120   7   0   0   0   5   2   0]
 [  0   0   0   0   0   0  12  14   0   0   0   1   0   0]
 [  0   0   0   0   0   0   9   0   0   0   0   0   1   0]
 [  0   0   0   0   1   1   0   0   0   7   0   0   0   0]
 [  0   0   3   1   2   1   2   0   0   0   1   3   1   0]
 [  0   0   2   0   2   2   0   0   1   0   0  38   6   0]
 [  0   0   1   0   1   0   1   0   0   0   0   8  18   0]
 [  0   0   0   0   0   0   0   0   0   0   0   0   2   0]]


In [21]:
print(classification_report(y_test,rfc_pred))

                  precision    recall  f1-score   support

           ice_o       0.83      0.31      0.45        16
           ice_t       0.50      0.17      0.25        12
       leaf_fall       0.63      0.69      0.66       154
      leaf_green       0.60      0.42      0.50        64
        leaf_sub       0.55      0.59      0.57       133
  not_submerged        0.62      0.46      0.53        28
      open_water       0.71      0.82      0.76       147
 open_water_dark       0.54      0.52      0.53        27
open_water_green       0.00      0.00      0.00        10
           other       0.88      0.78      0.82         9
          riffle       0.50      0.07      0.12        14
            rock       0.58      0.75      0.65        51
          snow_o       0.50      0.62      0.55        29
          snow_t       0.00      0.00      0.00         2

        accuracy                           0.62       696
       macro avg       0.53      0.44      0.46       696
    weighted

  _warn_prf(average, modifier, msg_start, len(result))
