In [34]:
import os
import pandas as pd
import numpy as np

# Using Skicit-learn to split data into training and testing sets 
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import confusion_matrix



In [2]:
train_fp = os.path.join(os.getcwd(),'aux_data','trainset.csv')
features_df = pd.read_csv(train_fp)
features_df.head(4)

Unnamed: 0.1,Unnamed: 0,geometry,x,y,r,g,b,nifr,iceplant
0,0,POINT (238534.77144037225 3810764.4391108337),238534.77144,3810764.0,89,84,89,144,1
1,1,POINT (238516.67475820333 3810768.371394962),238516.674758,3810768.0,100,88,93,148,1
2,2,POINT (238496.0413784355 3810771.828721741),238496.041378,3810772.0,97,93,96,140,1
3,3,POINT (238497.93042821612 3810771.2369597284),238497.930428,3810771.0,87,80,86,137,1


In [3]:
features_df.drop(['geometry','Unnamed: 0','x','y'],axis=1,inplace=True)

In [4]:
# Descriptive statistics for each column 
features_df.describe()

Unnamed: 0,r,g,b,nifr,iceplant
count,224.0,224.0,224.0,224.0,224.0
mean,84.09375,91.1875,88.955357,146.495536,0.330357
std,32.576714,28.561485,21.804014,36.055015,0.471395
min,29.0,42.0,59.0,40.0,0.0
25%,64.0,76.0,75.0,126.0,0.0
50%,81.0,81.5,83.5,148.0,0.0
75%,91.25,100.0,96.0,170.0,1.0
max,196.0,192.0,180.0,213.0,1.0


In [26]:
# Labels are the values we want to predict
labels = np.array(features_df['iceplant'])

# Remove the labels from the features
features= features_df.drop('iceplant', axis = 1)
features_names = list(features.columns)

#Convert to numpy array
features = np.array(features)

In [27]:
features_names

['r', 'g', 'b', 'nifr']

In [6]:
# Split the data into training and testing sets
train_features, test_features, train_labels, test_labels = train_test_split(features, 
                                                                            labels, 
                                                                            test_size = 0.3, 
                                                                            random_state = 42)

In [7]:
print('Training Features Shape:', train_features.shape) 
print('Training Labels Shape:', train_labels.shape) 
print('Testing Features Shape:', test_features.shape) 
print('Testing Labels Shape:', test_labels.shape)

Training Features Shape: (156, 4)
Training Labels Shape: (156,)
Testing Features Shape: (68, 4)
Testing Labels Shape: (68,)


In [9]:
# Instantiate model with 1000 decision trees
rf = RandomForestRegressor(n_estimators = 100, 
                           random_state = 42)
# Train the model on training data 
rf.fit(train_features, train_labels) 

RandomForestRegressor(random_state=42)

In [10]:
# Use the forest's predict method on the test data 
predictions = rf.predict(test_features)
# Calculate the absolute errors
errors = abs(predictions - test_labels)
# Print out the mean absolute error (mae)
print('Mean Absolute Error:', round(np.mean(errors), 2))

Mean Absolute Error: 0.09


In [11]:
# Calculate mean absolute percentage error (MAPE) 
mape = 100 * ( (errors) / (test_labels))
# Calculate and display accuracy
accuracy = 100 - np.mean(mape) 
print('Accuracy:', round(accuracy, 2), '%.')

Accuracy: nan %.


  mape = 100 * ( (errors) / (test_labels))
  mape = 100 * ( (errors) / (test_labels))


In [12]:
predictions

array([1.  , 0.  , 0.34, 1.  , 0.91, 0.05, 0.  , 0.  , 0.  , 0.68, 0.02,
       1.  , 0.01, 0.06, 0.  , 0.  , 0.  , 0.  , 0.33, 0.01, 0.06, 1.  ,
       0.  , 1.  , 1.  , 0.96, 0.  , 0.  , 0.  , 0.81, 0.95, 0.  , 1.  ,
       0.02, 0.95, 0.  , 0.5 , 0.03, 0.  , 0.21, 0.03, 1.  , 0.03, 0.47,
       0.91, 0.  , 0.01, 0.  , 0.  , 0.07, 0.  , 0.32, 0.56, 0.13, 0.05,
       1.  , 0.38, 0.  , 0.  , 0.95, 0.  , 0.99, 0.15, 1.  , 0.  , 0.99,
       1.  , 0.  ])

In [13]:
errors

array([0.  , 0.  , 0.34, 0.  , 0.09, 0.05, 0.  , 0.  , 0.  , 0.68, 0.02,
       0.  , 0.01, 0.06, 0.  , 0.  , 0.  , 0.  , 0.67, 0.01, 0.06, 0.  ,
       0.  , 0.  , 0.  , 0.04, 0.  , 0.  , 0.  , 0.81, 0.05, 0.  , 0.  ,
       0.02, 0.05, 0.  , 0.5 , 0.03, 0.  , 0.21, 0.03, 0.  , 0.03, 0.47,
       0.09, 0.  , 0.01, 0.  , 0.  , 0.07, 0.  , 0.68, 0.56, 0.13, 0.05,
       0.  , 0.38, 0.  , 0.  , 0.05, 0.  , 0.01, 0.15, 0.  , 0.  , 0.01,
       0.  , 0.  ])

In [31]:
test = pd.DataFrame(test_features, columns=features_names)
test['iceplant']=test_labels
test

Unnamed: 0,r,g,b,nifr,iceplant
0,82,77,82,144,1
1,142,129,130,145,0
2,71,93,75,164,0
3,82,80,82,151,1
4,84,81,88,133,1
...,...,...,...,...,...
63,75,74,72,167,1
64,31,47,59,113,0
65,86,77,77,161,1
66,78,72,79,144,1


In [52]:
thresh = 0.6
test['predictions']= predictions
test['classified_as'] = test.predictions.apply(lambda x : 1 if x >thresh else 0)
test

Unnamed: 0,r,g,b,nifr,iceplant,predictions,classified_as
0,82,77,82,144,1,1.00,1
1,142,129,130,145,0,0.00,0
2,71,93,75,164,0,0.34,0
3,82,80,82,151,1,1.00,1
4,84,81,88,133,1,0.91,1
...,...,...,...,...,...,...,...
63,75,74,72,167,1,1.00,1
64,31,47,59,113,0,0.00,0
65,86,77,77,161,1,0.99,1
66,78,72,79,144,1,1.00,1


In [53]:
N = test.shape[0]
results = confusion_matrix(test['iceplant'],test['classified_as'])

print('true negatives %', np.round(results[0,0]/N*100,2), 
      '    false positives %', np.round(results[0,1]/N*100,2))
print('false negatives %', np.round(results[1,0]/N*100,2), '    true positives %', np.round(results[1,1]/N*100,2))

true negatives % 64.71     false positives % 2.94
false negatives % 4.41     true positives % 27.94
