In [15]:
import os
import pandas as pd
import numpy as np

# Using Skicit-learn to split data into training and testing sets 
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

In [2]:
train_fp = os.path.join(os.getcwd(),'aux_data','trainset.csv')
features = pd.read_csv(train_fp)
features.head(4)

Unnamed: 0.1,Unnamed: 0,geometry,x,y,r,g,b,nifr,iceplant
0,0,POINT (238534.77144037225 3810764.4391108337),238534.77144,3810764.0,89,84,89,144,1
1,1,POINT (238516.67475820333 3810768.371394962),238516.674758,3810768.0,100,88,93,148,1
2,2,POINT (238496.0413784355 3810771.828721741),238496.041378,3810772.0,97,93,96,140,1
3,3,POINT (238497.93042821612 3810771.2369597284),238497.930428,3810771.0,87,80,86,137,1


In [4]:
features.drop(['geometry','Unnamed: 0'],axis=1,inplace=True)

In [3]:
# Descriptive statistics for each column 
features.describe()

Unnamed: 0.1,Unnamed: 0,x,y,r,g,b,nifr,iceplant
count,224.0,224.0,224.0,224.0,224.0,224.0,224.0,224.0
mean,111.5,238829.471487,3811296.0,84.09375,91.1875,88.955357,146.495536,0.330357
std,64.807407,439.264247,488.0251,32.576714,28.561485,21.804014,36.055015,0.471395
min,0.0,238336.841599,3810763.0,29.0,42.0,59.0,40.0,0.0
25%,55.75,238517.19807,3810854.0,64.0,76.0,75.0,126.0,0.0
50%,111.5,238615.666834,3811142.0,81.0,81.5,83.5,148.0,0.0
75%,167.25,239273.141856,3811881.0,91.25,100.0,96.0,170.0,1.0
max,223.0,239970.680835,3812162.0,196.0,192.0,180.0,213.0,1.0


In [10]:
# Labels are the values we want to predict
labels = np.array(features['iceplant'])

# Remove the labels from the features
features= features.drop('iceplant', axis = 1)
features_names = list(features.columns)

#Convert to numpy array
features = np.array(features)

array([[2.38534771e+05, 3.81076444e+06, 8.90000000e+01, 8.40000000e+01,
        8.90000000e+01, 1.44000000e+02],
       [2.38516675e+05, 3.81076837e+06, 1.00000000e+02, 8.80000000e+01,
        9.30000000e+01, 1.48000000e+02],
       [2.38496041e+05, 3.81077183e+06, 9.70000000e+01, 9.30000000e+01,
        9.60000000e+01, 1.40000000e+02],
       ...,
       [2.38340222e+05, 3.81085501e+06, 8.40000000e+01, 8.10000000e+01,
        8.80000000e+01, 1.33000000e+02],
       [2.39084641e+05, 3.81179102e+06, 1.07000000e+02, 9.70000000e+01,
        9.40000000e+01, 1.57000000e+02],
       [2.38527326e+05, 3.81082981e+06, 9.30000000e+01, 8.30000000e+01,
        8.40000000e+01, 1.47000000e+02]])

In [13]:
# Split the data into training and testing sets
train_features, test_features, train_labels, test_labels = train_test_split(features, 
                                                                            labels, 
                                                                            test_size = 0.3, 
                                                                            random_state = 42)

In [14]:
print('Training Features Shape:', train_features.shape) 
print('Training Labels Shape:', train_labels.shape) 
print('Testing Features Shape:', test_features.shape) 
print('Testing Labels Shape:', test_labels.shape)

Training Features Shape: (156, 6)
Training Labels Shape: (156,)
Testing Features Shape: (68, 6)
Testing Labels Shape: (68,)


In [16]:
## ----- COME UP WITH SOME BASELINE ERROR ------

In [16]:
# Instantiate model with 1000 decision trees
rf = RandomForestRegressor(n_estimators = 1000, random_state = 42)
# Train the model on training data 
rf.fit(train_features, train_labels) 

RandomForestRegressor(n_estimators=1000, random_state=42)

In [25]:
# Use the forest's predict method on the test data 
predictions = rf.predict(test_features)
# Calculate the absolute errors
errors = abs(predictions - test_labels)
# Print out the mean absolute error (mae)
print('Mean Absolute Error:', round(np.mean(errors), 2))

Mean Absolute Error: 0.07


In [21]:
# Calculate mean absolute percentage error (MAPE) 
mape = 100 * ( (errors) / (test_labels))
# Calculate and display accuracy
accuracy = 100 - np.mean(mape) 
print('Accuracy:', round(accuracy, 2), '%.')

Accuracy: 99.31 %.


In [23]:
predictions

array([1.   , 0.   , 0.43 , 1.   , 0.948, 0.017, 0.   , 0.   , 0.   ,
       0.33 , 0.001, 1.   , 0.002, 0.012, 0.   , 0.   , 0.   , 0.   ,
       0.481, 0.   , 0.003, 1.   , 0.046, 1.   , 1.   , 0.998, 0.001,
       0.   , 0.   , 0.109, 0.964, 0.   , 0.998, 0.05 , 0.997, 0.21 ,
       0.961, 0.294, 0.008, 0.187, 0.001, 0.991, 0.049, 0.828, 0.996,
       0.   , 0.001, 0.008, 0.   , 0.1  , 0.   , 0.507, 0.082, 0.366,
       0.003, 1.   , 0.051, 0.003, 0.   , 0.991, 0.   , 0.999, 0.26 ,
       0.978, 0.124, 0.996, 0.999, 0.001])

In [24]:
errors

array([0.   , 0.   , 0.43 , 0.   , 0.052, 0.017, 0.   , 0.   , 0.   ,
       0.33 , 0.001, 0.   , 0.002, 0.012, 0.   , 0.   , 0.   , 0.   ,
       0.519, 0.   , 0.003, 0.   , 0.046, 0.   , 0.   , 0.002, 0.001,
       0.   , 0.   , 0.109, 0.036, 0.   , 0.002, 0.05 , 0.003, 0.21 ,
       0.039, 0.294, 0.008, 0.187, 0.001, 0.009, 0.049, 0.828, 0.004,
       0.   , 0.001, 0.008, 0.   , 0.1  , 0.   , 0.493, 0.082, 0.366,
       0.003, 0.   , 0.051, 0.003, 0.   , 0.009, 0.   , 0.001, 0.26 ,
       0.022, 0.124, 0.004, 0.001, 0.001])