<a href="https://colab.research.google.com/github/MudSnail/Land_Cover_Classification/blob/main/Basic_Pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [18]:
#import base libraries
import pandas as pd
import numpy as np
import copy
import pickle
import cloudpickle

#Sklearn - Pipelines, Training
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_selection import SelectKBest
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler, RobustScaler

#Classification Models
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

#Model metrics
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score, roc_auc_score, precision_score, recall_score

#Visualizations
import matplotlib.pyplot as plt
import seaborn as sns

#misc
import warnings
warnings.filterwarnings('ignore')

In [2]:
#Connect to drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Input Data, Combine and Select

In [3]:
#Read in CSVs
james_bay = pd.read_csv('/content/drive/MyDrive/james_bay_data.csv')
labrador = pd.read_csv('/content/drive/MyDrive/labrador_data.csv')
simcoe = pd.read_csv('/content/drive/MyDrive/simcoe_york_data.csv')

In [4]:
#List dataframes to add to james_bay
dfs = [labrador, simcoe]

#Merged data
merged_data = james_bay.append(dfs)

#Print shape and check new data
print(merged_data.shape)
merged_data.head()

(8917500, 13)


Unnamed: 0,B01,B06,B11,B07,B05,B03,B09,B12,B8A,B04,B08,B02,Class
0,0.0055,0.0017,0.0009,0.0016,0.0026,0.0056,0.0001,0.0009,0.0003,0.0032,0.0012,0.0114,0
1,0.0054,0.002,0.0009,0.0017,0.0021,0.006,0.0001,0.0005,0.0007,0.0028,0.0014,0.0115,0
2,0.0058,0.0018,0.0007,0.0016,0.0028,0.006,0.0001,0.0008,0.0007,0.003,0.0012,0.0124,0
3,0.0057,0.0016,0.0007,0.0016,0.0023,0.0058,0.0001,0.0012,0.0006,0.0031,0.0011,0.0119,0
4,0.0066,0.0013,0.0008,0.002,0.0029,0.0059,0.0001,0.0008,0.0005,0.003,0.0014,0.0115,0


In [5]:
#Check classifications values
print(merged_data.Class.value_counts())

15    2836714
18    1050988
1      677877
5      666952
16     646870
13     612009
8      575175
17     512930
12     415493
6      378274
2      151560
11     135890
14      94850
10      86338
19      74619
0         961
Name: Class, dtype: int64


In [6]:
#Drop Class = 0 (null values, which equate to 18)
merged_data = merged_data[merged_data.Class != 0]

#Double check value counts
merged_data.Class.value_counts()

15    2836714
18    1050988
1      677877
5      666952
16     646870
13     612009
8      575175
17     512930
12     415493
6      378274
2      151560
11     135890
14      94850
10      86338
19      74619
Name: Class, dtype: int64

In [7]:
#SubSample ~25 000 for each class
sample = merged_data.groupby('Class').apply(lambda x: x.sample(n=25000)).reset_index(drop=True)
sample.head(4)

Unnamed: 0,B01,B06,B11,B07,B05,B03,B09,B12,B8A,B04,B08,B02,Class
0,0.0161,0.1176,0.1007,0.1317,0.0671,0.0348,0.1473,0.0584,0.1435,0.0304,0.1356,0.0247,1
1,0.0153,0.1464,0.139,0.1667,0.0786,0.0385,0.1898,0.0769,0.1862,0.0343,0.1761,0.024,1
2,0.0104,0.1376,0.0951,0.1566,0.0635,0.0334,0.1681,0.0486,0.1705,0.021,0.1623,0.0171,1
3,0.013,0.0377,0.0287,0.0409,0.0235,0.0164,0.0806,0.0174,0.0456,0.0153,0.0429,0.0101,1


In [8]:
#Check basic stats
sample.describe()

Unnamed: 0,B01,B06,B11,B07,B05,B03,B09,B12,B8A,B04,B08,B02,Class
count,375000.0,375000.0,375000.0,375000.0,375000.0,375000.0,375000.0,375000.0,375000.0,375000.0,375000.0,375000.0,375000.0
mean,0.065909,0.183643,0.182858,0.201788,0.128877,0.088892,0.229219,0.122685,0.222897,0.091266,0.220215,0.072123,11.133333
std,0.131255,0.124614,0.098111,0.122319,0.131217,0.132804,0.126683,0.089086,0.121731,0.134426,0.127393,0.135234,5.512014
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
25%,0.0156,0.1348,0.1269,0.1508,0.0786,0.0392,0.1776,0.0674,0.1705,0.0334,0.1646,0.02,6.0
50%,0.0257,0.1668,0.1759,0.1876,0.1029,0.0527,0.2182,0.1034,0.2131,0.0553,0.2062,0.0339,12.0
75%,0.0514,0.205,0.2344,0.2315,0.1294,0.0768,0.2621,0.1597,0.2616,0.088,0.2576,0.0579,16.0
max,1.7526,1.8148,1.6362,1.8313,1.8079,1.8731,1.6274,1.6317,1.8302,1.8301,1.8213,1.9337,19.0


# Split data

In [9]:
#Set x and y
y = sample['Class']
X = sample.drop('Class', axis = 1)

#Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state = 42)

In [10]:
#check shape
X_train.shape, y_train.shape

((262500, 12), (262500,))

# Create Pipelines

In [11]:
#Functions for Pipeline
def add_layers(df):
  """
  This function takes in a dataframe and calculates the NDVI, Moisture Index, NDWI and NDSI
  Outputs = dataframe with added layer columns 
  """
  #Create NDVI column (B08-B04)/(B08+B04)
  df['NDVI'] = (df.B08 - df.B04)/(df.B08 + df.B04)
  #Create Moisture index (B8A-B11)/(B8A+B11)
  df['Moisture'] = (df.B8A - df.B11)/(df.B8A + df.B11)
  #Create NDWI (B3-B8)/(B3+B8)
  df['NDWI'] = (df.B03 - df.B08)/(df.B03 + df.B08)
  #create NDSI (B3-B11)/(B3+B11)
  df['NDSI'] = (df.B03 - df.B11)/(df.B03 + df.B11)

  return df

def replace_values(df):
  """
  This function replaces the infinity values with Nan then replaces that with new infinity values
  """
  #Replace infinity values with Nan
  df.replace([np.inf, -np.inf], np.NAN, inplace=True)

  #Fill in null values
  df.fillna(999, inplace=True)

  return df


#Define object transformer class
class objectTransformer:
    """
    Class object transformer, takes in function to apply in preprocessing step of Pipeline
    """
    def __init__(self, func):
        self.func = func
        
    def fit(self, X, y=None, **fit_params):
        return self
    
    def transform(self, X, **fit_params):
        return self.func(X)

## Test PCA and Feature Union

In [13]:
#Create Pre-Processing Pipeline
preprocess = Pipeline([("add_layers", objectTransformer(add_layers)),
                       ("replace_values", objectTransformer(replace_values))])

#Create Feature Union
feature_union = FeatureUnion([("PCA", PCA(n_components=6)),
                              ("kKBest", SelectKBest(k=2))])


In [None]:
#Define pipelines
model_1 = Pipeline([('preprocessing', preprocess),
                    ('RFC', RandomForestClassifier(random_state = 123))])

model_2 = Pipeline([('preprocessing', preprocess),
                    ('features', feature_union),
                    ('RFC', RandomForestClassifier(random_state = 123))])

model_3 = Pipeline([('preprocessing', preprocess),
                    ('pca', PCA(n_components=6)),
                    ('RFC', RandomForestClassifier(random_state = 123))])

In [None]:
#fit pipes to check which is best
pipes = [model_1, model_2, model_3]

for pipe in pipes:
  pipe.fit(X_train, y_train)

#compare accuracies 
#make dictionary
pipe_dict = {0: 'No Feature Union', 1: 'With Feature Union', 2: 'Only PCA'}

#create for loop to find scores
for i, model in enumerate(pipes):
    print('{} Test Accuracy: {}'.format(pipe_dict[i], model.score(X_test, y_test)))

No Feature Union Test Accuracy: 0.6150577777777778
With Feature Union Test Accuracy: 0.5676888888888889
Only PCA Test Accuracy: 0.5371822222222222


## Compare SVC, Random Forest, XGBoost

In [13]:
#Compare three model types
svc_pipe = Pipeline([('preprocessing', preprocess),
                    ('SVC', SVC(random_state = 123))])

rfc_pipe = Pipeline([('preprocessing', preprocess),
                    ('RFC', RandomForestClassifier(random_state = 123))])

xgb_pipe = Pipeline([('preprocessing', preprocess),
                    ('XGB', XGBClassifier(random_state = 123))])

In [14]:
#fit pipes to check which is best
pipes = [svc_pipe, rfc_pipe, xgb_pipe]

for pipe in pipes:
  pipe.fit(X_train, y_train)

#compare accuracies 
#make dictionary
pipe_dict = {0: 'SVC', 1: 'Random Forest', 2: 'XGBoost'}

#create for loop to find scores
for i, model in enumerate(pipes):
    print('{} Test Accuracy: {}'.format(pipe_dict[i], model.score(X_test, y_test)))

SVC Test Accuracy: 0.39609777777777777
Random Forest Test Accuracy: 0.6154133333333334
XGBoost Test Accuracy: 0.5536622222222222


## Pickle base random forest for trial on AISC data

In [15]:
#Instantiate pipeline
rfc_model = Pipeline([('preprocessing', preprocess),
                    ('RFC', RandomForestClassifier(random_state = 123))])

#Fit model
rfc_model.fit(X_train, y_train)

#Double check accuracy
rfc_model.score(X_test, y_test)

0.6182311111111111

In [16]:
#Instantiate pipeline
rfc = Pipeline([('RFC', RandomForestClassifier(random_state = 123))])

#Fit model
rfc.fit(X_train, y_train)

#Double check accuracy
rfc.score(X_test, y_test)

0.6182311111111111

In [19]:
#Save as pickle file
pickle.dump(rfc, open('/content/drive/MyDrive/classifier.pickle', 'wb'))

In [20]:
#Load pickle
with open('/content/drive/MyDrive/classifier.pickle', 'rb') as f:
  classifier = pickle.load(f)

classifier.score(X_test, y_test)

0.6182311111111111