<a href="https://colab.research.google.com/github/MudSnail/Land_Cover_Classification/blob/main/Basic_Pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#import base libraries
import pandas as pd
import numpy as np
import copy
import pickle
import cloudpickle

#Sklearn - Pipelines, Training
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_selection import SelectKBest
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler, RobustScaler

#Classification Models
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier


#Model metrics
from sklearn.metrics import accuracy_score, balanced_accuracy_score, classification_report, confusion_matrix, f1_score, roc_auc_score, precision_score, recall_score

#Visualizations
import matplotlib.pyplot as plt
import seaborn as sns

#misc
import warnings
warnings.filterwarnings('ignore')

In [2]:
#Connect to drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Input Data, Combine and Select

In [3]:
#Read in CSVs
james_bay = pd.read_csv('/content/drive/MyDrive/james_bay_data.csv')
labrador = pd.read_csv('/content/drive/MyDrive/labrador_data.csv')
simcoe = pd.read_csv('/content/drive/MyDrive/simcoe_york_data.csv')

In [4]:
#List dataframes to add to james_bay
dfs = [labrador, simcoe]

#Merged data
merged_data = james_bay.append(dfs)

#Print shape and check new data
print(merged_data.shape)
merged_data.head()

(8917500, 13)


Unnamed: 0,B01,B06,B11,B07,B05,B03,B09,B12,B8A,B04,B08,B02,Class
0,0.0055,0.0017,0.0009,0.0016,0.0026,0.0056,0.0001,0.0009,0.0003,0.0032,0.0012,0.0114,0
1,0.0054,0.002,0.0009,0.0017,0.0021,0.006,0.0001,0.0005,0.0007,0.0028,0.0014,0.0115,0
2,0.0058,0.0018,0.0007,0.0016,0.0028,0.006,0.0001,0.0008,0.0007,0.003,0.0012,0.0124,0
3,0.0057,0.0016,0.0007,0.0016,0.0023,0.0058,0.0001,0.0012,0.0006,0.0031,0.0011,0.0119,0
4,0.0066,0.0013,0.0008,0.002,0.0029,0.0059,0.0001,0.0008,0.0005,0.003,0.0014,0.0115,0


In [5]:
#Check classifications values
print(merged_data.Class.value_counts())

15    2836714
18    1050988
1      677877
5      666952
16     646870
13     612009
8      575175
17     512930
12     415493
6      378274
2      151560
11     135890
14      94850
10      86338
19      74619
0         961
Name: Class, dtype: int64


In [6]:
#Drop Class = 0 (null values, which equate to 18)
merged_data = merged_data[merged_data.Class != 0]

#Double check value counts
merged_data.Class.value_counts()

15    2836714
18    1050988
1      677877
5      666952
16     646870
13     612009
8      575175
17     512930
12     415493
6      378274
2      151560
11     135890
14      94850
10      86338
19      74619
Name: Class, dtype: int64

In [7]:
#SubSample ~25 000 for each class
sample = merged_data.groupby('Class').apply(lambda x: x.sample(n=25000)).reset_index(drop=True)
sample.head(4)

Unnamed: 0,B01,B06,B11,B07,B05,B03,B09,B12,B8A,B04,B08,B02,Class
0,0.007,0.1489,0.1105,0.1698,0.0726,0.0368,0.1884,0.0554,0.186,0.0277,0.1826,0.0184,1
1,0.022,0.1342,0.1121,0.1552,0.0678,0.0364,0.1849,0.0606,0.1715,0.0291,0.1619,0.024,1
2,0.0078,0.1041,0.081,0.1176,0.0499,0.026,0.1347,0.0421,0.1301,0.0186,0.1244,0.0157,1
3,0.0124,0.1512,0.1283,0.1726,0.0845,0.0367,0.1936,0.0707,0.1981,0.0384,0.1885,0.0216,1


# Split data

In [8]:
#Set x and y
y = sample['Class']
X = sample.drop('Class', axis = 1)

#Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state = 42)

In [9]:
#check shape
X_train.shape, y_train.shape

((262500, 12), (262500,))

# Create Pipelines

In [10]:
#Functions for Pipeline
def add_layers(df):
  """
  This function takes in a dataframe and calculates the NDVI, Moisture Index, NDWI and NDSI
  Outputs = dataframe with added layer columns 
  """
  #Create NDVI column (B08-B04)/(B08+B04)
  df['NDVI'] = (df.B08 - df.B04)/(df.B08 + df.B04)
  #Create Moisture index (B8A-B11)/(B8A+B11)
  df['Moisture'] = (df.B8A - df.B11)/(df.B8A + df.B11)
  #Create NDWI (B3-B8)/(B3+B8)
  df['NDWI'] = (df.B03 - df.B08)/(df.B03 + df.B08)
  #create NDSI (B3-B11)/(B3+B11)
  df['NDSI'] = (df.B03 - df.B11)/(df.B03 + df.B11)

  return df

def replace_values(df):
  """
  This function replaces the infinity values with Nan then replaces that with new infinity values
  """
  #Replace infinity values with Nan
  df.replace([np.inf, -np.inf], np.NAN, inplace=True)

  #Fill in null values
  df.fillna(999, inplace=True)

  return df


#Define object transformer class
class objectTransformer:
    """
    Class object transformer, takes in function to apply in preprocessing step of Pipeline
    """
    def __init__(self, func):
        self.func = func
        
    def fit(self, X, y=None, **fit_params):
        return self
    
    def transform(self, X, **fit_params):
        return self.func(X)

## Test PCA and Feature Union

In [11]:
#Create Pre-Processing Pipeline
preprocess = Pipeline([("add_layers", objectTransformer(add_layers)),
                       ("replace_values", objectTransformer(replace_values))])

#Create Feature Union
feature_union = FeatureUnion([("PCA", PCA(n_components=6)),
                              ("kKBest", SelectKBest(k=2))])


In [None]:
#Define pipelines
model_1 = Pipeline([('preprocessing', preprocess),
                    ('RFC', RandomForestClassifier(random_state = 123))])

model_2 = Pipeline([('preprocessing', preprocess),
                    ('features', feature_union),
                    ('RFC', RandomForestClassifier(random_state = 123))])

model_3 = Pipeline([('preprocessing', preprocess),
                    ('pca', PCA(n_components=6)),
                    ('RFC', RandomForestClassifier(random_state = 123))])

In [None]:
#fit pipes to check which is best
pipes = [model_1, model_2, model_3]

for pipe in pipes:
  pipe.fit(X_train, y_train)

#compare accuracies 
#make dictionary
pipe_dict = {0: 'No Feature Union', 1: 'With Feature Union', 2: 'Only PCA'}

#create for loop to find scores
for i, model in enumerate(pipes):
    print('{} Test Accuracy: {}'.format(pipe_dict[i], model.score(X_test, y_test)))

No Feature Union Test Accuracy: 0.6150577777777778
With Feature Union Test Accuracy: 0.5676888888888889
Only PCA Test Accuracy: 0.5371822222222222


## Compare SVC, Random Forest, XGBoost

In [13]:
#Compare three model types
svc_pipe = Pipeline([('preprocessing', preprocess),
                    ('SVC', SVC(random_state = 123))])

knn_pipe = Pipeline([('preprocessing', preprocess),
                    ('KNN', KNeighborsClassifier())])

rfc_pipe = Pipeline([('preprocessing', preprocess),
                    ('RFC', RandomForestClassifier(random_state = 123))])

xgb_pipe = Pipeline([('preprocessing', preprocess),
                    ('XGB', XGBClassifier(random_state = 123))])

gbc_pipe = Pipeline([('preprocessing', preprocess),
                    ('GB', GradientBoostingClassifier(random_state = 123))])

In [14]:
#fit pipes to check which is best
pipes = [svc_pipe, knn_pipe, rfc_pipe, xgb_pipe, gbc_pipe]

for pipe in pipes:
  pipe.fit(X_train, y_train)

#compare accuracies 
#make dictionary
pipe_dict = {0: 'SVC', 1: 'KNN', 2: 'Random Forest', 3: 'XGBoost', 4:'GradientBoost'}

#create for loop to find scores
for i, model in enumerate(pipes):
    print('{} Test Accuracy: {}'.format(pipe_dict[i], model.score(X_test, y_test)))

SVC Test Accuracy: 0.39123555555555556
KNN Test Accuracy: 0.5426755555555556
Random Forest Test Accuracy: 0.6166755555555555
XGBoost Test Accuracy: 0.5533155555555556
GradientBoost Test Accuracy: 0.5793244444444444


In [None]:
#create for loop to find balanced accuracy scores
for i, model in enumerate(pipes):
  y_pred = model.predict(X_test)
  print('{} Test Accuracy: {}'.format(pipe_dict[i], balanced_accuracy_score(y_test, y_pred)))

## Test scalers

In [None]:
#Define pipelines
stand = Pipeline([('preprocessing', preprocess),
                     ('scale', StandardScaler()),
                    ('RFC', RandomForestClassifier(random_state = 123))])

minmax = Pipeline([('preprocessing', preprocess),
                     ('scale', MinMaxScaler()),
                    ('RFC', RandomForestClassifier(random_state = 123))])

maxab = Pipeline([('preprocessing', preprocess),
                     ('scale', MaxAbsScaler()),
                    ('RFC', RandomForestClassifier(random_state = 123))])

robust = Pipeline([('preprocessing', preprocess),
                     ('scale', RobustScaler()),
                    ('RFC', RandomForestClassifier(random_state = 123))])

In [None]:
#fit pipes to check which is best
pipes = [stand, minmax, maxab, robust]

for pipe in pipes:
  pipe.fit(X_train, y_train)

#compare accuracies 
#make dictionary
pipe_dict = {0: 'StandardScaler', 1: 'MinMaxScaler', 2: 'MaxAbsScaler', 3: 'RobustScaler'}

#create for loop to find scores
for i, model in enumerate(pipes):
    print('{} Test Accuracy: {}'.format(pipe_dict[i], model.score(X_test, y_test)))

## Pickle base random forest for trial on AISC data

In [None]:
#Instantiate pipeline
rfc = Pipeline([('RFC', RandomForestClassifier(random_state = 123))])

#Fit model
rfc.fit(X_train, y_train)

#Double check accuracy
rfc.score(X_test, y_test)

0.6176711111111111

In [None]:
#Save as pickle file
pickle.dump(rfc, open('/content/drive/MyDrive/classifier.pickle', 'wb'))

In [None]:
#Load pickle
with open('/content/drive/MyDrive/classifier.pickle', 'rb') as f:
  classifier = pickle.load(f)

classifier.score(X_test, y_test)

0.6176711111111111

In [None]:
#Instantiate pipeline
rfc_model = Pipeline([('preprocessing', preprocess),
                    ('RFC', RandomForestClassifier(random_state = 123))])

#Fit model
rfc_model.fit(X_train, y_train)

#Double check accuracy
rfc_model.score(X_test, y_test)

0.6150844444444444

In [None]:
#Save as pickle file
pickle.dump(rfc_model, open('/content/drive/MyDrive/model.pickle', 'wb'))

In [None]:
#Load pickle
with open('/content/drive/MyDrive/model.pickle', 'rb') as p:
  model = pickle.load(p)

model.score(X_test, y_test)

0.6150844444444444