<a href="https://colab.research.google.com/github/MudSnail/Land_Cover_Classification/blob/main/Basic_Pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
#import base libraries
import pandas as pd
import numpy as np
import copy
import joblib
import cloudpickle

#Sklearn - Pipelines, Training
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_selection import SelectKBest
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler, RobustScaler

#Classification Models
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

#Model metrics
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score, roc_auc_score, precision_score, recall_score

#Visualizations
import matplotlib.pyplot as plt
import seaborn as sns

#misc
import warnings
warnings.filterwarnings('ignore')

In [2]:
#Mount Google Drive at /content/drive so the CSV files stored under MyDrive can be read.
#Mounting is interactive the first time (Colab asks for authorisation).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Input Data, Combine and Select

In [15]:
#Folder on the mounted drive that holds the three regional CSV exports.
#Change this single constant when the data lives somewhere else.
DATA_DIR = '/content/drive/MyDrive'

#Read one dataframe per study region
james_bay = pd.read_csv(f'{DATA_DIR}/james_bay_data.csv')
labrador = pd.read_csv(f'{DATA_DIR}/labrador_data.csv')
simcoe = pd.read_csv(f'{DATA_DIR}/simcoe_york_data.csv')

In [16]:
#List dataframes to add to james_bay
dfs = [labrador, simcoe]

#Stack all regions row-wise. pd.concat replaces DataFrame.append, which was
#deprecated in pandas 1.4 and removed in pandas 2.0. As with append, the
#original row labels of each frame are kept (no ignore_index).
merged_data = pd.concat([james_bay, *dfs])

#Print shape and check new data
print(merged_data.shape)
merged_data.head()

(8917500, 13)


Unnamed: 0,B01,B06,B11,B07,B05,B03,B09,B12,B8A,B04,B08,B02,Class
0,0.0055,0.0017,0.0009,0.0016,0.0026,0.0056,0.0001,0.0009,0.0003,0.0032,0.0012,0.0114,0
1,0.0054,0.002,0.0009,0.0017,0.0021,0.006,0.0001,0.0005,0.0007,0.0028,0.0014,0.0115,0
2,0.0058,0.0018,0.0007,0.0016,0.0028,0.006,0.0001,0.0008,0.0007,0.003,0.0012,0.0124,0
3,0.0057,0.0016,0.0007,0.0016,0.0023,0.0058,0.0001,0.0012,0.0006,0.0031,0.0011,0.0119,0
4,0.0066,0.0013,0.0008,0.002,0.0029,0.0059,0.0001,0.0008,0.0005,0.003,0.0014,0.0115,0


In [17]:
#Show how many rows carry each class label
print(merged_data['Class'].value_counts())

15    2836714
18    1050988
1      677877
5      666952
16     646870
13     612009
8      575175
17     512930
12     415493
6      378274
2      151560
11     135890
14      94850
10      86338
19      74619
0         961
Name: Class, dtype: int64


In [18]:
#Keep only rows whose label is not 0 (Class 0 marks null values; the original
#note says these equate to class 18 - TODO confirm)
merged_data = merged_data.loc[merged_data['Class'].ne(0)]

#Confirm that label 0 no longer appears in the counts
merged_data['Class'].value_counts()

15    2836714
18    1050988
1      677877
5      666952
16     646870
13     612009
8      575175
17     512930
12     415493
6      378274
2      151560
11     135890
14      94850
10      86338
19      74619
Name: Class, dtype: int64

In [19]:
#Draw 25 000 rows from every class so the training data is balanced.
#DataFrameGroupBy.sample keeps the 'Class' column in the result (groupby.apply
#drops grouping columns in newer pandas) and the fixed random_state makes the
#subsample reproducible across kernel restarts.
sample = (
    merged_data
    .groupby('Class')
    .sample(n=25000, random_state=42)
    .reset_index(drop=True)
)
sample.head(4)

Unnamed: 0,B01,B06,B11,B07,B05,B03,B09,B12,B8A,B04,B08,B02,Class
0,0.0121,0.121,0.1029,0.1366,0.0603,0.0338,0.1531,0.0538,0.1508,0.0245,0.1443,0.0203,1
1,0.017,0.1392,0.1671,0.1553,0.0881,0.0462,0.1764,0.1046,0.1759,0.0478,0.1694,0.0327,1
2,0.0202,0.1308,0.1318,0.1462,0.0802,0.0444,0.1608,0.0748,0.1588,0.0408,0.1527,0.0302,1
3,0.0191,0.1189,0.124,0.1332,0.0705,0.0399,0.1473,0.0743,0.146,0.0382,0.1414,0.0287,1


# Split data

In [20]:
#Separate the predictor columns from the target labels
X = sample.drop(columns='Class')
y = sample['Class']

#Hold out 30% of the rows as a test set; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Create Pipelines

In [6]:
#Functions for Pipeline
def add_layers(df):
  """
  Derive four normalised-difference indices from the band columns.

  Parameters
  ----------
  df : pandas.DataFrame
      Must contain the band columns B03, B04, B08, B8A and B11.

  Returns
  -------
  pandas.DataFrame
      A new dataframe with every input column plus NDVI, Moisture, NDWI and
      NDSI (appended in that order). The input dataframe is left untouched,
      so running the pipeline does not silently alter X_train / X_test.

  Notes
  -----
  A zero denominator yields inf (or NaN for 0/0); those values are not
  handled here and are expected to be cleaned by a later step.
  """
  return df.assign(
      #NDVI = (B08-B04)/(B08+B04)
      NDVI=(df['B08'] - df['B04']) / (df['B08'] + df['B04']),
      #Moisture index = (B8A-B11)/(B8A+B11)
      Moisture=(df['B8A'] - df['B11']) / (df['B8A'] + df['B11']),
      #NDWI = (B03-B08)/(B03+B08)
      NDWI=(df['B03'] - df['B08']) / (df['B03'] + df['B08']),
      #NDSI = (B03-B11)/(B03+B11)
      NDSI=(df['B03'] - df['B11']) / (df['B03'] + df['B11']),
  )

def replace_values(df, fill_value=999):
  """
  Replace infinite and missing values with a finite sentinel.

  Parameters
  ----------
  df : pandas.DataFrame
      Frame that may contain +inf, -inf or NaN.
  fill_value : scalar, default 999
      Value written wherever an infinity or NaN was found.

  Returns
  -------
  pandas.DataFrame
      A new dataframe with no inf/NaN entries; the input is not modified.
  """
  #replace()/fillna() return None when called with inplace=True, so the
  #results must be chained rather than re-assigned from in-place calls.
  without_inf = df.replace([np.inf, -np.inf], np.nan)

  #Infinities are NaN now, so a single fillna covers both cases
  return without_inf.fillna(fill_value)


#Define object transformer class
class objectTransformer:
    """
    Stateless transformer that applies a plain function inside a Pipeline.

    The wrapped function receives X and its return value is passed on to the
    next step. Nothing is learned during fit.

    get_params / set_params follow the scikit-learn estimator convention so
    that the object can be copied by sklearn.base.clone, which grid search and
    cross-validation rely on.

    Parameters
    ----------
    func : callable
        Function taking X and returning the transformed X.
    """
    def __init__(self, func):
        self.func = func

    def get_params(self, deep=True):
        """Return the constructor arguments as a dict (deep is accepted for API compatibility)."""
        return {'func': self.func}

    def set_params(self, **params):
        """Update constructor arguments; returns self. Unknown names raise ValueError."""
        for name, value in params.items():
            if name != 'func':
                raise ValueError(f"Invalid parameter {name!r} for objectTransformer")
            self.func = value
        return self

    def fit(self, X, y=None, **fit_params):
        """Nothing to learn; returns self so calls can be chained."""
        return self

    def transform(self, X, **fit_params):
        """Apply the wrapped function to X and return its result."""
        return self.func(X)

    def fit_transform(self, X, y=None, **fit_params):
        """Equivalent to fit(X, y).transform(X)."""
        return self.fit(X, y, **fit_params).transform(X)

In [8]:
#Pre-processing pipeline: first derive the index columns, then clean the values
preprocess = Pipeline(steps=[
    ("add_layers", objectTransformer(add_layers)),
    ("replace_values", objectTransformer(replace_values)),
])

#Feature union: three principal components side by side with the three best-scoring columns
feature_union = FeatureUnion(transformer_list=[
    ("PCA", PCA(n_components=3)),
    ("kKBest", SelectKBest(k=3)),
])

In [None]:
#Define pipeline: shared pre-processing followed by a random forest classifier.
#The fixed random_state keeps the fitted forest reproducible.
#TODO: add further model pipelines here if a model comparison is intended.
model_1 = Pipeline([('preprocessing', preprocess),
                    ('RFC', RandomForestClassifier(random_state = 123))])