<a href="https://colab.research.google.com/github/MudSnail/Land_Cover_Classification/blob/main/Simple_Testing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#import base libraries
import pandas as pd
import numpy as np
import copy
import pickle
import cloudpickle

#Sklearn - Pipelines, Training
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_selection import SelectKBest
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler, RobustScaler

#Classification Models
from sklearn.ensemble import RandomForestClassifier

#Model metrics
from sklearn.metrics import accuracy_score, balanced_accuracy_score, classification_report, confusion_matrix, f1_score, roc_auc_score, precision_score, recall_score

#Visualizations
import matplotlib.pyplot as plt
import seaborn as sns

#misc
import warnings
warnings.filterwarnings('ignore')

In [2]:
#Connect to drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
def get_scores(model, x, y):
  """
  Input: model, x = X_test variable, y = y_test variable
  Output: The scores and classification report for the model
  """
  #Predict X_test
  y_prediction = model.predict(x)

  #Print measure scores
  print(f"Accuracy: {accuracy_score(y, y_prediction)}")
  print(f"Balanced Accuracy: {balanced_accuracy_score(y, y_prediction)}")
  print(f"F1 Score: {f1_score(y, y_prediction, average='micro')}")
  print(f"Precision Score: {precision_score(y, y_prediction, average='micro')}")
  print(f"Recall Score: {recall_score(y, y_prediction, average='micro')}")
  print(f"ROC AUC Score: {roc_auc_score(y, model.predict_proba(x),multi_class='ovr', average='macro')}")

  #print full classification report
  print(classification_report(y, y_prediction))

# Test James South Data

In [4]:
#Read in CSVs
james_bay_south = pd.read_csv('/content/drive/MyDrive/AISC/james_bay_south_data.csv')

#Print Shape
print(james_bay_south.shape)

#print class value counts
print(james_bay_south.Class.value_counts())

(1065000, 13)
1     331384
14    270405
2     138715
18    133193
8      84417
5      39308
12     28424
6      27479
10     11193
17       297
16       169
13        16
Name: Class, dtype: int64


In [5]:
#Drop classes 13, 16, 17
values = [13, 16, 17]
james_bay_south = james_bay_south[james_bay_south.Class.isin(values) == False]

#SubSample ~10000 for each class
sample = james_bay_south.groupby('Class').apply(lambda x: x.sample(n=10000)).reset_index(drop=True)
sample.head(4)

Unnamed: 0,B01,B11,B08,B8A,B02,B07,B12,B09,B04,B06,B05,B03,Class
0,0.0199,0.138,0.2072,0.2147,0.0238,0.1941,0.0679,0.216,0.0323,0.1744,0.0864,0.0463,1
1,0.0257,0.1756,0.0994,0.1049,0.0244,0.0935,0.1283,0.1067,0.0428,0.085,0.0599,0.0363,1
2,0.0256,0.1571,0.2372,0.2477,0.0284,0.2223,0.0828,0.248,0.0449,0.1983,0.1078,0.0528,1
3,0.0109,0.0995,0.1755,0.1788,0.0157,0.1637,0.0495,0.1795,0.0214,0.1445,0.061,0.0344,1


In [6]:
#Split the Data
y = sample['Class']
X = sample.drop(['Class'], axis = 1)

#Split into training and test data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=42)

In [7]:
#Instantiate Random Forest
rfc = RandomForestClassifier(random_state=42)

#Fit training data
rfc.fit(X_train, y_train)

#print scores
get_scores(rfc, X_test, y_test)

Accuracy: 0.6063333333333333
Balanced Accuracy: 0.6066490877996946
F1 Score: 0.6063333333333333
Precision Score: 0.6063333333333333
Recall Score: 0.6063333333333333
ROC AUC Score: 0.9193966905454101
              precision    recall  f1-score   support

           1       0.53      0.55      0.54      1990
           2       0.66      0.67      0.66      1971
           5       0.61      0.62      0.61      1981
           6       0.51      0.51      0.51      2042
           8       0.54      0.52      0.53      2025
          10       0.63      0.56      0.59      1993
          12       0.74      0.75      0.75      2061
          14       0.44      0.45      0.44      1982
          18       0.80      0.84      0.82      1955

    accuracy                           0.61     18000
   macro avg       0.61      0.61      0.61     18000
weighted avg       0.61      0.61      0.61     18000



# Test Newfoundland

In [11]:
#Read in CSVs
newfoundland = pd.read_csv('/content/drive/MyDrive/AISC/newfoundland_data.csv')

#Print Shape
print(newfoundland.shape)

#print class value counts
print(newfoundland.Class.value_counts())

(1810000, 13)
8     496206
1     422038
5     320971
18    255995
6     255229
14     53168
12      5205
17       788
16       292
13        87
10        21
Name: Class, dtype: int64


In [12]:
#Drop classes < 10000 pixels
values = [10,12,13,16,17]
newfoundland = newfoundland[newfoundland.Class.isin(values) == False]

#SubSample ~10000 for each class
sample = newfoundland.groupby('Class').apply(lambda x: x.sample(n=10000)).reset_index(drop=True)
sample.head(4)

Unnamed: 0,B09,B8A,B05,B02,B01,B11,B04,B06,B07,B03,B08,B12,Class
0,0.2898,0.2768,0.1065,0.022,0.0238,0.1458,0.0378,0.2201,0.2503,0.0547,0.2855,0.0719,1
1,0.2794,0.2512,0.1116,0.0256,0.0174,0.1618,0.052,0.1861,0.214,0.0436,0.2457,0.0874,1
2,0.2635,0.2633,0.0918,0.0211,0.0135,0.1187,0.0311,0.2028,0.2473,0.0525,0.2657,0.0548,1
3,0.2294,0.2297,0.091,0.0176,0.013,0.1172,0.0301,0.1906,0.2173,0.0433,0.2407,0.0588,1


In [13]:
#Split the Data
y = sample['Class']
X = sample.drop(['Class'], axis = 1)

#Split into training and test data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=42)

In [14]:
#Instantiate Random Forest
model = RandomForestClassifier(random_state=42)

#Fit training data
model.fit(X_train, y_train)

#print scores
get_scores(model, X_test, y_test)

Accuracy: 0.5786666666666667
Balanced Accuracy: 0.5785628363935439
F1 Score: 0.5786666666666667
Precision Score: 0.5786666666666667
Recall Score: 0.5786666666666667
ROC AUC Score: 0.8711248114242621
              precision    recall  f1-score   support

           1       0.45      0.48      0.46      2004
           5       0.55      0.54      0.55      1959
           6       0.54      0.56      0.55      1946
           8       0.60      0.48      0.53      2068
          14       0.55      0.58      0.57      1999
          18       0.78      0.82      0.80      2024

    accuracy                           0.58     12000
   macro avg       0.58      0.58      0.58     12000
weighted avg       0.58      0.58      0.58     12000



# Test Calgary

In [15]:
#Read in CSVs
calgary = pd.read_csv('/content/drive/MyDrive/AISC/calgary_data.csv')

#Print Shape
print(calgary.shape)

#print class value counts
print(calgary.Class.value_counts())

(1157500, 13)
1     387764
15    291201
16    147852
10    137306
8      52635
17     40350
6      34001
5      30197
2      15348
18     15090
14      2802
19      2373
12       423
11       158
Name: Class, dtype: int64


In [16]:
#Drop classes < 10000 pixels
values = [11,12,14,19]
calgary = calgary[calgary.Class.isin(values) == False]

#SubSample ~10000 for each class
sample = calgary.groupby('Class').apply(lambda x: x.sample(n=10000)).reset_index(drop=True)
sample.head(4)

Unnamed: 0,B02,B08,B06,B8A,B03,B11,B01,B05,B07,B09,B12,B04,Class
0,0.0231,0.1775,0.1474,0.1829,0.0398,0.1243,0.0118,0.0646,0.1703,0.1817,0.0667,0.0273,1
1,0.0324,0.1407,0.1178,0.145,0.0409,0.0998,0.0249,0.061,0.135,0.2114,0.0724,0.0333,1
2,0.0174,0.1641,0.1344,0.1676,0.0303,0.091,0.0073,0.0515,0.1565,0.1674,0.0459,0.0201,1
3,0.0187,0.164,0.1299,0.1642,0.0336,0.0975,0.0069,0.0525,0.154,0.166,0.0584,0.0235,1


In [17]:
#Split the Data
y = sample['Class']
X = sample.drop(['Class'], axis = 1)

#Split into training and test data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=42)

In [18]:
#Instantiate Random Forest
model2 = RandomForestClassifier(random_state=42)

#Fit training data
model2.fit(X_train, y_train)

#print scores
get_scores(model2, X_test, y_test)

Accuracy: 0.54545
Balanced Accuracy: 0.545466802048538
F1 Score: 0.54545
Precision Score: 0.54545
Recall Score: 0.54545
ROC AUC Score: 0.8850075375561477
              precision    recall  f1-score   support

           1       0.60      0.64      0.62      2005
           2       0.57      0.69      0.62      2013
           5       0.46      0.52      0.49      1977
           6       0.48      0.53      0.50      2038
           8       0.33      0.24      0.28      2002
          10       0.32      0.27      0.29      2000
          15       0.54      0.65      0.59      1985
          16       0.76      0.74      0.75      2002
          17       0.61      0.63      0.62      1992
          18       0.77      0.55      0.64      1986

    accuracy                           0.55     20000
   macro avg       0.54      0.55      0.54     20000
weighted avg       0.54      0.55      0.54     20000



# Test Trois, Qc

In [19]:
#Read in CSVs
trois = pd.read_csv('/content/drive/MyDrive/AISC/trois_data.csv')

#Print Shape
print(trois.shape)

#print class value counts
print(trois.Class.value_counts())

(820000, 13)
6     271199
5     221241
15    153148
18     59871
1      50155
17     44434
14      7515
8       5414
16      5168
10      1855
Name: Class, dtype: int64


In [20]:
#Drop classes < 10000 pixels
values = [8, 10,14,16]
trois = trois[trois.Class.isin(values) == False]

#SubSample ~10000 for each class
sample = trois.groupby('Class').apply(lambda x: x.sample(n=10000)).reset_index(drop=True)
sample.head(4)

Unnamed: 0,B01,B04,B05,B08,B12,B03,B06,B07,B02,B09,B11,B8A,Class
0,0.0097,0.017,0.0613,0.2484,0.0444,0.0319,0.19,0.2267,0.0118,0.2563,0.1044,0.2492,1
1,0.0092,0.0216,0.0806,0.284,0.055,0.0382,0.2236,0.2589,0.0136,0.29,0.1274,0.2819,1
2,0.0101,0.0173,0.0625,0.2259,0.0404,0.0328,0.1803,0.2093,0.0134,0.2347,0.0929,0.2296,1
3,0.009,0.0151,0.0561,0.1844,0.0279,0.0322,0.1474,0.1677,0.0123,0.1789,0.0652,0.1807,1


In [21]:
#Split the Data
y = sample['Class']
X = sample.drop(['Class'], axis = 1)

#Split into training and test data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=42)

In [23]:
#Instantiate Random Forest
model3 = RandomForestClassifier(random_state=42)

#Fit training data
model3.fit(X_train, y_train)

#print scores
get_scores(model3, X_test, y_test)

Accuracy: 0.6845833333333333
Balanced Accuracy: 0.6820587002125053
F1 Score: 0.6845833333333333
Precision Score: 0.6845833333333333
Recall Score: 0.6845833333333333
ROC AUC Score: 0.9182781444692569
              precision    recall  f1-score   support

           1       0.62      0.69      0.66      2004
           5       0.57      0.58      0.57      1959
           6       0.46      0.42      0.44      1946
          15       0.77      0.78      0.78      2068
          17       0.76      0.73      0.74      1999
          18       0.92      0.89      0.90      2024

    accuracy                           0.68     12000
   macro avg       0.68      0.68      0.68     12000
weighted avg       0.68      0.68      0.68     12000



# Test Medicine Hat, SK

In [24]:
#Read in CSVs
sask = pd.read_csv('/content/drive/MyDrive/AISC/saskatchewan_data.csv')

#Print Shape
print(sask.shape)

#print class value counts
print(sask.Class.value_counts())

(2043750, 13)
1     672677
18    518343
5     347911
8     286962
6     118658
10     79017
14     14771
16      3552
17      1785
2         74
Name: Class, dtype: int64


In [25]:
#Drop classes < 10000 pixels
values = [2,16,17]
sask = sask[sask.Class.isin(values) == False]

#SubSample ~10000 for each class
sample = sask.groupby('Class').apply(lambda x: x.sample(n=10000)).reset_index(drop=True)
sample.head(4)

Unnamed: 0,B08,B02,B07,B03,B11,B05,B09,B01,B06,B8A,B12,B04,Class
0,0.1835,0.0184,0.1753,0.037,0.1238,0.0738,0.13,0.0082,0.157,0.1868,0.0703,0.032,1
1,0.1358,0.011,0.1299,0.0247,0.0547,0.0429,0.1423,0.008,0.113,0.1371,0.0237,0.0138,1
2,0.1338,0.014,0.1275,0.0296,0.0899,0.0517,0.0865,0.0098,0.1127,0.1425,0.0522,0.0214,1
3,0.2014,0.0225,0.1789,0.041,0.1285,0.0901,0.1392,0.0091,0.1631,0.2044,0.0669,0.037,1


In [26]:
#Split the Data
y = sample['Class']
X = sample.drop(['Class'], axis = 1)

#Split into training and test data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=42)

In [27]:
#Instantiate Random Forest
model4 = RandomForestClassifier(random_state=123)

#Fit training data
model4.fit(X_train, y_train)

#print scores
get_scores(model4, X_test, y_test)

Accuracy: 0.5648571428571428
Balanced Accuracy: 0.5657712548361392
F1 Score: 0.5648571428571428
Precision Score: 0.5648571428571428
Recall Score: 0.5648571428571428
ROC AUC Score: 0.8640380891902942
              precision    recall  f1-score   support

           1       0.49      0.54      0.52      2008
           5       0.44      0.50      0.47      1980
           6       0.43      0.44      0.44      1961
           8       0.38      0.33      0.35      2052
          10       0.70      0.69      0.70      2037
          14       0.68      0.56      0.61      1997
          18       0.85      0.89      0.87      1965

    accuracy                           0.56     14000
   macro avg       0.57      0.57      0.56     14000
weighted avg       0.57      0.56      0.56     14000

