# Cloud detection

In [None]:
import tensorflow as tf
import numpy as np
import random
import matplotlib.pyplot as plt
import pandas as pd
import gzip

%matplotlib inline

## Data preprocessing

#### Assumption

Data is given in such a format that each data row represents one pixel in one satellite image. One row consists of an index column, the label associated with the particular pixel, and 13 band values (in this column order).

#### Let's load the training data

In [None]:
# First satellite image.
# The file is read through a gzip stream, i.e. it is gzip-compressed even
# though its name ends in .csv; pandas parses the decompressed bytes.
with gzip.open('PNL_A018780_20201010_60m.csv', 'rb') as gzip_fd1:
    data1 = pd.read_csv(gzip_fd1)

# Check print
#print(data1.iloc[0])

In [None]:
# Second satellite image.
# Same layout as the first one: a gzip-compressed CSV that is decompressed
# on the fly while pandas parses it.
with gzip.open('PQM_A018880_20201017_60m.csv', 'rb') as gzip_fd2:
    data2 = pd.read_csv(gzip_fd2)

# Check print
#print(data2.iloc[0])

In [None]:
# Stack the pixel rows of the separately downloaded training images into one frame.
# ignore_index=True renumbers the rows so the row labels of the two images do not clash.
# NOTE. More images can be downloaded and appended to this list to build a larger training set.
training_frames = [data1, data2]
data = pd.concat(training_frames, ignore_index=True)

In [None]:
# Column layout used throughout the notebook:
#   column 0  -> index column (indpd)
#   column 1  -> label        (yallpd)
#   column 2+ -> features     (xallpd)
indpd, yallpd = data.iloc[:, 0], data.iloc[:, 1]
xallpd = data.iloc[:, 2:]

# Check prints
#print(xallpd.iloc[3])
#print(indpd.iloc[3])
#print(yallpd.iloc[3])
#print(xallpd.shape)
#print(yallpd.shape)

Let's transform the pandas dataframes into numpy arrays and convert the labels from 1/2 to 0/1.

In [None]:
# Convert the pandas objects into 2-D numpy arrays (one row per pixel).
xall = pd.DataFrame(xallpd).to_numpy()
indall = pd.DataFrame(indpd).to_numpy()
# copy=True is deliberate: the labels are rewritten in place below.
# Without a private copy, to_numpy() may hand back a view of `data`. Then the
# remapping either leaks back into the DataFrame (so remapping `yallpd` again
# later, or re-running this cell, turns every label into 0) or fails with a
# "read-only" error when pandas Copy-on-Write is active.
yall = pd.DataFrame(yallpd).to_numpy(copy=True)

# Check prints
#print(np.shape(xall))
#print(np.shape(yall))

#print(np.count_nonzero(yall==1))
#print(np.count_nonzero(yall==2))

# Remap the labels 1/2 -> 0/1. The order matters: 1 -> 0 must run first,
# otherwise the freshly written 1s would be turned into 0 as well.
yall[yall==1] = 0
yall[yall==2] = 1

#print(np.count_nonzero(yall==0))
#print(np.count_nonzero(yall==1))

<font size=4 color='blue'>NOTE.</font>
    
    0 = no cloud pixel
    1 = cloud pixel

#### Let's load the test data

In [None]:
# Test satellite image: a gzip-compressed CSV, decompressed on the fly
# while pandas parses it.
with gzip.open('PNM_A027474_20200925T082711_60m.csv', 'rb') as gzip_fd3:
    data3 = pd.read_csv(gzip_fd3)

# Check print
#print(data3.iloc[0])

In [None]:
# Same column layout as the training data:
#   column 0 -> index column, column 1 -> label, column 2+ -> features
indpd_test, yallpd_test = data3.iloc[:, 0], data3.iloc[:, 1]
xallpd_test = data3.iloc[:, 2:]

# Check prints
#print(xallpd_test.iloc[3])
#print(indpd_test.iloc[3])
#print(yallpd_test.iloc[3])
#print(xallpd_test.shape)
#print(yallpd_test.shape)

Let's transform the pandas dataframes into numpy arrays and convert the labels from 1/2 to 0/1.

In [None]:
# Convert the pandas objects into 2-D numpy arrays (one row per pixel).
xall_test = pd.DataFrame(xallpd_test).to_numpy()
indall_test = pd.DataFrame(indpd_test).to_numpy()
# copy=True is deliberate: the labels are rewritten in place below.
# Without a private copy, to_numpy() may hand back a view of `data3`. Then the
# remapping either leaks back into the DataFrame (so remapping `yallpd_test`
# again later turns every label into 0) or fails with a "read-only" error
# when pandas Copy-on-Write is active.
yall_test = pd.DataFrame(yallpd_test).to_numpy(copy=True)

# Check prints
#print(np.shape(xall_test))
#print(np.shape(yall_test))

#print(np.count_nonzero(yall_test==1))
#print(np.count_nonzero(yall_test==2))

# Remap the labels 1/2 -> 0/1. The order matters: 1 -> 0 must run first,
# otherwise the freshly written 1s would be turned into 0 as well.
yall_test[yall_test==1] = 0
yall_test[yall_test==2] = 1

#print(np.count_nonzero(yall_test==0))
#print(np.count_nonzero(yall_test==1))

### Principal Component Analysis

If we could derive predictions from only three principal components instead of all 13 features, the running times could in theory decrease considerably (the number of input features drops from 13 to 3, i.e. by roughly three quarters).

#### For training data

In [None]:
from sklearn.decomposition import PCA

# Fit a 3-component PCA on the training features and project them in one step.
pca = PCA(n_components=3)

# From here on `xall` holds the projected features (kept under `pcacomp` too,
# which the plotting cells read), so later cells work on 3 columns.
xall = pcacomp = pca.fit_transform(xall)

<font size=4 color='blue'>Consider whether you really want to produce these figures!</font>

<font size=4 color='blue'>Running these scripts is much slower than calculating the principal components.</font>

In [None]:
import seaborn as sns

# Plot 1 of the PCA results: component 1 against component 2, one dot per pixel,
# coloured by the pixel's label.
palette = sns.xkcd_rgb
color_mapping = {0: palette['bright purple'], 1: palette['lime'], 2: palette['ochre']}
colors = [color_mapping[label] for label in yall.flatten()]

comp_x, comp_y = pcacomp[:, 0], pcacomp[:, 1]
plt.scatter(comp_x, comp_y, s=20, c=colors)
plt.title("3 component PCA")
plt.xlabel("Pca comp 1")
plt.ylabel("Pca comp 2")
plt.show()

In [None]:
# Plot 2 of the PCA results: component 1 against component 3, one dot per pixel,
# coloured by the pixel's label.
palette = sns.xkcd_rgb
color_mapping = {0: palette['bright purple'], 1: palette['lime'], 2: palette['ochre']}
colors = [color_mapping[label] for label in yall.flatten()]

comp_x, comp_y = pcacomp[:, 0], pcacomp[:, 2]
plt.scatter(comp_x, comp_y, s=20, c=colors)
plt.title("3 component PCA")
plt.xlabel("Pca comp 1")
plt.ylabel("Pca comp 3")
plt.show()

In [None]:
# Plot 3 of the PCA results: component 2 against component 3, one dot per pixel,
# coloured by the pixel's label.
palette = sns.xkcd_rgb
color_mapping = {0: palette['bright purple'], 1: palette['lime'], 2: palette['ochre']}
colors = [color_mapping[label] for label in yall.flatten()]

comp_x, comp_y = pcacomp[:, 1], pcacomp[:, 2]
plt.scatter(comp_x, comp_y, s=20, c=colors)
plt.title("3 component PCA")
plt.xlabel("Pca comp 2")
plt.ylabel("Pca comp 3")
plt.show()

#### For test data

In [None]:
# Project the test data onto the principal components learned from the
# TRAINING data. A PCA fitted separately on the test set would produce its own
# axes (different directions, signs and ordering), so the three test columns
# would not mean the same thing as the three columns the classifiers are
# trained on.
# `pca_test` is kept as an alias so code referring to it keeps working.
pca_test = pca
pcacomp_test = pca.transform(xall_test)

xall_test = pcacomp_test

### Standardization

#### For training data

In [None]:
from sklearn import preprocessing
import random

# Glue features | index | label into one matrix and shuffle its rows, so that
# every pixel keeps its own index and label after the shuffle.
alltog = np.hstack((xall, indall, yall))
np.random.shuffle(alltog)

# Take the matrix apart again: last column = label, second to last = index,
# everything before that = features.
xall, indall, yall = alltog[:, :-2], alltog[:, -2], alltog[:, -1]

x_train, y_train = xall, yall

# Standardize the training features; the fitted scaler stays available as `scaler`.
scaler = preprocessing.StandardScaler()
scaler.fit(x_train)
x_train = scaler.transform(x_train)

# Check prints
#print(np.shape(x_train))
#print(np.shape(y_train))

#### For test data

In [None]:
# Let's shuffle the data.
# Features, index and label are joined into one matrix first so that each
# pixel keeps its own index and label after the shuffle.
alltog1_test = np.append(xall_test, indall_test, axis=1)
alltog_test = np.append(alltog1_test, yall_test, axis=1)
np.random.shuffle(alltog_test)

# Last column = label, second to last = index, the rest = features
xall_test = alltog_test[:,:-2]
indall_test = alltog_test[:,-2]
yall_test = alltog_test[:,-1]

x_test = xall_test
y_test = yall_test

# Scale the test features with the mean / std learned from the TRAINING data.
# Fitting a separate scaler on the test set would map the two sets onto
# different scales, so the classifiers would see test inputs that are shifted
# relative to what they were trained on.
# `scaler2` is kept as an alias so code referring to it keeps working.
scaler2 = scaler
x_test = scaler.transform(x_test)

# Check prints
#print(np.shape(x_test))
#print(np.shape(y_test))

## Classifier 1: K Nearest Neighbours <font color='orange'>-QUITE QUICK</font>

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# k = 8 echoes the 8 pixels that surround a pixel in an image; note that the
# classifier looks for the nearest samples in feature space, not in the image.
# Try out different values and see how it affects prediction accuracy!
knn = KNeighborsClassifier(n_neighbors=8)
knn.fit(x_train, y_train.flatten())
knnpre = knn.predict(x_test)

# Two columns per test pixel: predicted label, index
n_test = np.shape(x_test)[0]
y_knnpre = np.hstack((knnpre.reshape((n_test, 1)), indall_test.reshape((n_test, 1))))

# Accuracy on the training data first, then on the test data
for features, labels in ((x_train, y_train), (x_test, y_test)):
    print(knn.score(features, labels))

# Let's save the obtained predictions into a txt file
np.savetxt('knnclouds.txt', y_knnpre, delimiter=',')

In [None]:
# Sanity check: print how many pixels got label 0 and how many got label 1,
# to make sure the classifier is not predicting a single label everywhere.
for label in (0, 1):
    print(np.count_nonzero(knnpre == label))

## Classifier 2: Ridge regression <font color='green'>-QUICK</font>

In [None]:
from sklearn.linear_model import RidgeClassifier

# Regularization strength; 0.1 is quite a typical choice.
# Try out different values and see how it affects prediction accuracy!
alp = 1e-1

rire = RidgeClassifier(alpha=alp)
rire.fit(x_train, y_train.flatten())
ripre = rire.predict(x_test)

# Two columns per test pixel: predicted label, index
n_test = np.shape(x_test)[0]
y_ripre = np.hstack((ripre.reshape((n_test, 1)), indall_test.reshape((n_test, 1))))

# Accuracy on the training data first, then on the test data
for features, labels in ((x_train, y_train), (x_test, y_test)):
    print(rire.score(features, labels))

# Let's save the obtained predictions into a txt file
np.savetxt('ridgeclouds.txt', y_ripre, delimiter=',')

In [None]:
# Sanity check: print how many pixels got label 0 and how many got label 1,
# to make sure the classifier is not predicting a single label everywhere.
for label in (0, 1):
    print(np.count_nonzero(ripre == label))

## Classifier 3: Logistic regression <font color='green'>-QUICK</font>

In [None]:
from sklearn.linear_model import LogisticRegression

# C is the inverse regularization strength, so 1e-6 regularizes very heavily.
# Try out different C values and see how it affects prediction accuracy!
logre = LogisticRegression(C=1e-6)
logre.fit(x_train, y_train.flatten())
logpre = logre.predict(x_test)

# Two columns per test pixel: predicted label, index
n_test = np.shape(x_test)[0]
y_logpre = np.hstack((logpre.reshape((n_test, 1)), indall_test.reshape((n_test, 1))))

# Accuracy on the training data first, then on the test data
for features, labels in ((x_train, y_train), (x_test, y_test)):
    print(logre.score(features, labels))

# Let's save the obtained predictions into a txt file
np.savetxt('logreclouds.txt', y_logpre, delimiter=',')

In [None]:
# Sanity check: print how many pixels got label 0 and how many got label 1,
# to make sure the classifier is not predicting a single label everywhere.
for label in (0, 1):
    print(np.count_nonzero(logpre == label))

## Classifier 4: Support Vector Machines <font color='green'>-QUICK</font>

In [None]:
from sklearn.svm import LinearSVC

# Linear SVM solved in its primal form (dual=False).
# Try out different C values and see how it affects prediction accuracy!
svm = LinearSVC(C=1e-8, dual=False)
svm.fit(x_train, y_train.flatten())
svmpre = svm.predict(x_test)

# Two columns per test pixel: predicted label, index
n_test = np.shape(x_test)[0]
y_svmpre = np.hstack((svmpre.reshape((n_test, 1)), indall_test.reshape((n_test, 1))))

# Accuracy on the training data first, then on the test data
for features, labels in ((x_train, y_train), (x_test, y_test)):
    print(svm.score(features, labels))

# Let's save the obtained predictions into a txt file
np.savetxt('svmclouds.txt', y_svmpre, delimiter=',')

In [None]:
# Sanity check: print how many pixels got label 0 and how many got label 1,
# to make sure the classifier is not predicting a single label everywhere.
for label in (0, 1):
    print(np.count_nonzero(svmpre == label))

## Classifier 4.2: SVM version 2 <font color='red'>-SLOW</font>

In [None]:
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline

# SVC with its default kernel, wrapped in a single-step pipeline.
clf = make_pipeline(SVC(gamma='auto'))

clf.fit(x_train, y_train.flatten())

svmpre2 = clf.predict(x_test)

# Two columns per test pixel: predicted label, index
y_svmpre2 = np.append(svmpre2.reshape((np.shape(x_test)[0],1)), indall_test.reshape((np.shape(x_test)[0],1)), axis=1)

# Accuracy on the training data, then on the test data
print(clf.score(x_train, y_train))
print(clf.score(x_test, y_test))

# Let's save the obtained predictions into a txt file.
# Fixed: this used to write `y_svmpre` (the LinearSVC predictions from the
# previous classifier) instead of the predictions of this model.
np.savetxt('svmclouds2.txt', y_svmpre2, delimiter=',')

In [None]:
# Sanity check: print how many pixels got label 0 and how many got label 1,
# to make sure the classifier is not predicting a single label everywhere.
for label in (0, 1):
    print(np.count_nonzero(svmpre2 == label))

<font size=4 color='blue'>Works well only with a small amount of data!</font>

## Classifier 5: Multilayer Perceptron <font color='red'>-SLOW / DIES DURING EXECUTION</font>

In [None]:
from tensorflow import keras
from keras.callbacks import EarlyStopping
from keras.models import Sequential
from keras.layers import Dense

# Let's make results reproducible.
# The seeds are set through `np` / `tf` directly: `from tensorflow import random`
# would rebind the name `random` and hide the standard-library module that is
# imported at the top of the notebook.
np.random.seed(1)
tf.random.set_seed(2)

tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)

# Stop training once the monitored validation metric has not improved for 5 epochs
early_stopping_monitor = EarlyStopping(patience=5)

# Small network: one hidden layer with 2 ReLU units and a sigmoid output unit
model = Sequential()
n_cols = np.shape(x_train)[1]
model.add(Dense(2, activation='relu', input_shape=(n_cols,)))
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
print("fit begins")
# `use_multiprocessing` is intentionally not passed: it only ever applied to
# generator / Sequence inputs (the data here are plain arrays), and newer Keras
# versions reject the argument altogether.
model.fit(x_train, y_train, epochs=32, batch_size=12, validation_split=0.2, callbacks=[early_stopping_monitor],
          verbose=0)
print("predicting begins")
# NOTE: mlppre holds the sigmoid outputs (values between 0 and 1), not hard
# 0/1 labels; 0.5 is the threshold used below when the labels are counted.
mlppre = model.predict(x_test)

# Two columns per test pixel: network output, index
y_mlppre = np.append(mlppre.reshape((np.shape(x_test)[0],1)), indall_test.reshape((np.shape(x_test)[0],1)), axis=1)

# Check prints
#print(mlppre[:10])
#print(y_test.flatten()[:10])

# Naive implementation for calculating accuracy
#mlpsum = sum((x >= 0.5 and y==1) or (x<0.5 and y==0) for x, y in zip(mlppre, y_test.flatten()))
#mlpacc = mlpsum / float(len(mlppre))
#print(mlpacc)

# evaluate() returns [loss, accuracy]; print the accuracy for train and test
print(model.evaluate(x_train, y_train)[1])
print(model.evaluate(x_test, y_test)[1])

# Let's save the obtained predictions into a txt file
np.savetxt('mlpclouds.txt', y_mlppre, delimiter=',')

In [None]:
# Sanity check: the network outputs are thresholded at 0.5; print how many
# pixels fall below and how many at or above it, to make sure the network is
# not predicting a single label everywhere.
below_threshold = mlppre < 0.5
at_or_above_threshold = mlppre >= 0.5
print(np.count_nonzero(below_threshold))
print(np.count_nonzero(at_or_above_threshold))

##  Classifier 5.2: MLP version 2 <font color='red'>-SLOW / DIES DURING EXECUTION</font>

In [None]:
from tensorflow import keras
from keras.callbacks import EarlyStopping
from keras.models import Sequential
from keras.layers import Dense

# Let's make results reproducible.
# The seeds are set through `np` / `tf` directly: `from tensorflow import random`
# would rebind the name `random` and hide the standard-library module that is
# imported at the top of the notebook.
np.random.seed(1)
tf.random.set_seed(2)

tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)

# Let's try without PCA and standardization: rebuild the arrays straight from
# the pandas objects.
# The label arrays are taken with copy=True because they are rewritten in
# place below; without a private copy the array may be a view of the source
# DataFrame (the edit would leak into it, or fail as "read-only" when pandas
# Copy-on-Write is active).
x_train2 = pd.DataFrame(xallpd).to_numpy()
y_train2 = pd.DataFrame(yallpd).to_numpy(copy=True)

x_test2 = pd.DataFrame(xallpd_test).to_numpy()
y_test2 = pd.DataFrame(yallpd_test).to_numpy(copy=True)

# Indices in the same (unshuffled) row order as x_test2
ind_test2 = pd.DataFrame(indpd_test).to_numpy()

# Labels 1/2 -> 0/1 (1 -> 0 must run before 2 -> 1)
y_train2[y_train2==1] = 0
y_train2[y_train2==2] = 1
y_test2[y_test2==1] = 0
y_test2[y_test2==2] = 1

# Stop training once the monitored validation metric has not improved for 5 epochs
early_stopping_monitor = EarlyStopping(patience=5)

# Two hidden ReLU layers (8 and 4 units) and a sigmoid output unit
model2 = Sequential()
n_cols2 = np.shape(x_train2)[1]
model2.add(Dense(8, activation='relu', input_shape=(n_cols2,)))
model2.add(Dense(4, activation='relu'))
model2.add(Dense(1, activation='sigmoid'))
model2.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
print("fit begins")
# `use_multiprocessing` is intentionally not passed: it only ever applied to
# generator / Sequence inputs (the data here are plain arrays), and newer Keras
# versions reject the argument altogether.
model2.fit(x_train2, y_train2, epochs=32, batch_size=12, validation_split=0.2, callbacks=[early_stopping_monitor],
          verbose=0)
print("ennustukset alkaa")
# NOTE: mlppre2 holds the sigmoid outputs (values between 0 and 1), not hard labels
mlppre2 = model2.predict(x_test2)

# Two columns per test pixel: network output, index.
# Fixed: the predictions used to be paired with `indall_test`, which was
# shuffled in the standardization cell, while x_test2 is in the original row
# order -- every prediction ended up next to some other pixel's index.
n_test2 = np.shape(x_test2)[0]
y_mlppre2 = np.append(mlppre2.reshape((n_test2,1)), ind_test2.reshape((n_test2,1)), axis=1)

# Check prints
#print(mlppre2[:10])
#print(y_test2.flatten()[:10])

# evaluate() returns [loss, accuracy]; print the accuracy for train and test
print(model2.evaluate(x_train2, y_train2)[1])
print(model2.evaluate(x_test2, y_test2)[1])

# Let's save the obtained predictions into a txt file
np.savetxt('mlpclouds2.txt', y_mlppre2, delimiter=',')

In [None]:
# Sanity check: the network outputs are thresholded at 0.5; print how many
# pixels fall below and how many at or above it, to make sure the network is
# not predicting a single label everywhere.
below_threshold = mlppre2 < 0.5
at_or_above_threshold = mlppre2 >= 0.5
print(np.count_nonzero(below_threshold))
print(np.count_nonzero(at_or_above_threshold))

In MLPs there are exceptionally many parameters that have a crucial effect on the performance of the algorithm. You can test how different choices affect the execution time and the performance level.

Try modifying at least:
- the structure of the neural network (more hidden layers, more / less hidden neurons per layer?)
- number of epochs
- batch_size

Also the choice of optimizer might have a small effect. The other choices should be optimal for this use case.

## Classifier 6: Naive Bayes <font color='green'>-QUICK</font>

In [None]:
from sklearn.naive_bayes import MultinomialNB

# NOTE. This classifier is run on the original columns, i.e. without PCA and
# standardization (presumably because MultinomialNB expects non-negative
# feature values -- verify against the scikit-learn documentation).

# Training data: rebuild the arrays from the pandas objects
xall3 = pd.DataFrame(xallpd).to_numpy()
indall3 = pd.DataFrame(indpd).to_numpy()
yall3 = pd.DataFrame(yallpd).to_numpy()

# Shuffle features | index | label as one matrix so the rows stay aligned
alltog3 = np.hstack((xall3, indall3))
alltog2 = np.hstack((alltog3, yall3))
np.random.shuffle(alltog2)

# Last column = label, second to last = index, the rest = features
xall2, indall2, yall2 = alltog2[:, :-2], alltog2[:, -2], alltog2[:, -1]
x_train2, y_train2 = xall2, yall2

# Test data: same procedure
xall_test3 = pd.DataFrame(xallpd_test).to_numpy()
indall_test3 = pd.DataFrame(indpd_test).to_numpy()
yall_test3 = pd.DataFrame(yallpd_test).to_numpy()

alltog_test3 = np.hstack((xall_test3, indall_test3))
alltog_test2 = np.hstack((alltog_test3, yall_test3))
np.random.shuffle(alltog_test2)

xall_test2, indall_test2, yall_test2 = alltog_test2[:, :-2], alltog_test2[:, -2], alltog_test2[:, -1]
x_test2, y_test2 = xall_test2, yall_test2

# Convert the labels from 1/2 to 0/1, in place (1 -> 0 must run before 2 -> 1)
for label_column in (y_train2, y_test2):
    label_column[label_column == 1] = 0
    label_column[label_column == 2] = 1

mnb = MultinomialNB()
mnb.fit(x_train2, y_train2)
mnbpre = mnb.predict(x_test2)

# Two columns per test pixel: predicted label, index
n_test2 = np.shape(x_test2)[0]
y_mnbpre = np.hstack((mnbpre.reshape((n_test2, 1)), indall_test2.reshape((n_test2, 1))))

# Accuracy on the training data first, then on the test data
for features, labels in ((x_train2, y_train2), (x_test2, y_test2)):
    print(mnb.score(features, labels))

# Let's save the obtained predictions into a txt file
np.savetxt('mnbclouds.txt', y_mnbpre, delimiter=',')

In [None]:
# Sanity check: print how many pixels got label 0 and how many got label 1,
# to make sure the classifier is not predicting a single label everywhere.
for label in (0, 1):
    print(np.count_nonzero(mnbpre == label))

## Classifier 7: Decision tree <font color='green'>-QUICK</font>

In [None]:
from sklearn.tree import DecisionTreeClassifier

# A fixed random_state makes the results reproducible
tree = DecisionTreeClassifier(random_state=42)
tree.fit(x_train, y_train)
treepre = tree.predict(x_test)

# Two columns per test pixel: predicted label, index
n_test = np.shape(x_test)[0]
y_treepre = np.hstack((treepre.reshape((n_test, 1)), indall_test.reshape((n_test, 1))))

# Accuracy on the training data first, then on the test data
for features, labels in ((x_train, y_train), (x_test, y_test)):
    print(tree.score(features, labels))

# Let's save the obtained predictions into a txt file
np.savetxt('treeclouds.txt', y_treepre, delimiter=',')

In [None]:
# Sanity check: print how many pixels got label 0 and how many got label 1,
# to make sure the classifier is not predicting a single label everywhere.
for label in (0, 1):
    print(np.count_nonzero(treepre == label))

## Classifier 8: Random forest <font color='orange'>-QUITE QUICK</font>

In [None]:
from sklearn.ensemble import RandomForestClassifier

# 30 trees, each at most 9 levels deep; the fixed random_state makes the
# results reproducible.
# Try out different values for n_estimators and max_depth and see how it affects prediction accuracy!
rf = RandomForestClassifier(n_estimators=30, max_depth=9, random_state=42)
rf.fit(x_train, y_train)
rfpre = rf.predict(x_test)

# Two columns per test pixel: predicted label, index
n_test = np.shape(x_test)[0]
y_rfpre = np.hstack((rfpre.reshape((n_test, 1)), indall_test.reshape((n_test, 1))))

# Accuracy on the training data first, then on the test data
for features, labels in ((x_train, y_train), (x_test, y_test)):
    print(rf.score(features, labels))

# Let's save the obtained predictions into a txt file
np.savetxt('rfclouds.txt', y_rfpre, delimiter=',')

In [None]:
# Sanity check: print how many pixels got label 0 and how many got label 1,
# to make sure the classifier is not predicting a single label everywhere.
for label in (0, 1):
    print(np.count_nonzero(rfpre == label))

## Classifier 9: Voting Classifier (ensemble learning) <font color='green'>-QUICK</font>

In [None]:
from sklearn.ensemble import VotingClassifier

# Majority ('hard') vote over the ridge, logistic regression and linear SVM
# classifiers defined in the earlier cells.
# Try different model combinations and see how it affects prediction accuracy!
voters = [('rire',rire),('logre',logre),('svm',svm)]
evc = VotingClassifier(estimators=voters, voting='hard')
evc.fit(x_train, y_train)
evcpre = evc.predict(x_test)

# Two columns per test pixel: predicted label, index
n_test = np.shape(x_test)[0]
y_evcpre = np.hstack((evcpre.reshape((n_test, 1)), indall_test.reshape((n_test, 1))))

# Accuracy on the training data first, then on the test data
for features, labels in ((x_train, y_train), (x_test, y_test)):
    print(evc.score(features, labels))

# Let's save the obtained predictions into a txt file
np.savetxt('evcclouds.txt', y_evcpre, delimiter=',')

In [None]:
# Sanity check: print how many pixels got label 0 and how many got label 1,
# to make sure the classifier is not predicting a single label everywhere.
for label in (0, 1):
    print(np.count_nonzero(evcpre == label))