In [None]:
from osgeo import gdal
import ogr
import matplotlib.pyplot as plt
import numpy as np
import fiona
import xgboost
import pandas as pd

# Seed NumPy's global random number generator so repeated runs are comparable.
np.random.seed(100)

# Load the training / test feature (X) and label (Y) arrays.
# NOTE(review): these are absolute, machine-specific paths; consider moving
# them behind a configurable data directory.
trainX = np.load('/scratch/slums/bl-slums/gt/final-px-tr-2-Xa')
trainY = np.load('/scratch/slums/bl-slums/gt/final-px-tr-2-Ya')
testX = np.load('/scratch/slums/bl-slums/gt/final-px-te-2-Xa')
testY = np.load('/scratch/slums/bl-slums/gt/final-px-te-2-Ya')

# Flatten the label arrays to 1-D, the form passed to fit()/metrics below.
trainY = trainY.ravel()
testY = testY.ravel()

# Fail fast if features and labels are misaligned; every estimator below is
# fitted with (trainX, trainY) and scored with (testX, testY).
assert len(trainX) == len(trainY), 'trainX and trainY have different sample counts'
assert len(testX) == len(testY), 'testX and testY have different sample counts'

# Single-argument call form prints the same text on Python 2 and Python 3.
print('{} {} {} {}'.format(trainX.shape, trainY.shape, testX.shape, testY.shape))

In [None]:
# Sanitize both feature matrices with np.nan_to_num so that no NaN / infinite
# entries reach the estimators fitted below.
trainX, testX = (np.nan_to_num(features) for features in (trainX, testX))

In [None]:
# Utility function to report best scores
# Source: scikit-learn tutorials
def report(results, n_top=3):
    """Print a summary of the best-ranked candidates of a parameter search.

    Parameters
    ----------
    results : mapping
        Must provide the keys 'rank_test_score', 'mean_test_score',
        'std_test_score' and 'params' (the layout of a scikit-learn search
        object's ``cv_results_``).
    n_top : int, optional
        Ranks 1..n_top are printed; candidates tied on a rank are all shown.

    Returns
    -------
    None
        The summary is written to standard output.
    """
    for rank in range(1, n_top + 1):
        # Every candidate holding this rank (ties give more than one index).
        tied = np.flatnonzero(results['rank_test_score'] == rank)
        for idx in tied:
            print("Model with rank: {0}".format(rank))
            mean_score = results['mean_test_score'][idx]
            std_score = results['std_test_score'][idx]
            print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
                mean_score, std_score))
            print("Parameters: {0}".format(results['params'][idx]))
            print("")

In [None]:
#XGBOOST
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from matplotlib import pyplot

# Candidate values explored by the search (5 x 5 = 25 combinations).
learning_rate = [0.2, 0.3, 0.5, 0.7, 0.9]
n_estimators = [1000, 5000, 6000, 7000, 10000]
param_grid = {
    'learning_rate': learning_rate,
    'n_estimators': n_estimators,
}

# Base estimator; only the two parameters above are varied by the search.
model = xgboost.XGBClassifier(objective='binary:logistic', nthread=16)

# Shuffled, stratified 10-fold splitter; also reused by later grid searches.
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=7)

# Candidates are ranked by negative log-loss.
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring="neg_log_loss",
    cv=kfold,
    n_jobs=-1,
)
grid_result = grid_search.fit(trainX, trainY)
report(grid_result.cv_results_)


In [None]:
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
import pickle

# Final XGBoost model. learning_rate and n_estimators are values from the grid
# searched in the previous cell.
# NOTE(review): max_depth=500 was not part of that grid search - confirm it is
# intentional.
xgb = xgboost.XGBClassifier(max_depth=500, n_estimators=1000, nthread=8,
                            objective='binary:logistic', learning_rate=0.7)
xgb.fit(trainX, trainY)

# Evaluate on the held-out test split.
result = xgb.predict(testX)
acc = accuracy_score(testY, result)
cm = confusion_matrix(testY, result)
# NOTE(review): target_names assumes the two label values, in sorted order,
# correspond to 'Building' and 'Other' - confirm against the label encoding.
cr = classification_report(testY, result, target_names=['Building', 'Other'])

# Single-argument print(...) emits the same text on Python 2 and Python 3.
print('Overall accuracy = {}\n'.format(acc))
print('Confusion Matrix \n {}\n'.format(cm))
print('Classification Report \n {}\n'.format(cr))

# Persist the fitted model; the context manager guarantees the file is flushed
# and closed even if pickling raises.
fname = 'VHR-xgboost-2-Cl-model.sav'
with open(fname, 'wb') as model_file:
    pickle.dump(xgb, model_file)

In [None]:
# RANDOM FOREST
from sklearn.ensemble import RandomForestClassifier

# Grid search over 2**6 = 64 random-forest configurations, ranked by negative
# log-loss on the `kfold` splitter created in the XGBoost grid-search cell.
# report() is the helper already defined earlier in this notebook; the
# identical copy that used to be re-declared in this cell was removed so there
# is a single definition to maintain.
rf = RandomForestClassifier(n_estimators = 500)
param_grid = {"max_depth": [3, None],
              "min_samples_split": [2, 3],
              "min_samples_leaf": [1, 3],
              # The grid supplies n_estimators for every candidate.
              "n_estimators": [500, 1000],
              "bootstrap": [True, False],
              "criterion": ["gini", "entropy"]}
grid_search = GridSearchCV(rf, param_grid = param_grid, scoring="neg_log_loss", n_jobs=-1, cv=kfold)
grid_result = grid_search.fit(trainX, trainY)
report(grid_result.cv_results_)



In [None]:
# Final random forest. All explicit settings are values from the grid searched
# in the previous cell; `criterion` is left at the estimator's default.
rf = RandomForestClassifier(n_estimators = 500, bootstrap = False, min_samples_leaf = 1, min_samples_split = 2, max_depth = None)
rf.fit(trainX, trainY)

# Evaluate on the held-out test split.
result = rf.predict(testX)
acc = accuracy_score(testY, result)
cm = confusion_matrix(testY, result)
# NOTE(review): target_names assumes the two label values, in sorted order,
# correspond to 'Building' and 'Other' - confirm against the label encoding.
cr = classification_report(testY, result, target_names=['Building', 'Other'])

# Single-argument print(...) emits the same text on Python 2 and Python 3.
print('Overall accuracy = {}\n'.format(acc))
print('Confusion Matrix \n {}\n'.format(cm))
print('Classification Report \n {}\n'.format(cr))

# Persist the fitted model; the context manager guarantees the file is closed.
fname = 'VHR-rf-2-Cl-model.sav'
with open(fname, 'wb') as model_file:
    pickle.dump(rf, model_file)


In [None]:
# NAIVE BAYES
from sklearn.naive_bayes import GaussianNB

# Gaussian naive Bayes with default settings (no hyper-parameter search).
nb = GaussianNB()
nb.fit(trainX, trainY)

# Evaluate on the held-out test split.
result = nb.predict(testX)
acc = accuracy_score(testY, result)
cm = confusion_matrix(testY, result)
# NOTE(review): target_names assumes the two label values, in sorted order,
# correspond to 'Building' and 'Other' - confirm against the label encoding.
cr = classification_report(testY, result, target_names=['Building', 'Other'])

# Single-argument print(...) emits the same text on Python 2 and Python 3.
print('Overall accuracy = {}\n'.format(acc))
print('Confusion Matrix \n {}\n'.format(cm))
print('Classification Report \n {}\n'.format(cr))

# Persist the fitted model; the context manager guarantees the file is closed.
fname = 'VHR-gnb-2-Cl-model.sav'
with open(fname, 'wb') as model_file:
    pickle.dump(nb, model_file)

In [None]:
# DECISION TREE
from sklearn.tree import DecisionTreeClassifier

# Single decision tree with default settings and a fixed random_state.
dt = DecisionTreeClassifier(random_state=100)
dt.fit(trainX, trainY)

# Evaluate on the held-out test split.
result = dt.predict(testX)
acc = accuracy_score(testY, result)
cm = confusion_matrix(testY, result)
# NOTE(review): target_names assumes the two label values, in sorted order,
# correspond to 'Building' and 'Other' - confirm against the label encoding.
cr = classification_report(testY, result, target_names=['Building', 'Other'])

# Single-argument print(...) emits the same text on Python 2 and Python 3.
print('Overall accuracy = {}\n'.format(acc))
print('Confusion Matrix \n {}\n'.format(cm))
print('Classification Report \n {}\n'.format(cr))

# Persist the fitted model; the context manager guarantees the file is closed.
fname = 'VHR-dt-2-Cl-model.sav'
with open(fname, 'wb') as model_file:
    pickle.dump(dt, model_file)

In [None]:
# KNN
from sklearn.neighbors import KNeighborsClassifier

# Stratified 10-fold splitter for the neighbour-count search. shuffle=True is
# needed for random_state to have any effect (recent scikit-learn releases
# reject random_state without it); this matches the `kfold` splitter used by
# the other grid searches in this notebook.
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=7)
knn = KNeighborsClassifier()
n_neighbors = list(np.arange(3,11,1))  # candidate k values 3..10
params_grid = dict(n_neighbors=n_neighbors)
# The splitter was previously created but never handed to the search, so the
# library default was used instead; pass it explicitly.
knn_grid_search = GridSearchCV(estimator=knn, n_jobs=-1, param_grid=params_grid, cv=cv)
knn_grid_result = knn_grid_search.fit(trainX, trainY)
report(knn_grid_result.cv_results_)

# Final KNN model; n_neighbors=4 is one of the values searched above.
knn = KNeighborsClassifier(n_neighbors = 4)
knn.fit(trainX, trainY)

# Evaluate on the held-out test split.
result = knn.predict(testX)
acc = accuracy_score(testY, result)
cm = confusion_matrix(testY, result)
# NOTE(review): target_names assumes the two label values, in sorted order,
# correspond to 'Building' and 'Other' - confirm against the label encoding.
cr = classification_report(testY, result, target_names=['Building', 'Other'])

# Single-argument print(...) emits the same text on Python 2 and Python 3.
print('Overall accuracy = {}\n'.format(acc))
print('Confusion Matrix \n {}\n'.format(cm))
print('Classification Report \n {}\n'.format(cr))

# Persist the fitted model; the context manager guarantees the file is closed.
fname = 'VHR-knn-2-Cl-model.sav'
with open(fname, 'wb') as model_file:
    pickle.dump(knn, model_file)


In [None]:
# MLP
from sklearn.neural_network import MLPClassifier

mlp = MLPClassifier()

# 3 x 3 x 3 x 3 = 81 candidate configurations. The hidden-layer candidates are
# networks of 4, 5 and 8 layers with 100 units each.
params_grid = dict(
    learning_rate=["constant", "invscaling", "adaptive"],
    hidden_layer_sizes=[(100,) * 4, (100,) * 5, (100,) * 8],
    alpha=[0.0001, 0.00001, 0.01],
    activation=["logistic", "relu", "tanh"],
)

# No `scoring` is given, so the search ranks candidates with the estimator's
# own score method; folds come from the `kfold` splitter defined earlier.
mlp_grid_search = GridSearchCV(
    estimator=mlp,
    param_grid=params_grid,
    n_jobs=-1,
    cv=kfold,
)
mlp_grid_result = mlp_grid_search.fit(trainX, trainY)
report(mlp_grid_result.cv_results_)


In [None]:
# Final MLP. Every explicit setting is one of the values from the grid searched
# in the previous cell.
mlp = MLPClassifier(activation='logistic', learning_rate = 'constant',hidden_layer_sizes=(100, 100, 100, 100), alpha = 0.00001)
mlp.fit(trainX, trainY)

# Evaluate on the held-out test split.
result = mlp.predict(testX)
acc = accuracy_score(testY, result)
cm = confusion_matrix(testY, result)
# NOTE(review): target_names assumes the two label values, in sorted order,
# correspond to 'Building' and 'Other' - confirm against the label encoding.
cr = classification_report(testY, result, target_names=['Building', 'Other'])

# Single-argument print(...) emits the same text on Python 2 and Python 3.
print('Overall accuracy = {}\n'.format(acc))
print('Confusion Matrix \n {}\n'.format(cm))
print('Classification Report \n {}\n'.format(cr))

# Persist the fitted model; the context manager guarantees the file is closed.
fname = 'VHR-mlp-2-Cl-model.sav'
with open(fname, 'wb') as model_file:
    pickle.dump(mlp, model_file)

In [None]:
# ADABOOST
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.ensemble import AdaBoostClassifier

# Shuffled, stratified 10-fold splitter with a fixed seed.
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=7)

adb = AdaBoostClassifier()

# 5 x 5 = 25 combinations of ensemble size and learning rate.
params_grid = {
    'n_estimators': [50, 100, 500, 1000, 5000],
    'learning_rate': [0.01, 0.007, 0.0001, 0.1, 0.0007],
}

# No `scoring` / `n_jobs` are given, so the library defaults apply.
adb_grid_search = GridSearchCV(
    estimator=adb,
    param_grid=params_grid,
    cv=kfold,
)
adb_search_result = adb_grid_search.fit(trainX, trainY)


In [None]:
# Summarize the best-ranked AdaBoost candidates from the grid search above.
report(results=adb_search_result.cv_results_)

In [None]:
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
import pickle

# Final AdaBoost model; both settings are values from the grid searched above.
adb = AdaBoostClassifier(n_estimators=500, learning_rate=0.1)
adb.fit(trainX, trainY)

# Evaluate on the held-out test split.
result = adb.predict(testX)
acc = accuracy_score(testY, result)
cm = confusion_matrix(testY, result)
# NOTE(review): target_names assumes the two label values, in sorted order,
# correspond to 'Building' and 'Other' - confirm against the label encoding.
cr = classification_report(testY, result, target_names=['Building', 'Other'])

# Single-argument print(...) emits the same text on Python 2 and Python 3.
print('Overall accuracy = {}\n'.format(acc))
print('Confusion Matrix \n {}\n'.format(cm))
print('Classification Report \n {}\n'.format(cr))

# Persist the fitted model; the context manager guarantees the file is closed.
fname = 'VHR-adaboost-2-Cl-model.sav'
with open(fname, 'wb') as model_file:
    pickle.dump(adb, model_file)