In [1]:
from osgeo import gdal
import ogr
import matplotlib.pyplot as plt
import numpy as np
import fiona
import xgboost
import pandas as pd

# Set seed
# Seeds NumPy's global random generator before any model is built.
np.random.seed(100)

# Read files
# All four arrays are stored in the same directory; keeping the location in a
# single constant means it only has to be edited in one place.
DATA_DIR = '/scratch/slums/bl-slums/gt'
trainX = np.load(DATA_DIR + '/final-px-tr-6-Xa')
trainY = np.load(DATA_DIR + '/final-px-tr-6-Ya')
testX = np.load(DATA_DIR + '/final-px-te-6-Xa')
testY = np.load(DATA_DIR + '/final-px-te-6-Ya')

# Flatten the label arrays to one dimension.
trainY = trainY.ravel()
testY = testY.ravel()

# A single pre-formatted argument makes print(...) emit the same text under
# both Python 2 and Python 3 (same call form as the `report` helper).
print('{} {} {} {}'.format(trainX.shape, trainY.shape, testX.shape, testY.shape))



(3549, 18) (3549,) (779, 18) (779,)




In [2]:
# Utility function to report best scores
def report(results, n_top=3):
    """Print the best-ranked candidates of a hyper-parameter search.

    Parameters
    ----------
    results : mapping
        Must provide the entries 'rank_test_score', 'mean_test_score',
        'std_test_score' and 'params', each indexable by candidate position
        (the cells below pass a search object's ``cv_results_``).
    n_top : int, optional
        Ranks 1 through ``n_top`` are printed. Every candidate sharing a rank
        is printed; a rank with no candidate prints nothing.
    """
    for rank in range(1, n_top + 1):
        # Positions of all candidates that were assigned this rank.
        matching = np.flatnonzero(results['rank_test_score'] == rank)
        for idx in matching:
            mean_score = results['mean_test_score'][idx]
            std_score = results['std_test_score'][idx]
            chosen_params = results['params'][idx]

            print("Model with rank: {0}".format(rank))
            print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
                mean_score, std_score))
            print("Parameters: {0}".format(chosen_params))
            print("")

In [3]:
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
import pickle
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from matplotlib import pyplot
# Grid search over learning rate and number of boosting rounds for XGBoost.
# NOTE(review): the estimator asks for 16 threads while the search itself runs
# with n_jobs=-1; this may oversubscribe the CPU -- confirm on the target machine.
model = xgboost.XGBClassifier(objective='multi:softmax', nthread=16)

# 5 learning rates x 4 ensemble sizes = 20 candidates, each evaluated on 10 folds.
learning_rate = [0.2, 0.3, 0.5, 0.7, 0.9]
n_estimators = [1000, 5000, 6000, 7000]
param_grid = {
    'learning_rate': learning_rate,
    'n_estimators': n_estimators,
}

# Shuffled, stratified 10-fold splitter with a fixed seed; later search cells
# reuse this `kfold` object.
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=7)
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=kfold,
    scoring="neg_log_loss",
    n_jobs=-1,
)
grid_result = grid_search.fit(trainX, trainY)
report(grid_result.cv_results_)


KeyboardInterrupt: 

In [None]:
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
import pickle
# Fit the XGBoost classifier on the training split, score it on the test
# split, print the metrics and persist the fitted model.
xgb = xgboost.XGBClassifier(
    max_depth=500,
    n_estimators=1000,
    nthread=8,
    objective='multi:softmax',
    learning_rate=0.2,
)

xgb.fit(trainX, trainY)
result = xgb.predict(testX)
acc = accuracy_score(testY, result)
cm = confusion_matrix(testY, result)
# assumes the six label values, in sorted order, correspond to S1..S4, F, O -- TODO confirm
cr = classification_report(testY, result, target_names=['S1', 'S2', 'S3', 'S4', 'F', 'O'])

# Single-argument print(...) produces identical output under Python 2 and 3.
print('Overall accuracy = {}\n'.format(acc))
print('Confusion Matrix \n {}\n'.format(cm))
print('Classification Report \n {}\n'.format(cr))

fname = 'VHR-xgboost-6-Cl-model.sav'
# Use a context manager so the file handle is closed (and flushed) even if
# pickling raises.
with open(fname, 'wb') as model_file:
    pickle.dump(xgb, model_file)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Grid search for the random forest, scored by negative log-loss and
# cross-validated with the `kfold` splitter created in an earlier cell.
rf = RandomForestClassifier(n_estimators=500)

# Two options per hyper-parameter: 2**6 = 64 candidate settings.
param_grid = {
    "n_estimators": [500, 1000],
    "criterion": ["gini", "entropy"],
    "bootstrap": [True, False],
    "max_depth": [3, None],
    "min_samples_split": [2, 3],
    "min_samples_leaf": [1, 3],
}
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    cv=kfold,
    scoring="neg_log_loss",
    n_jobs=-1,
)
grid_result = grid_search.fit(trainX, trainY)
report(grid_result.cv_results_)



In [3]:
from sklearn.ensemble import RandomForestClassifier

# Fit the random forest on the training split, score it on the test split,
# print the metrics and persist the fitted model.
rf = RandomForestClassifier(
    criterion='entropy',
    n_estimators=1000,
    min_samples_split=2,
    max_depth=None,
    n_jobs=-1,
    bootstrap=False,
    min_samples_leaf=1,
)
rf.fit(trainX, trainY)
result = rf.predict(testX)
acc = accuracy_score(testY, result)
cm = confusion_matrix(testY, result)
# assumes the six label values, in sorted order, correspond to S1..S4, F, O -- TODO confirm
cr = classification_report(testY, result, target_names=['S1', 'S2', 'S3', 'S4', 'F', 'O'])

# Single-argument print(...) produces identical output under Python 2 and 3.
print('Overall accuracy = {}\n'.format(acc))
print('Confusion Matrix \n {}\n'.format(cm))
print('Classification Report \n {}\n'.format(cr))

fname = 'VHR-rf-6-Cl-model.sav'
# Use a context manager so the file handle is closed (and flushed) even if
# pickling raises.
with open(fname, 'wb') as model_file:
    pickle.dump(rf, model_file)

Overall accuracy = 0.741976893453

Confusion Matrix 
 [[  0   1  13   0  25   0]
 [  0  19   6   0  35   0]
 [  3   0  35   4  37   0]
 [  0   0  23   5  28   1]
 [  0   7  10   0 184   1]
 [  0   0   1   0   6 335]]

Classification Report 
              precision    recall  f1-score   support

         S1       0.00      0.00      0.00        39
         S2       0.70      0.32      0.44        60
         S3       0.40      0.44      0.42        79
         S4       0.56      0.09      0.15        57
          F       0.58      0.91      0.71       202
          O       0.99      0.98      0.99       342

avg / total       0.72      0.74      0.71       779




In [4]:
from sklearn.naive_bayes import GaussianNB

# Fit Gaussian naive Bayes on the training split, score it on the test split,
# print the metrics and persist the fitted model.
nb = GaussianNB()
nb.fit(trainX, trainY)
result = nb.predict(testX)
acc = accuracy_score(testY, result)
cm = confusion_matrix(testY, result)
# assumes the six label values, in sorted order, correspond to S1..S4, F, O -- TODO confirm
cr = classification_report(testY, result, target_names=['S1', 'S2', 'S3', 'S4', 'F', 'O'])

# Single-argument print(...) produces identical output under Python 2 and 3.
print('Overall accuracy = {}\n'.format(acc))
print('Confusion Matrix \n {}\n'.format(cm))
print('Classification Report \n {}\n'.format(cr))

fname = 'VHR-gnb-6-Cl-model.sav'
# Use a context manager so the file handle is closed (and flushed) even if
# pickling raises.
with open(fname, 'wb') as model_file:
    pickle.dump(nb, model_file)

Overall accuracy = 0.613607188703

Confusion Matrix 
 [[  5   3  28   0   3   0]
 [  1  30  25   0   4   0]
 [ 12   4  57   4   2   0]
 [  8   3  36   7   1   2]
 [  5  57  75   6  57   2]
 [  0   0   1   2  17 322]]

Classification Report 
              precision    recall  f1-score   support

         S1       0.16      0.13      0.14        39
         S2       0.31      0.50      0.38        60
         S3       0.26      0.72      0.38        79
         S4       0.37      0.12      0.18        57
          F       0.68      0.28      0.40       202
          O       0.99      0.94      0.96       342

avg / total       0.69      0.61      0.62       779




In [5]:
from sklearn.tree import DecisionTreeClassifier

# Fit a decision tree (fixed random_state) on the training split, score it on
# the test split, print the metrics and persist the fitted model.
dt = DecisionTreeClassifier(random_state=100)
dt.fit(trainX, trainY)
result = dt.predict(testX)
acc = accuracy_score(testY, result)
cm = confusion_matrix(testY, result)
# assumes the six label values, in sorted order, correspond to S1..S4, F, O -- TODO confirm
cr = classification_report(testY, result, target_names=['S1', 'S2', 'S3', 'S4', 'F', 'O'])

# Single-argument print(...) produces identical output under Python 2 and 3.
print('Overall accuracy = {}\n'.format(acc))
print('Confusion Matrix \n {}\n'.format(cm))
print('Classification Report \n {}\n'.format(cr))

fname = 'VHR-dt-6-Cl-model.sav'
# Use a context manager so the file handle is closed (and flushed) even if
# pickling raises.
with open(fname, 'wb') as model_file:
    pickle.dump(dt, model_file)

Overall accuracy = 0.689345314506

Confusion Matrix 
 [[  5   6   9   0  19   0]
 [  0  22   9   1  28   0]
 [  3   7  35   6  28   0]
 [  4   4  25   9  14   1]
 [  8  24  26   4 139   1]
 [  0   0   0   2  13 327]]

Classification Report 
              precision    recall  f1-score   support

         S1       0.25      0.13      0.17        39
         S2       0.35      0.37      0.36        60
         S3       0.34      0.44      0.38        79
         S4       0.41      0.16      0.23        57
          F       0.58      0.69      0.63       202
          O       0.99      0.96      0.97       342

avg / total       0.69      0.69      0.68       779




In [7]:

from sklearn.neighbors import KNeighborsClassifier


# Fit a 10-nearest-neighbours classifier on the training split, score it on
# the test split, print the metrics and persist the fitted model.
knn = KNeighborsClassifier(n_neighbors=10)
knn.fit(trainX, trainY)
result = knn.predict(testX)
acc = accuracy_score(testY, result)
cm = confusion_matrix(testY, result)
# assumes the six label values, in sorted order, correspond to S1..S4, F, O -- TODO confirm
cr = classification_report(testY, result, target_names=['S1', 'S2', 'S3', 'S4', 'F', 'O'])

# Single-argument print(...) produces identical output under Python 2 and 3.
print('Overall accuracy = {}\n'.format(acc))
print('Confusion Matrix \n {}\n'.format(cm))
print('Classification Report \n {}\n'.format(cr))

fname = 'VHR-knn-6-Cl-model.sav'
# Use a context manager so the file handle is closed (and flushed) even if
# pickling raises.
with open(fname, 'wb') as model_file:
    pickle.dump(knn, model_file)


Overall accuracy = 0.658536585366

Confusion Matrix 
 [[  0   1  11   0  26   1]
 [  1   4  13   0  37   5]
 [  1   0  22   0  55   1]
 [  0   4  20   1  30   2]
 [  2   8  28   1 152  11]
 [  0   0   1   0   7 334]]

Classification Report 
              precision    recall  f1-score   support

         S1       0.00      0.00      0.00        39
         S2       0.24      0.07      0.10        60
         S3       0.23      0.28      0.25        79
         S4       0.50      0.02      0.03        57
          F       0.50      0.75      0.60       202
          O       0.94      0.98      0.96       342

avg / total       0.62      0.66      0.61       779




In [None]:
from sklearn.neural_network import MLPClassifier

# Grid search for the multi-layer perceptron, cross-validated with the `kfold`
# splitter created in an earlier cell. No `scoring` is passed, so the search
# falls back to the estimator's own score method.
mlp = MLPClassifier()

# 3 options on each of 4 axes = 81 candidate settings.
# NOTE(review): per the scikit-learn docs `learning_rate` is only used with
# solver='sgd'; with the default solver this axis may have no effect -- confirm.
params_grid = {
    'activation': ["logistic", "relu", "tanh"],
    'alpha': [0.0001, 0.00001, 0.01],
    'hidden_layer_sizes': [
        (100, 100, 100, 100),
        (100, 100, 100, 100, 100),
        (100, 100, 100, 100, 100, 100, 100, 100),
    ],
    'learning_rate': ["constant", "invscaling", "adaptive"],
}

mlp_grid_search = GridSearchCV(mlp, params_grid, cv=kfold, n_jobs=-1)
mlp_grid_result = mlp_grid_search.fit(trainX, trainY)
report(mlp_grid_result.cv_results_)


In [9]:
from sklearn.neural_network import MLPClassifier
# Fit the multi-layer perceptron (five hidden layers of 100 units) on the
# training split, score it on the test split, print the metrics and persist
# the fitted model.
# NOTE(review): in the recorded run this model never predicted S1-S4 (those
# confusion-matrix columns are all zero); the inputs may need scaling -- confirm.
mlp = MLPClassifier(
    hidden_layer_sizes=(100, 100, 100, 100, 100),
    activation='logistic',
    learning_rate='adaptive',
    alpha=0.00001,
)
mlp.fit(trainX, trainY)
result = mlp.predict(testX)
acc = accuracy_score(testY, result)
cm = confusion_matrix(testY, result)
# assumes the six label values, in sorted order, correspond to S1..S4, F, O -- TODO confirm
cr = classification_report(testY, result, target_names=['S1', 'S2', 'S3', 'S4', 'F', 'O'])

# Single-argument print(...) produces identical output under Python 2 and 3.
print('Overall accuracy = {}\n'.format(acc))
print('Confusion Matrix \n {}\n'.format(cm))
print('Classification Report \n {}\n'.format(cr))

fname = 'VHR-mlp-6-Cl-model.sav'
# Use a context manager so the file handle is closed (and flushed) even if
# pickling raises.
with open(fname, 'wb') as model_file:
    pickle.dump(mlp, model_file)

Overall accuracy = 0.684210526316

Confusion Matrix 
 [[  0   0   0   0  37   2]
 [  0   0   0   0  57   3]
 [  0   0   0   0  79   0]
 [  0   0   0   0  56   1]
 [  0   0   0   0 198   4]
 [  0   0   0   0   7 335]]

Classification Report 
              precision    recall  f1-score   support

         S1       0.00      0.00      0.00        39
         S2       0.00      0.00      0.00        60
         S3       0.00      0.00      0.00        79
         S4       0.00      0.00      0.00        57
          F       0.46      0.98      0.62       202
          O       0.97      0.98      0.98       342

avg / total       0.54      0.68      0.59       779




  'precision', 'predicted', average, warn_for)


In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import AdaBoostClassifier
# Grid search for AdaBoost over ensemble size and learning rate
# (5 x 5 = 25 candidates), using a shuffled, stratified 10-fold split with a
# fixed seed. No `scoring` is passed, so the estimator's own score is used.
kfold = StratifiedKFold(random_state=7, shuffle=True, n_splits=10)
adb = AdaBoostClassifier()
params_grid = {
    'n_estimators': [50, 100, 500, 1000, 5000],
    'learning_rate': [0.01, 0.007, 0.0001, 0.1, 0.0007],
}
adb_grid_search = GridSearchCV(adb, params_grid, cv=kfold)
adb_search_result = adb_grid_search.fit(trainX, trainY)
report(adb_search_result.cv_results_)

In [10]:
from sklearn.ensemble import AdaBoostClassifier
# Fit AdaBoost on the training split, score it on the test split, print the
# metrics and persist the fitted model.
# NOTE(review): in the recorded run this model never predicted S1-S4 (those
# confusion-matrix columns are all zero) -- the chosen settings may be too
# weak for this problem; confirm.
adb = AdaBoostClassifier(n_estimators=100, learning_rate=0.007)
adb.fit(trainX, trainY)
result = adb.predict(testX)
acc = accuracy_score(testY, result)
cm = confusion_matrix(testY, result)
# assumes the six label values, in sorted order, correspond to S1..S4, F, O -- TODO confirm
cr = classification_report(testY, result, target_names=['S1', 'S2', 'S3', 'S4', 'F', 'O'])

# Single-argument print(...) produces identical output under Python 2 and 3.
print('Overall accuracy = {}\n'.format(acc))
print('Confusion Matrix \n {}\n'.format(cm))
print('Classification Report \n {}\n'.format(cr))

fname = 'VHR-adaboost-6-Cl-model.sav'
# Use a context manager so the file handle is closed (and flushed) even if
# pickling raises.
with open(fname, 'wb') as model_file:
    pickle.dump(adb, model_file)

Overall accuracy = 0.662387676508

Confusion Matrix 
 [[  0   0   0   0  39   0]
 [  0   0   0   0  60   0]
 [  0   0   0   0  79   0]
 [  0   0   0   0  57   0]
 [  0   0   0   0 202   0]
 [  0   0   0   0  28 314]]

Classification Report 
              precision    recall  f1-score   support

         S1       0.00      0.00      0.00        39
         S2       0.00      0.00      0.00        60
         S3       0.00      0.00      0.00        79
         S4       0.00      0.00      0.00        57
          F       0.43      1.00      0.61       202
          O       1.00      0.92      0.96       342

avg / total       0.55      0.66      0.58       779


