In [1]:
from __future__ import print_function, division
%matplotlib inline

# import used packages and modules
# import display function to display the table of the pandas dataframe
from IPython.display import display

from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.preprocessing import StandardScaler
from sklearn.cross_validation import KFold
from sklearn.model_selection import LeaveOneOut, train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from sklearn.svm import SVC
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestClassifier

from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB

from sklearn.metrics import accuracy_score, confusion_matrix

from scipy import signal

from hmmlearn import hmm
from seqlearn.hmm import MultinomialHMM

import datetime
import matplotlib.dates as mdates
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import csv







# Importing Data

In [180]:
train_manchester = pd.read_csv('../data/misc/Train_Manchester_Northwich.csv')
london_data = pd.read_csv('../data/london_data/london_data.csv')

data04 = pd.read_csv('../data/meadows_december/meadows-2017-12-04.csv')
data05 = pd.read_csv('../data/meadows_december/meadows-2017-12-05.csv')
data06 = pd.read_csv('../data/meadows_december/meadows-2017-12-06.csv')
data07 = pd.read_csv('../data/meadows_december/meadows-2017-12-07.csv')

bike_data_old = pd.read_csv('../data/misc/2015061911.csv')
bike_data_old2 = pd.read_csv('../data/misc/2015061817.csv')
bike_data_feb = pd.read_csv('../data/misc/bike_feb.csv')

# Merging Data

In [150]:
bin_vals = ['bin'+str(x) for x in range(0,16)]
pm_vals = ['pm1', 'pm2_5', 'pm10']

In [185]:
# environment index is 0 as all data from these datasets is walking data
data04['environment_index'] = 0
data05['environment_index'] = 0
data06['environment_index'] = 0
data07['environment_index'] = 0

# environment index is 3 for bike data
bike_data_old['environment_index'] = 3
bike_data_old2['environment_index'] = 3
bike_data_feb['environment_index'] = 3

# rename columns
bike_data_old2 = bike_data_old2.rename(
    index=str, columns={"latitude":'gpsLatitude', "longitude":'gpsLongitude'})

labels = bin_vals + pm_vals + ['environment_index', 'gpsLatitude', 'gpsLongitude', 'humidity']

data = pd.concat([london_data[labels],
                  train_manchester[labels], 
                  data04[labels], 
                  data05[labels],
                  data06[labels],
                  data07[labels],
                  bike_data_old[labels],
                  bike_data_old2[labels],
                  bike_data_feb[labels]], ignore_index=True)

# remove indoor labelled data for this set of experiments
data = data[data['environment_index']!=7]
data = data[data['environment_index']!=5]
data = data[data['humidity']>0]

### Data Information

In [186]:
data.describe()

Unnamed: 0,bin0,bin1,bin2,bin3,bin4,bin5,bin6,bin7,bin8,bin9,...,bin13,bin14,bin15,pm1,pm2_5,pm10,environment_index,gpsLatitude,gpsLongitude,humidity
count,3970.0,3970.0,3970.0,3970.0,3970.0,3970.0,3970.0,3970.0,3970.0,3970.0,...,3970.0,3970.0,3970.0,3970.0,3970.0,3970.0,3970.0,3970.0,3970.0,3970.0
mean,417.945088,105.518136,62.766247,37.194458,13.818388,12.804786,7.402519,3.714106,4.95995,3.42267,...,0.219647,13.790176,8.835264,4.445759,9.611883,22.207614,1.902519,54.685102,-2.59924,53.165118
std,941.266986,243.487853,139.392366,74.11423,27.341315,25.552281,27.309803,56.265748,160.067557,151.109326,...,1.506387,682.987493,379.290387,9.143567,12.67498,55.742419,1.460945,5.859238,1.066219,9.991877
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,-3.258313,17.6
25%,14.0,4.0,3.0,2.0,1.0,2.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.265332,3.417454,5.49504,0.0,55.83509,-3.193039,46.299999
50%,135.0,46.0,32.0,19.0,6.0,6.0,3.0,1.0,0.0,0.0,...,0.0,0.0,0.0,2.160299,6.198611,9.479655,2.0,55.942776,-3.186589,53.696285
75%,469.75,114.0,73.0,47.0,17.0,16.0,8.0,2.0,1.0,1.0,...,0.0,0.0,0.0,4.486784,11.249292,19.693625,3.0,55.945289,-2.338986,57.602979
max,12336.0,4758.0,3710.0,1997.0,538.0,434.0,656.0,3343.0,7955.0,9500.0,...,29.0,40962.0,16999.0,108.47161,158.40524,1051.750977,6.0,56.008247,-0.100202,89.8


### Labels

In [187]:
labels_idx = np.array([0, 1, 2, 3, 4, 6])

# Kmeans

In [188]:
# method to get the cluster means of the data points based on coordinates
# and the queried columns
def get_location_cluster_means(data, cluster_no, cols):
    # obtain indices of clusters
    kmeans = KMeans(n_clusters=cluster_no, random_state=0)
    # remove data rows with null coordinates
    data = data[~np.isnan(data['gpsLatitude'])]
    # get the indices of the location based clusters
    indices = kmeans.fit_predict(data[['gpsLatitude', 'gpsLongitude']])
    freqs = np.bincount(indices)
    # initialise means of queried columns
    means = np.zeros((cluster_no, len(cols)))
    # assign clustered_indices to dataframe
    data['clustered_index'] = indices
    # compute means of queried columns
    for index in np.unique(indices):
        means[index] = np.mean(data[data['clustered_index'] == index][cols])
    return means, data

In [189]:
# method to get the clusters of the environments corresponding to the 5 environments based on
# the queried columns and number of location clusters
def get_environment_clusters(data, cluster_no, cols):
    means, d = get_location_cluster_means(data, cluster_no, cols)
    # we have 5 clusters corresponding to 6 different types of transport
    environment_kmeans = KMeans(n_clusters=6, random_state=0)
    # predict the cluster indices
    environment_indices = environment_kmeans.fit_predict(means)
    # sort the indices based on the means of the clusters
    idx = np.argsort(environment_kmeans.cluster_centers_.sum(axis=1))
    lut = np.zeros_like(idx)
    lut[idx] = np.arange(6)
    # append the indices to the dataframe
    d['unsupervised_environment_index'] = lut[environment_indices][d['clustered_index']]
    return d

# Stats Methods

In [190]:
def plot_confusion_matrix(labels, predictions, title='Confusion matrix', 
                          classes=['on_foot', 'car', 'train', 'bike', 'bus', 'underground']):
    """Plots a confusion matrix."""
    cmatrix = np.array(confusion_matrix(labels, predictions))
    # normalize confusion matrix
    cm = cmatrix/cmatrix.sum(axis=1)[:, np.newaxis]
    if classes is not None:
        sns.heatmap(cm, xticklabels=classes, yticklabels=classes, vmin=0., vmax=1., annot=True)
    else:
        sns.heatmap(cm, vmin=0., vmax=1.)
    plt.title(title)
    plt.ylabel('True data')
    plt.xlabel('Predicted data')
    plt.show()

In [208]:
def get_training_accuracy_with_kfolds_iloc(estimator, x_tr, y_tr, kf):
    score_array = np.empty(kf.n_folds)
    
    for (idx, (train_feature, test_feature)) in enumerate(kf):
        estimator.fit(x_tr.iloc[train_feature], y_tr.iloc[train_feature])
        y_pred = estimator.predict(x_tr.iloc[test_feature])
        cm = confusion_matrix(y_tr.iloc[test_feature], y_pred)
        cm_norm = cm.astype('float')/cm.sum(axis=1)
        score_array[idx] = accuracy_score(y_tr.iloc[test_feature], y_pred, normalize=True)
        
#         plot_confusion_matrix(y_tr.iloc[test_feature], y_pred)
        
    return np.mean(score_array)

In [192]:
def get_training_accuracy_mixed_models(estimators, x_tr, y_tr, kf):
    score_array = np.empty(kf.n_folds)
    
    for (idx, (train_feature, test_feature)) in enumerate(kf):
        probs_array = np.empty((len(estimators), x_tr.iloc[test_feature].shape[0], 6))

        for estimator_idx, estimator in enumerate(estimators):
            estimator.fit(x_tr.iloc[train_feature], y_tr.iloc[train_feature])
            probs_array[estimator_idx] = estimator.predict_proba(x_tr.iloc[test_feature])
        
        final_labels = labels_idx[np.argmax(np.mean(probs_array, axis=0), axis=1)]
        score_array[idx] = accuracy_score(y_tr.iloc[test_feature], final_labels, normalize=True)
    
    return np.mean(score_array)

# Best Models

In [209]:
clustered_data = get_environment_clusters(data, 40, bin_vals)

clustered_normalised_data = clustered_data[clustered_data['bin0']>0].copy().reindex()
clustered_normalised_data[bin_vals] = clustered_normalised_data[bin_vals].apply(lambda row: row/np.sum(row), axis=1)

rf = RandomForestClassifier(random_state=0, n_estimators=260, n_jobs=-1)
kf = KFold(len(clustered_normalised_data), n_folds=5, shuffle=True, random_state=0)

print("Accuracy:", get_training_accuracy_with_kfolds_iloc(rf, clustered_normalised_data[['unsupervised_environment_index', 'humidity']+bin_vals], 
                                                          clustered_normalised_data['environment_index'], kf))

Accuracy: 0.954971274944


### Removing Underground Data

In [194]:
filtered_data = data[data['environment_index']!=6]

clustered_data = get_environment_clusters(filtered_data, 40, bin_vals)

clustered_normalised_data = clustered_data[clustered_data['bin0']>0].copy().reindex()
clustered_normalised_data[bin_vals] = clustered_normalised_data[bin_vals].apply(lambda row: row/np.sum(row), axis=1)

rf = RandomForestClassifier(random_state=0, n_estimators=260, n_jobs=-1)
kf = KFold(len(clustered_normalised_data), n_folds=5, shuffle=True, random_state=0)

print("Accuracy:", get_training_accuracy_with_kfolds_iloc(rf, clustered_normalised_data[['unsupervised_environment_index', 'humidity']+bin_vals], 
                                                          clustered_normalised_data['environment_index'], kf))

Accuracy: 0.964412703691


### Neural Network

In [196]:
# environment clustering on raw bin values: 3 hidden layers
clustered_data = get_environment_clusters(data, 40, bin_vals)

clustered_normalised_data = clustered_data[clustered_data['bin0']>0].copy().reindex()
clustered_normalised_data[bin_vals] = clustered_normalised_data[bin_vals].apply(lambda row: row/np.sum(row), axis=1)

clf = MLPClassifier(solver='lbfgs', 
                    alpha=1e-5, 
                    hidden_layer_sizes=(20,), 
                    random_state=1, 
                    max_iter=500)
kf = KFold(len(clustered_normalised_data), n_folds=5, shuffle=True, random_state=0)

print("Accuracy:", get_training_accuracy_with_kfolds_iloc(clf, clustered_normalised_data[['unsupervised_environment_index']+bin_vals], 
                                                          clustered_normalised_data['environment_index'], kf))

Accuracy: 0.822659988158


### KNN

In [197]:
clustered_data = get_environment_clusters(data, 40, bin_vals)

clustered_normalised_data = clustered_data[clustered_data['bin0']>0].copy().reindex()
clustered_normalised_data[bin_vals] = clustered_normalised_data[bin_vals].apply(lambda row: row/np.sum(row), axis=1)

knn = KNeighborsClassifier(n_neighbors=5, n_jobs=-1)
kf = KFold(len(clustered_normalised_data), n_folds=5, shuffle=True, random_state=0)

print("Accuracy:", get_training_accuracy_with_kfolds_iloc(knn, clustered_normalised_data[['unsupervised_environment_index', 'humidity']+bin_vals], 
                                                          clustered_normalised_data['environment_index'], kf))

Accuracy: 0.849734513274


### Mixed Model

In [198]:
kf = KFold(len(clustered_normalised_data), n_folds=5, shuffle=True, random_state=0)

get_training_accuracy_mixed_models([knn, rf, clf], clustered_normalised_data[['unsupervised_environment_index', 'humidity']+bin_vals], 
                                                          clustered_normalised_data['environment_index'], kf)

0.9256256301108996

### Validating Models

#### Bike validation data

In [211]:
bike_data_validation = pd.read_csv('../data/misc/20150618.csv')
bike_data_validation['environment_index'] = 3

bike_data_validation = bike_data_validation.rename(index=str, columns={"latitude":'gpsLatitude', "longitude":'gpsLongitude'})
display(bike_data_validation.describe())

Unnamed: 0,time,gpsLatitude,gpsLongitude,bin0,bin1,bin2,bin3,bin4,bin5,bin6,...,opctemp,temperature,humidity,alphaVersion,urban_space,o3_ae,o3_we,no2_ae,no2_we,environment_index
count,965.0,965.0,965.0,965.0,965.0,965.0,965.0,965.0,965.0,965.0,...,0.0,965.0,965.0,965.0,965.0,965.0,965.0,965.0,965.0,965.0
mean,20150620000000.0,50.514721,-2.980726,9.733679,1.862176,1.346114,0.905699,0.656995,1.06114,0.613472,...,,20.137565,43.524566,2.0,-0.095337,-7284.846632,-5096.067358,-10171.590674,-7158.012435,3.0
std,559.2194,16.731824,0.643371,6.295997,1.64628,1.245905,1.012586,0.832101,1.103273,0.803793,...,,1.031035,1.80822,0.0,0.293832,11283.184351,11737.250969,10599.945141,11277.60443,0.0
min,20150620000000.0,-1.0,-3.203215,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,19.16,39.69568,2.0,-1.0,-30686.0,-29406.0,-32734.0,-30942.0,3.0
25%,20150620000000.0,55.940717,-3.194135,6.0,1.0,0.0,0.0,0.0,0.0,0.0,...,,19.46,41.923517,2.0,0.0,-16094.0,-15070.0,-17630.0,-16094.0,3.0
50%,20150620000000.0,55.943865,-3.186613,9.0,2.0,1.0,1.0,0.0,1.0,0.0,...,,19.76,44.26632,2.0,0.0,-2782.0,1059.0,-6878.0,-2270.0,3.0
75%,20150620000000.0,55.94524,-3.181847,13.0,3.0,2.0,1.0,1.0,2.0,1.0,...,,20.2,44.978877,2.0,0.0,2339.0,4387.0,-3038.0,2339.0,3.0
max,20150620000000.0,55.946557,-1.0,119.0,10.0,7.0,6.0,5.0,6.0,4.0,...,,23.24,46.238251,2.0,0.0,9251.0,10531.0,6947.0,8227.0,3.0


In [212]:
clustered_validation_data = get_environment_clusters(bike_data_validation, 40, bin_vals)

clustered_normalised = clustered_validation_data[clustered_validation_data['bin0']>0].copy().reindex()
clustered_normalised[bin_vals] = clustered_normalised[bin_vals].apply(lambda row: row/np.sum(row), axis=1)

predictions = rf.predict(clustered_normalised[bin_vals+['unsupervised_environment_index', 'humidity']])

print("Validation accuracy:", accuracy_score(clustered_normalised['environment_index'], predictions))

plot_confusion_matrix(clustered_normalised['environment_index'], predictions, 
                      classes=['on_foot', 'car', 'bike', 'train', 'bus', 'underground'])

Validation accuracy: 0.976817702845


#### Walking validation data