This notebook carries out Naive Bayes classfication on the dataset where the model is trained and validated on data from Paris but tested on data from London.

In [1]:
#Importing libraries

import numpy as np
import pandas as pd
from sklearn import svm
from sklearn.cross_validation import train_test_split

In [2]:
#Loading the labelled datasets

train = pd.read_csv('Data1/Data1_train.csv')
validation = pd.read_csv('Data1/Data1_validation.csv')
train.head()

Unnamed: 0,mean0,mean1,mean2,mean3,mean4,mean5,mean6,mean7,mean8,mean9,...,var_diff11,var_diff12,var_diff13,var_diff14,var_diff15,var_diff16,var_diff17,var_diff18,var_diff19,scenes
0,0.575796,0.717401,0.539698,-1.06237,0.191049,-1.723536,-1.194319,0.053656,-0.20937,-0.200844,...,-0.655437,-0.839057,-0.300258,-0.830228,-0.783522,-0.736217,-0.683096,-0.866754,-0.571898,tubestation
1,-0.626746,1.382491,0.447212,-1.766357,0.009479,2.536803,0.380988,1.120369,3.892846,4.094601,...,-0.906597,-0.561817,-0.990822,-0.910512,-0.77032,-1.019891,-1.231433,-0.857304,-0.888658,train-ter
2,-0.484043,0.597128,1.028187,-1.412017,1.534679,0.511047,1.286969,0.967864,-0.259907,0.790211,...,1.939481,2.26083,2.335424,2.172651,2.198847,2.151799,1.910007,1.362777,1.451639,bus
3,0.308431,-0.005826,-0.932353,0.143836,-0.61964,0.029328,-0.851747,-0.587147,0.131899,-0.957469,...,0.320351,0.050667,-0.057665,0.239617,0.260419,0.66899,0.290236,0.469636,0.520772,market
4,-1.846697,-1.260525,1.77926,0.112545,0.020945,0.628017,0.52307,1.356682,1.079687,0.648798,...,3.062474,0.293887,3.250679,-0.14649,2.59112,-0.208007,0.856651,0.327243,-0.24614,train-ter


In [3]:
#Concatenating the 2 sets to form ONE TRAINING SET
trainCombined = pd.concat([train, validation], axis=0)

# Splitting train and test to take the first 10 features only
trainSet = trainCombined.iloc[:,:10]

In [4]:
#Not required, just getting the names

train.scenes.unique()

array(['tubestation', 'train-ter', 'bus', 'market', 'restaurant',
       'busystreet', 'quietstreet'], dtype=object)

In [5]:
# Getting the corresponding Y scenes(text)

Y_labels = trainCombined.scenes
Y_labels[:15]

0     tubestation
1       train-ter
2             bus
3          market
4       train-ter
5     tubestation
6       train-ter
7             bus
8      restaurant
9      busystreet
10     busystreet
11     busystreet
12    tubestation
13     busystreet
14    tubestation
Name: scenes, dtype: object

In [6]:
#The function that assigns numbers to our categories

def numericLabels(x):
     return {
        ourLabels[0]: 1,
        ourLabels[1]: 2,
        ourLabels[2]: 3,
        ourLabels[3]: 4,
        ourLabels[4]: 5,
        'unknown': 6,
    }[x]

In [7]:
#The function that assigns numerical values to our labels
ourLabels = ['tubestation', 'quietstreet', 'busystreet', 'restaurant', 'market']

def manageLabels(labelsText, labelsNum):
    i = 0;
    while i < labelsText.size:
        if labelsText[i] not in ourLabels:
            labelsText.replace(labelsText[i],'unknown',inplace=True)
        labelsNum[i] = numericLabels(labelsText[i])
        i += 1


In [8]:
validationSize = [0.30, 0.28, 0.26, 0.24, 0.22, 0.20]
softMargin = [1.5, 2.5]

for i in validationSize:
    #splitting the dataset
    X_train, X_test, Y_train, Y_test = train_test_split(trainSet, Y_labels, test_size=i, random_state=2891)
    
    #resetting indices for the labelled sets so that they work with the pre written functions
    Y_train.reset_index(drop=True, inplace=True)
    Y_test.reset_index(drop=True, inplace=True)
    
    
    #Converting the labels to numerical values
    Y_train1 = Y_train
    Y_test1 = Y_test
    manageLabels(Y_train, Y_train1)
    manageLabels(Y_test, Y_test1)
    
    #converting type of Y to int
    Y_train1 = Y_train1.astype('int64')
    Y_test1 = Y_test1.astype('int64')
    
    #Readability shenanigans
    testing = 100 * i
    training = 100 - testing
    print '\n\n **** For %d/%d data split ratio **** : \n' %(training, testing)
    
    #Train the model (Poly with degree=3)
    for kernel in ('linear', 'poly', 'rbf'):
        clf = svm.SVC(kernel=kernel, C=2.5, degree=3)
        clf.fit(X_train, Y_train1)
        print "We successfully predict {0}% of data using {1} kernel for the training data".format(100-abs(clf.predict(X_train)-Y_train1).sum()/len(Y_train1), kernel)
    
    #Fit the model (Poly with degree=3, C=1.5 and 2.5)
    for c in softMargin:
        print '\n With C = %.1f' %c   
        for kernel in ('linear', 'poly', 'rbf'):
            clf = svm.SVC(kernel=kernel, C=c, degree=3)
            clf.fit(X_test, Y_test1)
            correct=1.0*(clf.predict(X_test)==np.asarray(Y_test1)).sum()/len(Y_test1)
            print "We successfully predict {0}% of data using {1} kernel for the test data".format((correct)*100, kernel)



 **** For 70/30 data split ratio **** : 

We successfully predict 100% of data using linear kernel for the training data
We successfully predict 100% of data using poly kernel for the training data
We successfully predict 100% of data using rbf kernel for the training data

 With C = 1.5
We successfully predict 84.4497607656% of data using linear kernel for the test data
We successfully predict 85.1674641148% of data using poly kernel for the test data
We successfully predict 91.1483253589% of data using rbf kernel for the test data

 With C = 2.5
We successfully predict 84.9282296651% of data using linear kernel for the test data
We successfully predict 88.5167464115% of data using poly kernel for the test data
We successfully predict 92.8229665072% of data using rbf kernel for the test data


 **** For 72/28 data split ratio **** : 

We successfully predict 100% of data using linear kernel for the training data
We successfully predict 100% of data using poly kernel for the training

### Since SVM performed best on the SOC data, the final test set is used on it.

In [9]:
test = pd.read_csv('Data1/Data1_test.csv')

In [10]:
testSet = test.iloc[:,:10]

In [11]:
test.scenes.unique()

array(['bus', 'busystreet', 'office', 'openairmarket', 'park',
       'quietstreet', 'restaurant', 'supermarket', 'tube', 'tubestation'], dtype=object)

In [12]:
Y_labels = test.scenes
Y_labels[:15]

0            bus
1            bus
2            bus
3            bus
4            bus
5            bus
6            bus
7            bus
8            bus
9            bus
10    busystreet
11    busystreet
12    busystreet
13    busystreet
14    busystreet
Name: scenes, dtype: object

In [13]:
Y_labels.reset_index(drop=True, inplace=True)

#Converting the labels to numerical values
Y_test = Y_labels
manageLabels(Y_test, Y_labels)

#converting type of Y to int
Y_test = Y_test.astype('int64')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [16]:
clf = svm.SVC(kernel='poly', C=2.5, degree=3)
clf.fit(X_train, Y_train1)
correct=1.0*(clf.predict(testSet)==np.asarray(Y_test)).sum()/len(Y_test)
print "We successfully predict {0}% of data using {1} kernel for the test data".format((correct)*100, 'poly')

We successfully predict 37.5% of data using poly kernel for the test data
