In [1]:
#Importing libraries

import numpy as np
import pandas as pd
from sklearn import svm
from sklearn.cross_validation import train_test_split

In [2]:
#Loading the labelled datasets

train = pd.read_csv('Data2/Data2_train.csv')
validation = pd.read_csv('Data2/Data2_validation.csv')
train.head()

Unnamed: 0,mean0,mean1,mean2,mean3,mean4,mean5,mean6,mean7,mean8,mean9,...,var_diff11,var_diff12,var_diff13,var_diff14,var_diff15,var_diff16,var_diff17,var_diff18,var_diff19,scenes
0,0.093323,-0.399723,0.783561,-2.520351,2.564316,-0.604616,1.020544,1.818084,-2.152912,1.866948,...,-0.980404,-0.94053,-1.060487,-0.91527,-0.933561,-1.192804,-1.120846,-1.367337,-1.184519,train-ter
1,0.076053,1.165681,0.960745,-1.856342,0.366469,1.074824,0.338721,1.890659,3.694118,2.813581,...,-0.824649,-0.92521,-0.656642,-0.723337,-0.911938,-0.838659,-0.662626,-0.954069,-0.782338,train-ter
2,0.855206,1.212459,-0.488913,0.914367,-0.056152,-0.089688,1.16052,0.783487,0.425175,-0.174028,...,-0.758511,-0.88437,-1.231002,-0.600003,-0.559877,-0.837631,-0.818048,-0.670121,-0.492715,busystreet
3,0.347927,-1.2414,-0.085663,1.107282,0.342155,-0.15128,-0.556604,0.207628,-0.366518,-0.758259,...,1.683114,1.93185,1.657565,1.608029,1.631192,1.646374,1.472824,1.409602,1.616908,restaurant
4,0.752063,-1.192575,-0.296607,-0.540139,0.113057,-1.612495,-0.691072,-0.810711,-1.709526,-0.551167,...,0.032911,0.433523,0.657787,0.347634,0.597555,0.455884,0.332098,0.407437,0.221523,market


In [3]:
#Concatenating the 2 sets to form ONE TRAINING SET
trainCombined = pd.concat([train, validation], axis=0)

# Splitting train and test to take the first 10 features only
trainSet = trainCombined.iloc[:,:10]

In [4]:
#Not required, just getting the names

train.scenes.unique()

array(['train-ter', 'busystreet', 'restaurant', 'market', 'quietstreet',
       'bus', 'tubestation', 'tube', 'supermarket', 'park', 'office',
       'openairmarket'], dtype=object)

In [5]:
# Getting the corresponding Y scenes(text)

Y_labels = trainCombined.scenes
Y_labels[:15]

0       train-ter
1       train-ter
2      busystreet
3      restaurant
4          market
5      busystreet
6          market
7       train-ter
8          market
9      restaurant
10     restaurant
11      train-ter
12     busystreet
13    quietstreet
14            bus
Name: scenes, dtype: object

In [6]:
#The function that assigns numbers to our categories

def numericLabels(x):
     return {
        ourLabels[0]: 1,
        ourLabels[1]: 2,
        ourLabels[2]: 3,
        ourLabels[3]: 4,
        ourLabels[4]: 5,
        ourLabels[5]: 5,
        'unknown': 6,
    }[x]

In [7]:
#The function that assigns numerical values to our labels
ourLabels = ['tubestation', 'quietstreet', 'busystreet', 'restaurant', 'market', 'openairmarket']

def manageLabels(labelsText, labelsNum):
    i = 0;
    while i < labelsText.size:
        if labelsText[i] not in ourLabels:
            labelsText.replace(labelsText[i],'unknown',inplace=True)
        labelsNum[i] = numericLabels(labelsText[i])
        i += 1

In [8]:
validationSize = [0.30, 0.28, 0.26, 0.24, 0.22, 0.20]
softMargin = [1.5, 2.5]

for i in validationSize:
    #splitting the dataset
    X_train, X_test, Y_train, Y_test = train_test_split(trainSet, Y_labels, test_size=i, random_state=2891)
    
    #resetting indices for the labelled sets so that they work with the pre written functions
    Y_train.reset_index(drop=True, inplace=True)
    Y_test.reset_index(drop=True, inplace=True)
    
    
    #Converting the labels to numerical values
    Y_train1 = Y_train
    Y_test1 = Y_test
    manageLabels(Y_train, Y_train1)
    manageLabels(Y_test, Y_test1)
    
    #converting type of Y to int
    Y_train1 = Y_train1.astype('int64')
    Y_test1 = Y_test1.astype('int64')
    
    #Readability shenanigans
    testing = 100 * i
    training = 100 - testing
    print '\n\n **** For %d/%d data split ratio **** : \n' %(training, testing)
    
    #Train the model (Poly with degree=3)
    for kernel in ('linear', 'poly', 'rbf'):
        clf = svm.SVC(kernel=kernel, C=2.5, degree=3)
        clf.fit(X_train, Y_train1)
        print "We successfully predict {0}% of data using {1} kernel for the training data".format(100-abs(clf.predict(X_train)-Y_train1).sum()/len(Y_train1), kernel)
    
    #Fit the model (Poly with degree=3,, C=1.5 and 2.5)
    for c in softMargin:
        print '\n With C = %.1f' %c   
        for kernel in ('linear', 'poly', 'rbf'):
            clf = svm.SVC(kernel=kernel, C=c, degree=3)
            clf.fit(X_test, Y_test1)
            correct=1.0*(clf.predict(X_test)==np.asarray(Y_test1)).sum()/len(Y_test1)
            print "We successfully predict {0}% of data using {1} kernel for the test data".format((correct)*100, kernel)



 **** For 70/30 data split ratio **** : 

We successfully predict 100% of data using linear kernel for the training data
We successfully predict 100% of data using poly kernel for the training data
We successfully predict 100% of data using rbf kernel for the training data

 With C = 1.5
We successfully predict 77.358490566% of data using linear kernel for the test data
We successfully predict 78.3018867925% of data using poly kernel for the test data
We successfully predict 85.8490566038% of data using rbf kernel for the test data

 With C = 2.5
We successfully predict 78.3018867925% of data using linear kernel for the test data
We successfully predict 82.0754716981% of data using poly kernel for the test data
We successfully predict 90.8805031447% of data using rbf kernel for the test data


 **** For 72/28 data split ratio **** : 

We successfully predict 100% of data using linear kernel for the training data
We successfully predict 100% of data using poly kernel for the training 

### Since SVM performed best on the Generalizability data, the final test set is used on it.

In [9]:
test = pd.read_csv('Data2/Data2_test.csv')

In [10]:
testSet = test.iloc[:,:10]

In [11]:
test.scenes.unique()

array(['bus', 'busystreet', 'train-ter', 'quietstreet', 'restaurant',
       'tube', 'market', 'tubestation', 'office', 'park', 'supermarket',
       'openairmarket'], dtype=object)

In [12]:
Y_labels = test.scenes
Y_labels[:15]

0             bus
1      busystreet
2       train-ter
3      busystreet
4     quietstreet
5      restaurant
6            tube
7          market
8     tubestation
9             bus
10     restaurant
11         office
12            bus
13     busystreet
14     busystreet
Name: scenes, dtype: object

In [13]:
Y_labels.reset_index(drop=True, inplace=True)

#Converting the labels to numerical values
Y_test = Y_labels
manageLabels(Y_test, Y_labels)

#converting type of Y to int
Y_test = Y_test.astype('int64')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [14]:
clf = svm.SVC(kernel='poly', C=2.5, degree=3)
clf.fit(X_train, Y_train1)
correct=1.0*(clf.predict(testSet)==np.asarray(Y_test)).sum()/len(Y_test)
print "We successfully predict {0}% of data using {1} kernel for the test data".format((correct)*100, 'poly')

We successfully predict 71.2121212121% of data using poly kernel for the test data
