#### What is Support Vector Machine?
Support Vector Machine's objective is to find a hyperplane in an N-dimensional space (N features) that distinctly classifies the data points. The margin (distance between the data points and the hyperplane) should be high in order to avoid misclassification errors.

In [16]:
import numpy as np
import pandas as pd
from sklearn import preprocessing, svm
from sklearn.model_selection import train_test_split

In [17]:
#import the breast cancer dataset
df = pd.read_csv('breast-cancer-wisconsin.csv')
df.head()

Unnamed: 0,id,clump_thickness,unif_cell_size,unif_cell_shape,marg_adhesion,single_epith_cell_size,bare_nuclei,bland_chrom,norm_nucleoli,mitoses,class
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,4,3,7,1,2
4,1017023,4,1,1,3,2,1,3,1,1,2


In [18]:
#replace the missing values with -99999 so they are treated as outliers
df.replace('?', -99999, inplace = True)
#drop the id column since it doesnt impact the data in any way
df.drop(['id'], 1, inplace = True)
#remove unnecessary space in the column names
df.columns = df.columns.str.replace(' ', '')

In [19]:
X = np.array(df.drop(['class'], 1))
y = np.array(df['class'])

In [20]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

In [21]:
clf = svm.SVC()
clf.fit(X_train, y_train)



SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [27]:
accuracy = clf.score(X_test, y_test)
accuracy

0.9357142857142857

In [28]:
example_measures = np.array([[4, 2, 1, 1, 1, 2, 3, 2, 1], [4, 2, 1, 1, 1, 2, 3, 2, 1]])
example_measures = example_measures.reshape(len(example_measures), -1)

In [29]:
prediction = clf.predict(example_measures)
prediction

array([2, 2])