from sklearn import svm

## Estimators
svm.LinearSVC([penalty, loss, dual, tol, C, …])	Linear Support Vector Classification.

https://scikit-learn.org/stable/modules/generated/sklearn.svm.LinearSVC.html#sklearn.svm.LinearSVC

svm.LinearSVR([epsilon, tol, C, loss, …])	Linear Support Vector Regression.

https://scikit-learn.org/stable/modules/generated/sklearn.svm.LinearSVR.html#sklearn.svm.LinearSVR

svm.NuSVC([nu, kernel, degree, gamma, …])	Nu-Support Vector Classification.

https://scikit-learn.org/stable/modules/generated/sklearn.svm.NuSVC.html#sklearn.svm.NuSVC

svm.NuSVR([nu, C, kernel, degree, gamma, …])	Nu Support Vector Regression.

https://scikit-learn.org/stable/modules/generated/sklearn.svm.NuSVR.html#sklearn.svm.NuSVR

svm.OneClassSVM([kernel, degree, gamma, …])	Unsupervised Outlier Detection.

https://scikit-learn.org/stable/modules/generated/sklearn.svm.OneClassSVM.html#sklearn.svm.OneClassSVM

svm.SVC([C, kernel, degree, gamma, coef0, …])	C-Support Vector Classification.

https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html#sklearn.svm.SVC

svm.SVR([kernel, degree, gamma, coef0, tol, …])	Epsilon-Support Vector Regression.

https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVR.html#sklearn.svm.SVR

svm.l1_min_c(X, y[, loss, fit_intercept, …])	Return the lowest bound for C such that for C in (l1_min_C, infinity) the model is guaranteed not to be empty.

https://scikit-learn.org/stable/modules/generated/sklearn.svm.l1_min_c.html#sklearn.svm.l1_min_c

## Building SVM Using Scikit-learn

#### Generate two data sets in scikit-learn

from sklearn.datasets import make_blobs

from sklearn.datasets import make_moons

import matplotlib.pyplot as plt

%matplotlib inline  

import numpy as np

# Generate both two-class datasets up front; each call carries its own fixed
# seed, so the draws do not depend on the order of the calls.
X_1, y_1 = make_blobs(n_features=2, centers=2, cluster_std=1.25, random_state=123)
X_2, y_2 = make_blobs(n_samples=100, n_features=2, centers=2, cluster_std=3, random_state=123)

# Show each dataset in its own cell of a 2x2 subplot layout.
plt.figure(figsize=(10, 10))

plt.subplot(2, 2, 1)
plt.title("Two blobs")
plt.scatter(X_1[:, 0], X_1[:, 1], c=y_1, s=25)

plt.subplot(2, 2, 2)
plt.title("Two blobs with more noise")
plt.scatter(X_2[:, 0], X_2[:, 1], c=y_2, s=25)

# NOTE(review): a third axes is opened here, but nothing in this section draws on it.
plt.subplot(2, 2, 3)


#### SVM to first dataset

from sklearn import svm

# Build a linear-kernel support vector classifier and fit it to the first
# dataset in one step (fit() hands back the fitted estimator).
clf = svm.SVC(kernel='linear').fit(X_1, y_1)

Save the first feature (on the horizontal axis) as X_11 and the second feature (on the vertical axis) as X_12.

# Split the first dataset into its two feature columns.
X_11 = X_1[:, 0]
X_12 = X_1[:, 1]

# Bare expression: looks up the fitted weight vector (shown as cell output in a notebook).
clf.coef_


#### Store the min and maximum values X_11 and X_12 operate in.

# Each range is widened by one unit on both sides.
X11_min = X_11.min() - 1
X11_max = X_11.max() + 1

X12_min = X_12.min() - 1
X12_max = X_12.max() + 1

#### Create a grid using the numpy function linspace, which creates a numpy array with evenly spaced numbers over a specified interval. Specify num = 10

x11_coord = np.linspace(X11_min, X11_max, num=10)
x12_coord = np.linspace(X12_min, X12_max, num=10)

#### Create decision boundary using np.meshgrid with the two arguments equal to the np.linspace objects created for X11 and X12.

# Matrix ('ij') indexing: X11 varies along rows, X12 along columns.
X11_C, X12_C = np.meshgrid(x11_coord, x12_coord, indexing='ij')

#### Create a numpy array (100,2) that concatenates the coordinates for X11 and X12 together in one numpy object. Use np.c_ and make sure to use .ravel() first. Use np.shape() on your resulting object first to verify the resulting shape.

flat_x11 = X11_C.ravel()
flat_x12 = X12_C.ravel()
x11x12 = np.c_[flat_x11, flat_x12]

np.shape(x11x12)

#### Get a decision boundary for this particular data set. Using your (100,2) numpy array and calling clf.decision_function() on it, the decision function returns the distance to the samples that you generated using meshgrid. Make sure you change your shape in a way that you get a (10,10) numpy array.

# Evaluate the classifier at every grid node and fold the result back onto the mesh.
df1 = clf.decision_function(x11x12).reshape(X12_C.shape)

#### Plot our data again with the result of svm in it.

# Contour settings shared by both figures: decision-function levels -1, 0 and 1,
# drawn dotted / solid / dotted.
margin_levels = [-1, 0, 1]
margin_styles = [':', '-', ':']

plt.scatter(X_11, X_12, c=y_1)
###### what comes next uses same axes as scatterplot
axes = plt.gca()

##### three lines for hyperplane and boundaries
axes.contour(
    X11_C,
    X12_C,
    df1,
    levels=margin_levels,
    colors=["blue", "black", "blue"],
    linestyles=margin_styles,
)

plt.show()

###### support vector coordinates
clf.support_vectors_

#### Highlight your support vectors in new plot

plt.scatter(X_11, X_12, c=y_1)

axes = plt.gca()

axes.contour(X11_C, X12_C, df1, levels=margin_levels, colors="black", linestyles=margin_styles)

# Transposing yields one row per feature, i.e. the x and y coordinates of the support vectors.
sv_x, sv_y = clf.support_vectors_.T
axes.scatter(sv_x, sv_y, facecolors='blue')

plt.show()

#### SVM when data not linearly separable (dataset 2) using SVC function

# Raw view of the noisier dataset.
plt.scatter(X_2[:, 0], X_2[:, 1], c=y_2, s=25)


# Linear-kernel classifier fitted to the second dataset.
clf = svm.SVC(kernel='linear').fit(X_2, y_2)

# Feature columns and their value ranges, widened by one unit on each side.
X_21 = X_2[:, 0]
X_22 = X_2[:, 1]

X21_min = X_21.min() - 1
X21_max = X_21.max() + 1

X22_min = X_22.min() - 1
X22_max = X_22.max() + 1

# 10 x 10 evaluation grid; matrix ('ij') indexing puts X21 along rows.
x21_coord = np.linspace(X21_min, X21_max, num=10)
x22_coord = np.linspace(X22_min, X22_max, num=10)

X21_C, X22_C = np.meshgrid(x21_coord, x22_coord, indexing='ij')

x21x22 = np.c_[X21_C.ravel(), X22_C.ravel()]

# Decision-function value at every grid node, shaped like the mesh.
df2 = clf.decision_function(x21x22).reshape(X21_C.shape)

plt.scatter(X_21, X_22, c=y_2)

axes = plt.gca()

# Contours at decision-function values -1, 0 and 1.
axes.contour(
    X21_C,
    X22_C,
    df2,
    levels=[-1, 0, 1],
    colors=["blue", "black", "blue"],
    linestyles=[':', '-', ':'],
)

plt.show()

#### Changing hyperparameter to adjust for misclassification

###### C-value hyperparameter increased to decrease boundary width

# Refit the linear classifier on the second dataset with a very large C.
clf = svm.SVC(kernel='linear', C=5000000).fit(X_2, y_2)

###### code same as above to compile and plot new visual with adjusted hyperparameter

#### Other options in Scikit Learn
When you dig deeper into scikit-learn, you'll notice that there are several ways to get a linear SVM for classification: `svm.SVC(kernel='linear')`, `svm.NuSVC(kernel='linear')` and `svm.LinearSVC()`.

#### What does One-vs-one mean? what does One-vs-all mean?

One-vs-one means that with $n$ classes, $\dfrac{n(n-1)}{2}$ boundaries are constructed, one for each pair of classes.

One-vs-all means that when there are $n$ classes, $n$ boundaries are created.

#### Classifying four classes

# Four-class blob dataset with a fixed seed.
X, y = make_blobs(n_samples=100, n_features=2, centers=4, cluster_std=1.6, random_state=123)

plt.figure(figsize=(7, 6))
plt.title("Four blobs")

# Transposing X gives one row per feature: the x and y coordinates.
blob_x, blob_y = X.T
plt.scatter(blob_x, blob_y, c=y, s=25);

#### Try four different models and plot the results using subplots where:

- The first one is a regular SVC (C=1)
- The second one is a regular SVC with C=0.1
- The third one is a NuSVC with nu= 0.7
- The fourth one is a LinearSVC (no arguments)

# Feature columns of the four-blob dataset.
X1 = X[:, 0]
X2 = X[:, 1]

# Plotting window: data range widened by one unit on every side.
X1_min = X1.min() - 1
X1_max = X1.max() + 1

X2_min = X2.min() - 1
X2_max = X2.max() + 1

# 200 x 200 grid; matrix ('ij') indexing puts X1 along rows.
x1_coord = np.linspace(X1_min, X1_max, num=200)
x2_coord = np.linspace(X2_min, X2_max, num=200)

X1_C, X2_C = np.meshgrid(x1_coord, x2_coord, indexing='ij')

x1x2 = np.c_[X1_C.ravel(), X2_C.ravel()]

# Each model is fitted on the full dataset, then asked for a class label at
# every grid node; the labels are folded back onto the mesh shape.

# SVC, linear kernel, C=1
clf1 = svm.SVC(kernel="linear", C=1).fit(X, y)
Z1 = clf1.predict(x1x2).reshape(X1_C.shape)

# SVC, linear kernel, C=0.1
clf2 = svm.SVC(kernel="linear", C=0.1).fit(X, y)
Z2 = clf2.predict(x1x2).reshape(X1_C.shape)

# NuSVC, linear kernel, nu=0.7
clf3 = svm.NuSVC(kernel="linear", nu=0.7).fit(X, y)
Z3 = clf3.predict(x1x2).reshape(X1_C.shape)

# LinearSVC with default arguments
clf4 = svm.LinearSVC().fit(X, y)
Z4 = clf4.predict(x1x2).reshape(X1_C.shape)

#### Plotting Figures 
# One panel per fitted model: filled prediction regions, the training points
# and, for the first three models, their support vectors.
# Each entry: (subplot position, panel title, grid predictions, model whose
# support vectors are drawn or None).
panels = [
    (221, "SVC, C=1", Z1, clf1),
    (222, "SVC, C=0.1", Z2, clf2),
    # clf3 is fitted with nu=0.7, so the title states 0.7.
    (223, "NuSVC, nu=0.7", Z3, clf3),
    # No support vectors are drawn on the LinearSVC panel.
    (224, "LinearSVC", Z4, None),
]

plt.figure(figsize=(12, 12))

for position, panel_title, regions, sv_model in panels:
    plt.subplot(position)
    plt.title(panel_title)

    axes = plt.gca()
    axes.contourf(X1_C, X2_C, regions, alpha=1)
    plt.scatter(X1, X2, c=y, edgecolors='k')

    if sv_model is not None:
        axes.scatter(
            sv_model.support_vectors_[:, 0],
            sv_model.support_vectors_[:, 1],
            facecolors='blue',
            edgecolors='k',
        )

plt.show()

#### Coefficients of the decision boundaries

# Print the coefficient arrays of the C=0.1 SVC and the LinearSVC, one after the other.
print(clf2.coef_, clf4.coef_, sep="\n")




#### Max Margin Classifier (Raw) (for clearly defined boundaries)

###### label classes from dataset

# The hard-margin problem needs a two-class dataset. `labels` is never defined
# in this file and `X` holds the four-class blobs at this point, so the classes
# are taken from the two-blob dataset (X_1, y_1) generated at the top.
class_1 = X_1[y_1 == 0]

class_2 = X_1[y_1 == 1]

import cvxpy as cp

# Problem sizes are read off the data instead of being hard-coded.
d = class_1.shape[1]  # number of features

m = len(class_1)  # points in the first class

n = len(class_2)  # points in the second class

###### Define the variables
w = cp.Variable(d)

b = cp.Variable()

###### Define the constraints
# One vectorised constraint per class: every class_1 point must satisfy
# w.x + b >= 1 and every class_2 point w.x + b <= -1. `@` is the matrix
# product; `*` is deprecated for this purpose in cvxpy.
x_constraints = [class_1 @ w + b >= 1]
y_constraints = [class_2 @ w + b <= -1]

###### Sum the constraints
constraints = x_constraints + y_constraints

###### Define the objective. Hint: use cp.norm
obj = cp.Minimize(cp.norm(w, 2))

###### Add objective and constraint in the problem
prob = cp.Problem(obj, constraints)

###### Solve the problem
prob.solve()
print("Problem Status: %s"%prob.status)

###### if problem status says "infeasible", use soft margin classifier

#### Plotting Boundaries

def plotBoundaries(x, y, w, b):
    """Scatter two clusters and draw a linear boundary with its two margin lines.

    The boundary is the line ``w[0]*d1 + w[1]*d2 + b = 0``; the margins are the
    parallel lines on which that expression equals ``+1`` and ``-1``. Every line
    is drawn between the smallest and the largest first-feature value found in
    either cluster, and the y-axis is limited to the floor/ceiling of the
    second-feature range of the points.

    Parameters
    ----------
    x, y : 2-D numpy arrays, indexed as ``[:, 0]`` and ``[:, 1]``
        Points of the first (drawn purple) and second (drawn yellow) cluster.
        The clusters may contain different numbers of points.
    w : indexable with at least two numeric entries
        Line coefficients. ``w[1]`` is used as a divisor, so it must be non-zero.
    b : scalar
        Line offset.

    Returns
    -------
    None
        Everything is drawn through ``plt`` on the current figure.
    """
    # Pool each coordinate over both clusters before taking the extrema.
    # Stacking the two columns in a list (np.min([a, b])) only works when both
    # clusters have the same number of points.
    d1_all = np.concatenate([x[:, 0], y[:, 0]])
    d2_all = np.concatenate([x[:, 1], y[:, 1]])
    d1_min = d1_all.min()
    d1_max = d1_all.max()

    def d2_on_line(d1, level):
        # Solve w[0]*d1 + w[1]*d2 + b = level for d2.
        return (-w[0] * d1 - b + level) / w[1]

    d1_ends = [d1_min, d1_max]

    ###### Plot the clusters!
    plt.scatter(x[:, 0], x[:, 1], color='purple')
    plt.scatter(y[:, 0], y[:, 1], color='yellow')

    # Decision boundary (level 0), then the +1 and -1 margin lines.
    plt.plot(d1_ends, [d2_on_line(d1_min, 0), d2_on_line(d1_max, 0)], color='black')
    plt.plot(d1_ends, [d2_on_line(d1_min, 1), d2_on_line(d1_max, 1)], '-.', color='blue')
    plt.plot(d1_ends, [d2_on_line(d1_min, -1), d2_on_line(d1_max, -1)], '-.', color='blue')

    # Clip the view to the data so steep lines do not stretch the y-axis.
    plt.ylim([np.floor(d2_all.min()), np.ceil(d2_all.max())])

#### Soft Margin Classifier (Raw)

import cvxpy as cp

# Problem sizes are read off the class arrays instead of being hard-coded.
d = class_1.shape[1]  # number of features

m = len(class_1)  # points in the first class

n = len(class_2)  # points in the second class

###### Define the variables
w = cp.Variable(d)

b = cp.Variable()

# One slack variable per point; a positive slack lets that point violate the margin.
ksi_1 = cp.Variable(m)

ksi_2 = cp.Variable(n)

# Weight of the total slack in the objective.
C=0.01

###### Define the constraints
# Vectorised margin constraints, relaxed by each point's slack. `@` is the
# matrix product; `*` is deprecated for this purpose in cvxpy.
x_constraints = [class_1 @ w + b >= 1 - ksi_1]

y_constraints = [class_2 @ w + b <= -1 + ksi_2]

# A single element-wise constraint already covers the whole slack vector;
# repeating it once per point adds nothing.
ksi_1_constraints = [ksi_1 >= 0]

ksi_2_constraints = [ksi_2 >= 0]

###### Sum the constraints
constraints = x_constraints +  y_constraints + ksi_1_constraints + ksi_2_constraints

###### Define the objective. Hint: use cp.norm. Add in a C hyperparameter and assume 1 at first
# cp.sum builds one sum expression instead of a Python-level chain of additions.
obj = cp.Minimize(cp.norm(w, 2) + C * (cp.sum(ksi_1) + cp.sum(ksi_2)))

###### Add objective and constraint in the problem
prob = cp.Problem(obj, constraints)

###### Solve the problem
prob.solve()

print("Problem Status: %s"%prob.status)

## SVM with Kernels

#### Create two non-linear datasets

from sklearn.datasets import make_blobs

from sklearn.datasets import make_moons

import matplotlib.pyplot as plt

%matplotlib inline  

from sklearn import svm

from sklearn.model_selection import train_test_split

import numpy as np

# Build both datasets first; each call is seeded independently.
X_3, y_3 = make_blobs(n_samples=100, n_features=2, centers=4, cluster_std=1.6, random_state=123)
X_4, y_4 = make_moons(n_samples=100, shuffle=False, noise=0.3, random_state=123)

# Side-by-side scatter plots of the two datasets.
plt.figure(figsize=(10, 4))

plt.subplot(1, 2, 1)
plt.title("Four blobs")
plt.scatter(X_3[:, 0], X_3[:, 1], c=y_3, s=25)

plt.subplot(1, 2, 2)
plt.title("Two interleaving half circles")
plt.scatter(X_4[:, 0], X_4[:, 1], c=y_4, s=25)

plt.show()

#### RBF Kernel (Radial Basis Function)

##### Radial basis function kernel has 2 hyperparameters: C and gamma.

###### Create a loop that builds a model for each of the combinations
# Hyperparameter values to combine.
C_range = np.array([0.1, 1, 10])  # [0.01, 10]

gamma_range = np.array([0.1, 1, 100])  # [1, 100]

param_grid = {"gamma": gamma_range, "C": C_range}

# One (C, gamma, fitted model) triple per combination, C varying slowest.
details = []

for C in C_range:
    for gamma in gamma_range:
        # No kernel argument is passed, so SVC uses its default kernel.
        clf = svm.SVC(C=C, gamma=gamma).fit(X_4, y_4)
        details.append((C, gamma, clf))

#### Prepare your data for plotting
# Feature columns of the half-moon dataset.
X1 = X_4[:, 0]
X2 = X_4[:, 1]

# Plotting window: data range widened by one unit on every side.
X1_min = X1.min() - 1
X1_max = X1.max() + 1

X2_min = X2.min() - 1
X2_max = X2.max() + 1

# 500 x 500 grid; matrix ('ij') indexing puts X1 along rows.
x1_coord = np.linspace(X1_min, X1_max, num=500)
x2_coord = np.linspace(X2_min, X2_max, num=500)

X1_C, X2_C = np.meshgrid(x1_coord, x2_coord, indexing='ij')

x1x2 = np.c_[X1_C.ravel(), X2_C.ravel()]

#### Plot the prediction results in 9 subplots
plt.figure(figsize=(11, 11))

for k, (C, gamma, clf) in enumerate(details):
    # Predicted class at every grid node, shaped like the mesh.
    Z = clf.predict(x1x2).reshape(X1_C.shape)

    # Score on the same data the model was fitted on, rounded for the title.
    accuracy = round(clf.score(X_4, y_4), 2)

    # Subplot indices are 1-based.
    plt.subplot(3, 3, k + 1)
    plt.title("gam= %r, C= %r, score = %r"  % (gamma, C, accuracy))

    # Filled prediction regions with the data points on top.
    plt.contourf(X1_C, X2_C, Z, alpha=1)
    plt.scatter(X_4[:, 0], X_4[:, 1], c=y_4, edgecolors='gray')
    plt.axis('tight')

#### Repeat but use decision_function instead of predict

#### Plot the prediction results in 9 subplots  
plt.figure(figsize=(12, 12))

for k, (C, gamma, clf) in enumerate(details):
    # Decision-function value at every grid node, shaped like the mesh.
    Z = clf.decision_function(x1x2).reshape(X1_C.shape)

    # Score on the same data the model was fitted on, rounded for the title.
    accuracy = round(clf.score(X_4, y_4), 2)

    # Subplot indices are 1-based.
    plt.subplot(3, 3, k + 1)
    plt.title("gam= %r, C= %r, score = %r"  % (gamma, C, accuracy))

    # Filled contours of the decision function with the data points on top.
    plt.contourf(X1_C, X2_C, Z, alpha=1)
    plt.scatter(X_4[:, 0], X_4[:, 1], c=y_4, edgecolors='gray')
    plt.axis('tight')

#### Polynomial kernel
##### Polynomial kernel has 3 hyperparameters:

$\gamma$, which can be specified using keyword gamma

$r$, which can be specified using keyword coef0

$d$, which can be specified using keyword degree

$r= 0.1$ and $2$
$\gamma= 0.1$ and $1$
$d= 3$ and $4$


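##### Note that for a classifier with more than two classes, decision_function() returns several scores per sample (one per class, or one per pair of classes) instead of a single value, so its output cannot be reshaped onto the grid for a contour plot. Use predict() instead.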

#### Create a loop that builds a model for each combination

# Hyperparameter values to combine.
r_range = np.array([0.1, 2])  # [0.01, 10]

gamma_range = np.array([0.1, 1])  # [1, 100]

d_range = np.array([3, 4])

param_grid = {"gamma": gamma_range, "degree": d_range, "coef0": r_range}

# One (r, d, gamma, fitted model) tuple per combination; degree varies slowest, r fastest.
details = []

for d in d_range:
    for gamma in gamma_range:
        for r in r_range:
            clf = svm.SVC(kernel="poly", degree=d, gamma=gamma, coef0=r).fit(X_3, y_3)
            details.append((r, d, gamma, clf))

#### Prepare your data for plotting
# Feature columns of the four-blob dataset.
X1 = X_3[:, 0]
X2 = X_3[:, 1]

# Plotting window: data range widened by one unit on every side.
X1_min = X1.min() - 1
X1_max = X1.max() + 1

X2_min = X2.min() - 1
X2_max = X2.max() + 1

# 500 x 500 grid; matrix ('ij') indexing puts X1 along rows.
x1_coord = np.linspace(X1_min, X1_max, num=500)
x2_coord = np.linspace(X2_min, X2_max, num=500)

X1_C, X2_C = np.meshgrid(x1_coord, x2_coord, indexing='ij')

x1x2 = np.c_[X1_C.ravel(), X2_C.ravel()]

#### Plot the prediction results

plt.figure(figsize=(12, 14))

for k, (r, d, gamma, clf) in enumerate(details):
    # Predicted class at every grid node, shaped like the mesh.
    Z = clf.predict(x1x2).reshape(X1_C.shape)

    # Score on the same data the model was fitted on, rounded for the title.
    accuracy = round(clf.score(X_3, y_3), 2)

    # 4 x 2 layout; subplot indices are 1-based.
    plt.subplot(4, 2, k + 1)
    plt.title("d= %r, gam= %r, r = %r , score = %r"  % (d, gamma, r, accuracy))

    # Filled prediction regions with the data points on top.
    plt.contourf(X1_C, X2_C, Z, alpha=1)
    plt.scatter(X_3[:, 0], X_3[:, 1], c=y_3, edgecolors='gray')
    plt.axis('tight')

#### The Sigmoid Kernel

##### Sigmoid kernel has 2 hyperparameters:

$\gamma$, which can be specified using keyword gamma

$r$, which can be specified using keyword coef0

#### Create a loop that builds a model for each combination
# Hyperparameter values to combine.
r_range = np.array([0.01, 1, 10])

gamma_range = np.array([0.001, 0.01, 0.1])

param_grid = {"gamma": gamma_range, "coef0": r_range}

# One (r, gamma, fitted model) triple per combination; gamma varies slowest.
details = []

for gamma in gamma_range:
    for r in r_range:
        clf = svm.SVC(kernel="sigmoid", gamma=gamma, coef0=r).fit(X_3, y_3)
        details.append((r, gamma, clf))

#### Prepare your data for plotting
# Feature columns of the four-blob dataset.
X1 = X_3[:, 0]
X2 = X_3[:, 1]

# Plotting window: data range widened by one unit on every side.
X1_min = X1.min() - 1
X1_max = X1.max() + 1

X2_min = X2.min() - 1
X2_max = X2.max() + 1

# 500 x 500 grid; matrix ('ij') indexing puts X1 along rows.
x1_coord = np.linspace(X1_min, X1_max, num=500)
x2_coord = np.linspace(X2_min, X2_max, num=500)

X1_C, X2_C = np.meshgrid(x1_coord, x2_coord, indexing='ij')

x1x2 = np.c_[X1_C.ravel(), X2_C.ravel()]

# Plot the prediction results
plt.figure(figsize=(12, 14))

for k, (r, gamma, clf) in enumerate(details):
    # Predicted class at every grid node, shaped like the mesh.
    Z = clf.predict(x1x2).reshape(X1_C.shape)

    # Score on the same data the model was fitted on, rounded for the title.
    accuracy = round(clf.score(X_3, y_3), 2)

    # 3 x 3 layout; subplot indices are 1-based.
    plt.subplot(3, 3, k + 1)
    plt.title(" gam= %r, r = %r , score = %r"  % (gamma, r, accuracy))

    # Filled prediction regions with the data points on top.
    plt.contourf(X1_C, X2_C, Z, alpha=1)
    plt.scatter(X_3[:, 0], X_3[:, 1], c=y_3, edgecolors='gray')
    plt.axis('tight')

##### Note: The polynomial kernel is very sensitive to the hyperparameter settings. Especially setting a "wrong" gamma can have a dramatic effect on the model performance


#### Real world data set with higher dimensions

##### Note: cannot visually represent data with more than 3 dimensions

import statsmodels as sm

import sklearn.preprocessing as preprocessing

from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import train_test_split

import pandas as pd

# Load the salaries table; the first CSV column becomes the index.
salaries = pd.read_csv("salaries_final.csv", index_col=0)

salaries.head()


# Dummy-encode the target, dropping the first level.
target = pd.get_dummies(salaries["Target"], drop_first=True)

# Every column except the last one is used as a predictor.
xcols = salaries.columns[:-1]

predictors = salaries[xcols]
data = pd.get_dummies(predictors, drop_first=True)

#### Build Linear SVC

# NOTE(review): the heading says "Linear", but SVC is built below without a
# kernel argument. Confirm which was intended.
data_train, data_test, target_train, target_test = train_test_split(
    data, target, test_size=0.25, random_state=123
)

import time

# Time construction and fitting together.
start_time = time.time()

clf = svm.SVC(probability=True).fit(data_train, target_train['>50K'])

total = time.time() - start_time

# Elapsed time in minutes (bare expression, shown as cell output in a notebook).
total/60

# Class probabilities for the held-out rows, then the score on them.
clf.predict_proba(data_test)

clf.score(data_test, target_test['>50K'])