# SVM is being applied to the rider.csv data.

## 1. Setup the libraries

Import modules

In [28]:
# import numpy and pandas libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix
# set random seed to ensure that results are repeatable
np.random.seed(1)

## 2. Load data

Load data (it's already cleaned and preprocessed)

In [29]:
# Uncomment the following snippet of code to debug problems with finding the .csv file path
# This snippet of code will exit the program and print the current working directory.
#import os
#print(os.getcwd())

In [30]:
riding_mower=pd.read_csv("RidingMowers.csv") # let's use the same data as we did in the logistic regression example
riding_mower.head(3)

Unnamed: 0,Income,Lot_Size,Ownership
0,60.0,18.4,Owner
1,85.5,16.8,Owner
2,64.8,21.6,Owner


In [31]:
riding_mower.describe()

Unnamed: 0,Income,Lot_Size
count,24.0,24.0
mean,68.4375,18.95
std,19.793144,2.428275
min,33.0,14.0
25%,52.35,17.5
50%,64.8,19.0
75%,83.1,20.8
max,110.1,23.6


In [32]:
riding_mower.isna().sum()

Income       0
Lot_Size     0
Ownership    0
dtype: int64

In [33]:
# Encoding the categorical variable using one hot encoding

In [34]:
dummies_df = pd.get_dummies(riding_mower['Ownership'], prefix='Ownership', drop_first=True)
riding_mower = riding_mower.join(dummies_df)
riding_mower.drop('Ownership', axis=1, inplace = True)

In [35]:
riding_mower.head(6)

Unnamed: 0,Income,Lot_Size,Ownership_Owner
0,60.0,18.4,1
1,85.5,16.8,1
2,64.8,21.6,1
3,61.5,20.8,1
4,87.0,23.6,1
5,110.1,19.2,1


# Step 4  Splitting the data into training and testing 

In [36]:
X = riding_mower.loc[:,['Income','Lot_Size']]
y = riding_mower.loc[:,['Ownership_Owner']]

In [37]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=1/3)

## Step 5 Model the data

First, let's create a dataframe to load the model performance metrics into.

In [38]:
performance = pd.DataFrame({"model": [], "Accuracy": [], "Precision": [], "Recall": [], "F1": []})

### 3.1 Fit a SVM classification model using linear kernal

In [39]:
svm_lin_model = SVC(kernel="linear",probability=True)
svm_lin = svm_lin_model.fit(X_train, np.ravel(y_train))

In [40]:
model_preds = svm_lin_model.predict(X_test)
c_matrix = confusion_matrix(y_test, model_preds)
TP = c_matrix[1][1]
TN = c_matrix[0][0]
FP = c_matrix[0][1]
FN = c_matrix[1][0]
performance = pd.concat([performance, pd.DataFrame({'model':"linear svm", 
                                                    'Accuracy': [(TP+TN)/(TP+TN+FP+FN)], 
                                                    'Precision': [TP/(TP+FP)], 
                                                    'Recall': [TP/(TP+FN)], 
                                                    'F1': [2*TP/(2*TP+FP+FN)]
                                                     }, index=[0])])

In [41]:
performance

Unnamed: 0,model,Accuracy,Precision,Recall,F1
0,linear svm,1.0,1.0,1.0,1.0


In [42]:
riding_mower["predicted"]=svm_lin.predict(X)
riding_mower

Unnamed: 0,Income,Lot_Size,Ownership_Owner,predicted
0,60.0,18.4,1,0
1,85.5,16.8,1,1
2,64.8,21.6,1,1
3,61.5,20.8,1,1
4,87.0,23.6,1,1
5,110.1,19.2,1,1
6,108.0,17.6,1,1
7,82.8,22.4,1,1
8,69.0,20.0,1,1
9,93.0,20.8,1,1


In [43]:
riding_mower['pred_prob'] = svm_lin.predict_proba(X)[:,1]
riding_mower

Unnamed: 0,Income,Lot_Size,Ownership_Owner,predicted,pred_prob
0,60.0,18.4,1,0,0.477774
1,85.5,16.8,1,1,0.643367
2,64.8,21.6,1,1,0.643499
3,61.5,20.8,1,1,0.583284
4,87.0,23.6,1,1,0.842604
5,110.1,19.2,1,1,0.861159
6,108.0,17.6,1,1,0.81796
7,82.8,22.4,1,1,0.792842
8,69.0,20.0,1,1,0.619244
9,93.0,20.8,1,1,0.811532


### 3.2 Fit a SVM classification model using rbf kernal

In [44]:
svm_rbf_model = SVC(kernel="rbf", C=10, gamma='scale',probability=True)
svm_rbf = svm_rbf_model.fit(X_train, np.ravel(y_train))

In [45]:
model_preds = svm_rbf_model.predict(X_test)
c_matrix = confusion_matrix(y_test, model_preds)
TP = c_matrix[1][1]
TN = c_matrix[0][0]
FP = c_matrix[0][1]
FN = c_matrix[1][0]
performance = pd.concat([performance, pd.DataFrame({'model':"rbf svm", 
                                                    'Accuracy': [(TP+TN)/(TP+TN+FP+FN)], 
                                                    'Precision': [TP/(TP+FP)], 
                                                    'Recall': [TP/(TP+FN)], 
                                                    'F1': [2*TP/(2*TP+FP+FN)]
                                                     }, index=[0])])

In [46]:
riding_mower["predicted"]=svm_rbf.predict(X)
riding_mower

Unnamed: 0,Income,Lot_Size,Ownership_Owner,predicted,pred_prob
0,60.0,18.4,1,0,0.477774
1,85.5,16.8,1,1,0.643367
2,64.8,21.6,1,1,0.643499
3,61.5,20.8,1,1,0.583284
4,87.0,23.6,1,1,0.842604
5,110.1,19.2,1,1,0.861159
6,108.0,17.6,1,1,0.81796
7,82.8,22.4,1,1,0.792842
8,69.0,20.0,1,1,0.619244
9,93.0,20.8,1,1,0.811532


In [47]:
riding_mower['pred_prob'] = svm_rbf.predict_proba(X)[:,1]
riding_mower

Unnamed: 0,Income,Lot_Size,Ownership_Owner,predicted,pred_prob
0,60.0,18.4,1,0,0.382182
1,85.5,16.8,1,1,0.738629
2,64.8,21.6,1,1,0.567431
3,61.5,20.8,1,1,0.476066
4,87.0,23.6,1,1,0.83206
5,110.1,19.2,1,1,0.759051
6,108.0,17.6,1,1,0.75904
7,82.8,22.4,1,1,0.804546
8,69.0,20.0,1,1,0.60839
9,93.0,20.8,1,1,0.813595


### 3.3 Fit a SVM classification model using polynomial kernal

In [48]:
svm_poly_model = SVC(kernel="poly", degree=3, coef0=1, C=10,probability=True)
svm_poly = svm_poly_model.fit(X_train, np.ravel(y_train))

In [49]:
model_preds = svm_poly_model.predict(X_test)
c_matrix = confusion_matrix(y_test, model_preds)
TP = c_matrix[1][1]
TN = c_matrix[0][0]
FP = c_matrix[0][1]
FN = c_matrix[1][0]
performance = pd.concat([performance, pd.DataFrame({'model':"poly svm", 
                                                    'Accuracy': [(TP+TN)/(TP+TN+FP+FN)], 
                                                    'Precision': [TP/(TP+FP)], 
                                                    'Recall': [TP/(TP+FN)], 
                                                    'F1': [2*TP/(2*TP+FP+FN)]
                                                     }, index=[0])])

In [50]:
riding_mower["predicted"]=svm_poly.predict(X)
riding_mower

Unnamed: 0,Income,Lot_Size,Ownership_Owner,predicted,pred_prob
0,60.0,18.4,1,0,0.382182
1,85.5,16.8,1,1,0.738629
2,64.8,21.6,1,1,0.567431
3,61.5,20.8,1,1,0.476066
4,87.0,23.6,1,1,0.83206
5,110.1,19.2,1,1,0.759051
6,108.0,17.6,1,1,0.75904
7,82.8,22.4,1,1,0.804546
8,69.0,20.0,1,1,0.60839
9,93.0,20.8,1,1,0.813595


In [51]:
riding_mower['pred_prob'] = svm_poly.predict_proba(X)[:,1]
riding_mower

Unnamed: 0,Income,Lot_Size,Ownership_Owner,predicted,pred_prob
0,60.0,18.4,1,0,0.558405
1,85.5,16.8,1,1,0.56233
2,64.8,21.6,1,1,0.565553
3,61.5,20.8,1,1,0.562769
4,87.0,23.6,1,1,0.578904
5,110.1,19.2,1,1,0.570517
6,108.0,17.6,1,1,0.565548
7,82.8,22.4,1,1,0.574482
8,69.0,20.0,1,1,0.56444
9,93.0,20.8,1,1,0.573509


In [52]:
## 4.0 Summary

performance

Unnamed: 0,model,Accuracy,Precision,Recall,F1
0,linear svm,1.0,1.0,1.0,1.0
0,rbf svm,0.75,0.666667,0.666667,0.666667
0,poly svm,0.875,1.0,0.666667,0.8


In [53]:
performance.sort_values(by=['Accuracy'])
performance.sort_values(by=['Precision'])
performance.sort_values(by=['Recall'])
performance.sort_values(by=['F1'])

Unnamed: 0,model,Accuracy,Precision,Recall,F1
0,rbf svm,0.75,0.666667,0.666667,0.666667
0,poly svm,0.875,1.0,0.666667,0.8
0,linear svm,1.0,1.0,1.0,1.0


## Analysis

> The metric to optimize should be determined by the business challenge, as well as the costs associated with false positives and false negatives.From the above results, we can observe that the linear SVM model is overfitting,but when we look at the other two models rbf svm and poly svm and compare their accuracy,recall,F1 and precision values, Poly svm has greater values, hence I believe Polynomial is the better model.

In [54]:
import pickle

# save model
pickle.dump(svm_poly, open(r'C:/Users/hiran/DSP 6251/best_svm_model.pkl', "wb"))

# If you wish to load this model later, simply use pickle.load method
#loaded_model = pickle.load(open('logistic_model_example01.pkl', "rb"))