# Linear SVM AND Naive Bayes (with sklearn)

### Importing the libraries

In [13]:
import pandas as pd

from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score

### Load Data Set

In [14]:
dataset = pd.read_csv("../Datasets/processed.cleveland.data.csv", names = ['age','sex','cp','trestbps','chol','fbs','restecg','thalach','exang','oldpeak','slope','ca','tha1','output'])
dataset_mean = dataset


### Data Preprocessing

In [15]:
# Filling missing values Statistics measures
print("**Before Fill Missing values Row 166, 192, 287, 302**")
print(dataset_mean.loc[287])
dataset1 = dataset_mean
df1 = pd.DataFrame(dataset1)
# print(df1)

print("------Mean of Column 11 'ca'------")
print(df1['ca'].mean())
df1.fillna(df1.mean(), inplace=True)
print("**After Fill Misisng values Row 166, 192, 287, 302**")
print(df1.loc[[166,192,287,302]])

print("------Mean of Column 12 'tha1'------")
print(df1['tha1'].mean())
df1.fillna(df1.mean(), inplace=True)
print("**After Fill Misisng values Row 87, 266**")
print(df1.loc[[87,266]])

# Extract feature columns
feature_cols = list(dataset.columns[0:13])

# Show the list of columns
print("Feature columns:\n{}".format(feature_cols))

# Seperate the data into feature data and target data (X_all and y_all, respectively)
x = dataset[feature_cols]
y = dataset['output'].values

# Show the feature information by printing the first five rows
print("\nFeature values:")
x.head()

**Before Fill Missing values Row 166, 192, 287, 302**
age          58.0
sex           1.0
cp            2.0
trestbps    125.0
chol        220.0
fbs           0.0
restecg       0.0
thalach     144.0
exang         0.0
oldpeak       0.4
slope         2.0
ca            NaN
tha1          7.0
output        0.0
Name: 287, dtype: float64
------Mean of Column 11 'ca'------
0.6722408026755853
**After Fill Misisng values Row 166, 192, 287, 302**
     age  sex  cp  trestbps  chol  fbs  restecg  thalach  exang  oldpeak  \
166   52    1   3       138   223    0        0      169      0      0.0   
192   43    1   4       132   247    1        2      143      1      0.1   
287   58    1   2       125   220    0        0      144      0      0.4   
302   38    1   3       138   175    0        0      173      0      0.0   

     slope        ca  tha1  output  
166      1  0.672241   3.0       0  
192      2  0.672241   7.0       1  
287      2  0.672241   7.0       0  
302      1  0.672241   3.0      

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,tha1
0,63,1,1,145,233,1,2,150,0,2.3,3,0.0,6.0
1,67,1,4,160,286,0,2,108,1,1.5,2,3.0,3.0
2,67,1,4,120,229,0,2,129,1,2.6,2,2.0,7.0
3,37,1,3,130,250,0,0,187,0,3.5,3,0.0,3.0
4,41,0,2,130,204,0,2,172,0,1.4,1,0.0,3.0


### Split the dataset into Training and testing data

In [16]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.30, random_state=5)
print(x_train)

# Normalization
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

scaler.fit(x_train)
x_train = scaler.transform(x_train)
print("-----After Z-Score Normalization on x_train-----")
print(x_train)

scaler.fit(x_test)
x_test = scaler.transform(x_test)
print("-----After Z-Score Normalization on x_test-----")
print(x_test)

     age  sex  cp  trestbps  chol  fbs  restecg  thalach  exang  oldpeak  \
3     37    1   3       130   250    0        0      187      0      3.5   
55    54    1   4       124   266    0        2      109      1      2.2   
225   34    0   2       118   210    0        0      192      0      0.7   
224   63    0   4       108   269    0        0      169      1      1.8   
75    65    0   3       160   360    0        2      151      0      0.8   
..   ...  ...  ..       ...   ...  ...      ...      ...    ...      ...   
8     63    1   4       130   254    0        2      147      0      1.4   
73    65    1   4       110   248    0        2      158      0      0.6   
118   63    1   4       130   330    1        2      132      1      1.8   
189   69    1   3       140   254    0        2      146      0      2.0   
206   58    1   4       128   259    0        2      130      1      3.0   

     slope   ca  tha1  
3        3  0.0   3.0  
55       2  1.0   7.0  
225      1  0.0

### Training and Prediction through SVM Classifier Model

In [17]:
print("Linear SVM")
svm_model_linear = SVC(kernel = 'linear')
svm_model_linear.fit(x_train, y_train)
y_pred = svm_model_linear.predict(x_test)

cm1 = confusion_matrix(y_test,y_pred)
print("Confusion Matrix:\n", cm1)
print("Accuracy=", accuracy_score(y_test, y_pred))

Linear SVM
Confusion Matrix:
 [[48  4  0  1  0]
 [ 4  6  0  3  0]
 [ 3  5  2  2  1]
 [ 0  3  2  4  1]
 [ 0  1  0  1  0]]
Accuracy= 0.6593406593406593


### Training and Prediction through a Naive Bayes Classifier

In [18]:
from sklearn.naive_bayes import GaussianNB
print("Naive Bayes")
gnb = GaussianNB().fit(x_train, y_train)
y_pred = gnb.predict(x_test)

cm2 = confusion_matrix(y_test,y_pred)
print("Confusion Matrix:\n", cm2)
print("Accuracy=", accuracy_score(y_test, y_pred))

Naive Bayes
Confusion Matrix:
 [[45  6  2  0  0]
 [ 5  4  1  3  0]
 [ 0  4  4  4  1]
 [ 0  0  6  2  2]
 [ 0  0  1  1  0]]
Accuracy= 0.6043956043956044
