# Imports and Datasets Preparations

In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, r2_score, mean_squared_error
from pprint import pprint

## Titanic Dataset for Classification

In [2]:
titanic_data = pd.read_csv('datasets/titanic/titanic_dataset.csv')
titanic_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
# split train, validation and test data: 70:15:15 division
# use random seed to have always the same split if we make different analyses

training_data_clas = titanic_data.sample(frac=0.7, random_state=1234)
data_without_train_clas = titanic_data.drop(training_data_clas.index)
validation_data_clas = data_without_train_clas.sample(frac=0.5, random_state=27)
test_data_clas = data_without_train_clas.drop(validation_data_clas.index)

In [4]:
print(f'Dataset split:\ntraining data: {(len(training_data_clas))}\nvalidation data: {(len(validation_data_clas))}\ntest data: {(len(test_data_clas))}')

Dataset split:
training data: 624
validation data: 134
test data: 133


In [5]:
# for curiosity check how many female and male survived

woman = training_data_clas.loc[training_data_clas.Sex == 'female']['Survived'] # 1: survived, 0: died
rate_woman_survived = sum(woman) / len(woman)
rate_woman_survived

0.7358490566037735

In [6]:
men = training_data_clas.loc[training_data_clas.Sex == 'male']['Survived']
rate_men_survived = sum(men) / len(men)
rate_men_survived

0.19174757281553398

### Features and Target preparation

In [7]:
# the prediction to learn
                
y = training_data_clas['Survived'] # Target of training set
y_val = validation_data_clas['Survived'] # Target of validation set
y.head()

523    1
778    0
760    0
496    1
583    0
Name: Survived, dtype: int64

In [8]:
# the features vector, where we create dummies values to be evaluated

features = ['Pclass', 'Sex', 'SibSp', 'Parch']

X = pd.get_dummies(training_data_clas[features])
X_val = pd.get_dummies(validation_data_clas[features])
X.head()

Unnamed: 0,Pclass,SibSp,Parch,Sex_female,Sex_male
523,1,0,1,1,0
778,3,0,0,0,1
760,3,0,0,0,1
496,1,1,0,1,0
583,1,0,0,0,1


## Boston Housing Dataset for Regression

In [9]:
boston_housing = pd.read_csv('datasets/boston_housing/BostonHousing.csv')
boston_housing.head()

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,b,lstat,medv
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33,36.2


In [10]:
boston_housing.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 14 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   crim     506 non-null    float64
 1   zn       506 non-null    float64
 2   indus    506 non-null    float64
 3   chas     506 non-null    int64  
 4   nox      506 non-null    float64
 5   rm       506 non-null    float64
 6   age      506 non-null    float64
 7   dis      506 non-null    float64
 8   rad      506 non-null    int64  
 9   tax      506 non-null    int64  
 10  ptratio  506 non-null    float64
 11  b        506 non-null    float64
 12  lstat    506 non-null    float64
 13  medv     506 non-null    float64
dtypes: float64(11), int64(3)
memory usage: 55.5 KB


In [11]:
training_data_reg = boston_housing.sample(frac=0.7, random_state=1234)
data_without_train_reg = boston_housing.drop(training_data_reg.index)
validation_data_reg = data_without_train_reg.sample(frac=0.5, random_state=27)
test_data_reg = data_without_train_reg.drop(validation_data_reg.index)

In [12]:
print(f'Dataset split:\ntraining data: {(len(training_data_reg))}\nvalidation data: {(len(validation_data_reg))}\ntest data: {(len(test_data_reg))}')

Dataset split:
training data: 354
validation data: 76
test data: 76


### Features and Target preparation

In [13]:
# target training and validation
                
yreg = training_data_reg['medv'] # Target of training set
yreg_val = validation_data_reg['medv'] # Target of validation set
yreg.head()

64     33.0
100    27.5
400     5.6
485    21.2
454    14.9
Name: medv, dtype: float64

In [14]:
# the features vector

features_reg = ['crim', 'zn', 'indus', 'chas', 'nox', 'rm', 'age', 'dis', 'rad', 'tax', 'ptratio', 'b', 'lstat']

Xreg = training_data_reg[features_reg]
Xreg_val = validation_data_reg[features_reg]
Xreg.head()

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,b,lstat
64,0.01951,17.5,1.38,0,0.4161,7.104,59.5,9.2229,3,216,18.6,393.24,8.05
100,0.14866,0.0,8.56,0,0.52,6.727,79.9,2.7778,5,384,20.9,394.76,9.42
400,25.0461,0.0,18.1,0,0.693,5.987,100.0,1.5888,24,666,20.2,396.9,26.77
485,3.67367,0.0,18.1,0,0.583,6.312,51.9,3.9917,24,666,20.2,388.62,10.58
454,9.51363,0.0,18.1,0,0.713,6.728,94.1,2.4961,24,666,20.2,6.68,18.71


# SVM (Support Vector Machine)

Support vector machines (SVMs) are a set of supervised learning methods used for classification, regression and outliers detection.
<br>
The objective of the SVM algorithm is to find a hyperplane in an N-dimensional space (where N is the number of features) that distinctly classifies the data.
<br>
For simplicity, we consider an example with 2 classes:
<center><img src="images/svm/classes.png" width="350" center=/></center>
As we can see we should split this two classes with many different hyperplanes, what do we choose?
<center><img src="images/svm/hyperplanes.png" width="350" center=/></center>
SVM works by searching for the hyperplane which maximizes the margin or the largest $\gamma$ such that:
$$
\forall{i,y_i}(wx_i + b) >= \gamma
$$
<center><img src="images/svm/optimization.png" width="350" center=/></center>
The support vectors are the data points that are closer to the hyperplane and influence the position and orientation of this one. In addition we can using only these support vectors to classify our new points instead of all the dataset. Using this techniques guaranteed a more robust classifier.

The advantages of support vector machines are:
- Effective in high dimensional spaces.
- Still effective in cases where number of dimensions is greater than the number of samples.
- Uses a subset of training points in the decision function (called support vectors), so it is also memory efficient.
- Versatile: different Kernel functions can be specified for the decision function. Common kernels are provided, but it is also possible to specify custom kernels.

The disadvantages of support vector machines include:
- If the number of features is much greater than the number of samples, avoid over-fitting in choosing Kernel functions and regularization term is crucial.
- SVMs do not directly provide probability estimates, these are calculated using an expensive five-fold cross-validation (see Scores and probabilities, below).

## Classification

In [24]:
# in our dataset we will be using SCV which is a specific type of SVM which is used with classification application

from sklearn.svm import SVC

svc_model = SVC()
pprint(svc_model.get_params())

{'C': 1.0,
 'break_ties': False,
 'cache_size': 200,
 'class_weight': None,
 'coef0': 0.0,
 'decision_function_shape': 'ovr',
 'degree': 3,
 'gamma': 'scale',
 'kernel': 'rbf',
 'max_iter': -1,
 'probability': False,
 'random_state': None,
 'shrinking': True,
 'tol': 0.001,
 'verbose': False}


### Main Parameters

the main parameters for SVC are:

- kernel: it selects the type of hyperplane used to separate the data
    - linear: a linear hyperplane
    - rfb: a non-linear hyperplane
    - poly: a non-linear hyperplane
    - sigmoid
- gamma: a parameter for non linear hyperplanes, higher the value more tries to fit the training data (risk overfitting)
- C: regularization parameter, the penalty is a squared l2 penalty (higher more regularization, risk underfitting)
- degree: parameter for poly kernel, we can choose the degree of the polynomial used for the hyperplane (using =1 is the same of use linear kernel, greater it fits more the training data)

In [25]:
svc_model.fit(X, y)
svc_preds = svc_model.predict(X_val)
#accuracy_score(svc_preds, y_val)
print(classification_report(svc_preds, y_val))

              precision    recall  f1-score   support

           0       0.86      0.79      0.82        85
           1       0.68      0.78      0.72        49

    accuracy                           0.78       134
   macro avg       0.77      0.78      0.77       134
weighted avg       0.79      0.78      0.79       134



## Regression

In [26]:
# parameter analysis is very similar to that done previously for the classification model

from sklearn.svm import SVR

svmreg_model = SVR()
pprint(svmreg_model.get_params())

{'C': 1.0,
 'cache_size': 200,
 'coef0': 0.0,
 'degree': 3,
 'epsilon': 0.1,
 'gamma': 'scale',
 'kernel': 'rbf',
 'max_iter': -1,
 'shrinking': True,
 'tol': 0.001,
 'verbose': False}


In [27]:
svmreg_model.fit(Xreg, yreg)
svmreg_preds = svmreg_model.predict(Xreg_val)
r2_score(yreg_val, svmreg_preds)

0.13530293450130937