# SVM for an Explainable Migraine Classification

In [1]:
import pandas as pd
import numpy as np
from sklearn import svm
from sklearn.model_selection import train_test_split

In [2]:
# Path to the Excel workbook that is loaded with pd.read_excel in the next cell.
DATASET_PATH = "./migraine.xls"

In [3]:
# Load the table; the first sheet row is skipped here and read separately
# below, where it is used as the source of the feature-group names.
data = pd.read_excel(DATASET_PATH, skiprows=1)

# Column-name cleanup: keep only the text before the first newline, lower-cased.
d_columns_temp = data.columns
d_columns = [raw_name.split("\n")[0].lower() for raw_name in d_columns_temp]
data.columns = d_columns

# Feature groups: header cells of the un-skipped first row, ignoring the
# placeholder labels that contain "NA:" or "Unnamed:".
groups_temp = list(pd.read_excel(DATASET_PATH).columns.values)
groups = [
    label
    for label in groups_temp
    if "NA:" not in label and "Unnamed:" not in label
]

groups

['Demografiche',
 'Caratteristiche cliniche',
 'Terapia',
 'Sintomatologia',
 'Caratteristiche molecolari',
 'Biochimica clinica',
 'Trigger']

In [4]:
# Show the first rows of the loaded table (column names are already cleaned).
data.head()

Unnamed: 0,cod.,età (all'arruolamento),genere,menopausa,menarca (età),emicrania,diagnosi,cefalea,lato,onset (età),...,oc/hrt si=1/no=0,familiarità per patologie cerebrovascolari,familiarità per cardiopatia,ipertensione arteriosa,aritmie si=1/no=0,patologie cerebrovascolari,ima si=1/no=0,dislipidemia si=1/no=0,diabete si=1/no=0,risposta triptani ricodificata
0,BB12592,42.0,1,0.0,,2,ESA,0,1.0,20.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,SOD001,42.0,1,0.0,15.5,3,EC,1,2.0,30.0,...,,,,,,,,,,1.0
2,SOD002,38.0,1,0.0,12.5,3,EC+MOH,1,1.0,15.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,SOD003,44.0,0,,,3,EC+MOH,1,1.0,16.0,...,,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,SOD004,72.0,0,,,3,EC,1,1.0,41.0,...,,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [None]:
# Summary statistics of the table, computed before any NaN filling is applied.
data.describe()

In [None]:
#data.info()

## Fill NaN data

In [None]:
import random

def fill_binary_columns(list_of_indexes):
    """Replace NaN with -1 in the given columns of the global ``data``.

    Args:
        list_of_indexes: Positional column indexes into ``data.columns``.
    """
    for position in list_of_indexes:
        column_name = data.columns[position]
        data[column_name] = data[column_name].fillna(-1)
        
def fill_categorical_columns(list_of_indexes):
    """Fill NaN in the given columns of the global ``data`` with an observed value.

    For each column one value is drawn at random from the column's distinct
    non-missing values and used to replace every NaN in that column.

    Args:
        list_of_indexes: Positional column indexes into ``data.columns``.
    """
    for index in list_of_indexes:
        column_name = data.columns[index]
        # Missing values are excluded from the candidates: drawing NaN as the
        # filler would leave the column unchanged.
        candidates = list(data[column_name].dropna().unique())
        if not candidates:
            # Column has no observed value to sample from; leave it as is.
            continue
        filler = random.choice(candidates)
        data[column_name] = data[column_name].fillna(filler)
        
def fill_range_columns(list_of_indexes):
    """Replace NaN with -1 in the given columns of the global ``data``.

    Args:
        list_of_indexes: Positional column indexes into ``data.columns``.
    """
    # Lazy lookup so columns are resolved one at a time, as each is processed.
    for name in (data.columns[i] for i in list_of_indexes):
        data[name] = data[name].fillna(-1)

def get_unique_range_value(index):
    """Return the distinct values of one column of the global ``data`` as a list.

    Args:
        index: Positional column index into ``data.columns``.
    """
    column = data[data.columns[index]]
    distinct_values = column.unique()
    return list(distinct_values)
    
def categorical_to_integer(row, c):
    """Encode a cell value as its position among the column's distinct values.

    Args:
        row: The cell value to encode.
        c: Positional index of the column the value belongs to.

    Returns:
        Index of ``row`` in ``get_unique_range_value(c)``.

    Raises:
        ValueError: If ``row`` is not among the column's distinct values.
    """
    distinct_values = get_unique_range_value(c)
    position = distinct_values.index(row)
    return position

In [None]:
# Fill NaN values column by column, then integer-encode.
columns = data.columns

# Positional column indexes, grouped by the filling strategy applied to them.
binary_columns = [2, 3, 7, 8, 10, 14, 15, 16, 13, 17, 22, 23, 24, 25,
                  73, 74, 75, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86]
categorical_columns = [20, 21, *range(26, 71)]
range_columns = [12, *categorical_columns]

fill_binary_columns(binary_columns)
fill_categorical_columns(categorical_columns)
fill_range_columns(range_columns)

# Every remaining non-object column gets its NaN replaced by the column mean.
already_filled = set(binary_columns) | set(categorical_columns) | set(range_columns)
for c, column_name in enumerate(columns):
    if c in already_filled or data[column_name].dtype == object:
        continue
    data[column_name] = data[column_name].fillna(np.mean(data[column_name]))

# Integer-encode columns by the position of each value among the column's
# distinct values.
# NOTE(review): this loop walks positions 0 .. len(range_columns) - 1, i.e. the
# first len(range_columns) columns of the table, not the indexes stored in
# `range_columns`. Later cells may rely on that (string columns end up
# encoded), so confirm the intent before changing it.
for c in range(len(range_columns)):
    column_name = columns[c]
    data[column_name] = data[column_name].apply(categorical_to_integer, args=(c,))

In [None]:
# Inspect the raw values of the row at position 500 after filling/encoding
# (raises IndexError if the table has 500 rows or fewer).
data.values[500]

# Features Groups

In [None]:
# Positional column indexes of each feature group, in the same order as `groups`.
groups_indexes = [range(1, 4), range(5, 15), range(16, 20), range(22, 25), [27], range(46, 71), range(72, 77)]
groups_dictionary = {}

for gi, index_span in enumerate(groups_indexes):
    group_name = groups[gi]
    members = list(index_span)
    # Column 12 is never part of a feature group.
    if 12 in members:
        members.remove(12)
    groups_dictionary[group_name] = members
    


In [None]:
from sklearn.utils import shuffle

class Dataset:
    """Group-wise view of a feature table, used as input for the group kernel.

    Every row of ``train`` becomes one record made of a code (the row's first
    value), one feature vector per group and the target value.

    Args:
        train: DataFrame to convert; rows are read through ``train.values``.
        target_column: Positional index of the target column in ``train``.
        groups_dictionary: Mapping ``group name -> list of positional column
            indexes`` that form the group's vector.
    """

    def __init__(self, train, target_column, groups_dictionary):
        self.data = train
        self.target_column = target_column
        self.groups = groups_dictionary
        self.dataset = self.create_dataset()
        # Shuffled without a fixed random_state: row order differs between runs.
        self.dataset = shuffle(self.dataset)

    def create_dataset(self):
        """Build the grouped DataFrame.

        Returns:
            DataFrame with the columns ``code``, one column per group (named
            after the keys of ``self.groups``, each cell an object array) and
            ``target``. Also sets ``self.dataset_target`` to the position of
            the ``target`` column.
        """
        # Column names come from this instance's own groups, so any number of
        # groups is supported and no module-level name is needed.
        group_names = list(self.groups)
        codes = []
        targets = []
        vectors = {name: [] for name in group_names}

        for row in self.data.values:
            codes.append(row[0])
            for name in group_names:
                vectors[name].append(np.array(row[self.groups[name]], dtype=object))
            targets.append(row[self.target_column])

        table = {"code": codes}
        table.update(vectors)
        table["target"] = targets
        self.dataset_target = len(table) - 1
        return pd.DataFrame(table)

    def split_dataset(self, test_size=0.2, random_state=42):
        """Split the grouped dataset into training and test parts.

        Args:
            test_size: Share of rows assigned to the test part.
            random_state: Seed forwarded to ``train_test_split``.

        Returns:
            Two tuples, ``(x_train, y_train)`` and ``(x_test, y_test)``.
        """
        features = self.dataset.drop(["target"], axis=1)
        labels = self.dataset["target"]
        x_train, x_test, y_train, y_test = train_test_split(
            features, labels, test_size=test_size, random_state=random_state
        )
        return (x_train, y_train), (x_test, y_test)

    def get_vectors(self, data):
        """Return the rows of ``data`` without their first column as an array.

        Args:
            data: DataFrame whose first column is skipped (the code column in
                the frames produced by ``split_dataset``).
        """
        return np.array([row[1:] for row in data.values])
    
    
    
    

In [None]:
# Positional index of the target column in `data`.
TARGET_COLUMN = 12

# Build the grouped dataset, then hold out 30% of the rows for testing.
# `train` and `test` are (features, labels) tuples.
dataset = Dataset(data, TARGET_COLUMN, groups_dictionary)
train, test = dataset.split_dataset(test_size=0.3)


## Target Distribution

Unbalanced dataset (**target 2**)

In [None]:
import seaborn as sns

# Pass the Series as `x` explicitly: the meaning of the first positional
# argument of countplot differs between seaborn releases.
sns.countplot(x=dataset.dataset["target"])

## Defining the SVM model and the kernel function
We can now define the SVM model with the scikit-learn library. We can also write a custom kernel function and pass it as an argument to the SVM:

```python
>>> from sklearn import svm
>>> def my_kernel(X, Y):
...     return np.dot(X, Y.T)
...
>>> clf = svm.SVC(kernel=my_kernel)
```

## Kernel 

kernel function --> (a, b) --> $\frac{a \cdot b}{\sqrt{(a \cdot a)\,(b \cdot b)}}$ --> result
- a and b are the vectors of the same feature group for two examples


result --> ArrayMath.cosine(vector, weights) --> result
- vector: the per-group results computed by the kernel function above
- weights: one weight per feature group


In [None]:
# Display the grouped (and shuffled) DataFrame built by Dataset.
dataset.dataset

In [None]:
from sklearn import svm

def K(X,Y):
    """Kernel value for a pair of examples.

    Args:
        X: Per-group vectors of the first example.
        Y: Per-group vectors of the second example, in the same group order.

    Returns:
        ``np.cos`` of the dot product between the per-group
        ``compute_kernel`` results and the module-level ``weights``.
    """
    # One similarity value per feature group.
    per_group = [compute_kernel(group_x, group_y) for group_x, group_y in zip(X, Y)]
    weighted_sum = np.dot(per_group, weights)
    # NOTE(review): this is the trigonometric cosine of the weighted sum; the
    # notebook text describes the step as "cosine(vector, weights)" - confirm
    # which of the two is intended.
    return np.cos(weighted_sum)


def compute_kernel(X,Y):
    """Cosine similarity between two group vectors.

    Args:
        X: First vector.
        Y: Second vector, same length as ``X``.

    Returns:
        ``dot(X, Y) / sqrt(dot(X, X) * dot(Y, Y))``, or ``0.0`` when either
        vector has zero norm or the vectors cannot be multiplied (for example
        non-numeric entries or mismatched lengths).
    """
    try:
        norm_x = np.dot(X, X)
        norm_y = np.dot(Y, Y)
        if norm_x == 0 or norm_y == 0:
            return 0.0
        return np.divide(np.dot(X, Y), np.sqrt(norm_x * norm_y))
    except (TypeError, ValueError, AttributeError):
        # Best effort for vectors numpy cannot multiply; anything else
        # (KeyboardInterrupt included) is allowed to propagate.
        return 0.0

def kernel_function(X, Y):
    """Gram matrix between two sets of examples, as expected by ``svm.SVC``.

    Args:
        X: Array of examples (first axis indexes the examples).
        Y: Array of examples (first axis indexes the examples).

    Returns:
        Array of shape ``(X.shape[0], Y.shape[0])`` whose entry ``[i, j]`` is
        ``K(X[i], Y[j])``.
    """
    gram_matrix = np.zeros((X.shape[0], Y.shape[0]))
    for row_index, left in enumerate(X):
        for col_index, right in enumerate(Y):
            gram_matrix[row_index, col_index] = K(left, right)
    return gram_matrix


class GroupKernel:
    """Holder for the per-group weights of the kernel.

    Args:
        weights: Sequence of weights, stored unchanged on the instance.
    """

    def __init__(self, weights):
        self.weights = weights

## SVM Testing

In [None]:
# Baseline run: every group gets the same weight. K() reads this module-level `weights`.
weights = [1, 1, 1, 1, 1, 1, 1]
group_kernel = GroupKernel(weights)

# Training inputs (per-group vectors without the code column) and labels.
x = dataset.get_vectors(train[0])
y = train[1]
# Gram matrix of the training set, kept only for inspection in the next cell;
# SVC calls kernel_function itself during fit/predict.
gram_matrix = kernel_function(x, x)
clf = svm.SVC(kernel = kernel_function) 


# Disabled debugging call.
#print(kernel_function(train[0].values[0][1:], train[0].values[1][1:]))
clf.fit(x, y.values)

In [None]:
# Display the training Gram matrix computed in the previous cell.
gram_matrix

In [None]:
# Predict the labels of the held-out test rows with the fitted classifier.
clf.predict(dataset.get_vectors(test[0]))

## Experiments

In [None]:
def create_prediction_csv(data, preds, g_truth, file_id=0):
    """Write predictions next to the ground truth as a CSV file.

    The file is written to ``./experiments/predictions-<file_id>.csv``; the
    ``./experiments`` directory is created when it does not exist yet.

    Args:
        data: Record codes, stored in the ``cod`` column.
        preds: Predicted labels, stored in the ``preds`` column.
        g_truth: True labels, stored in the ``real_value`` column.
        file_id: Suffix used in the output file name.

    Returns:
        ``True`` once the file has been written.
    """
    import os  # local import: only needed to create the output directory

    os.makedirs("./experiments", exist_ok=True)
    output = pd.DataFrame({"cod": data, "preds": preds, "real_value": g_truth})
    output.to_csv(f"./experiments/predictions-{file_id}.csv")
    return True

In [None]:
import random
from sklearn.metrics import precision_score, recall_score, f1_score
import time

def random_search(max_iteration=20, iteration_id=0):
    """Random search over the group weights used by the kernel.

    Each iteration draws one weight per feature group uniformly from [0, 1],
    stores them in the module-level ``weights`` (read by ``K``), fits an SVC
    with ``kernel_function`` on the training split and scores it on the test
    split with macro precision, recall and F1. Whenever F1 improves, the
    scores and weights are written to
    ``./experiments/performances-<iteration_id>.txt`` and the predictions are
    exported through ``create_prediction_csv``. The CPU time of every
    iteration is appended to
    ``./experiments/time_of_execution-<iteration_id>.txt``.

    The search stops after ``max_iteration - 1`` consecutive iterations
    without an F1 improvement.

    Uses the module-level ``dataset``, ``train`` and ``test``.

    Args:
        max_iteration: Patience of the search (see above).
        iteration_id: Suffix used in the names of the output files.
    """
    import os  # local import: only needed to create the output directory

    # K() reads the module-level `weights`, so each draw must rebind the global.
    global weights

    os.makedirs("./experiments", exist_ok=True)
    performance_path = f"./experiments/performances-{iteration_id}.txt"
    time_path = f"./experiments/time_of_execution-{iteration_id}.txt"

    # Loop-invariant inputs, computed once.
    x = dataset.get_vectors(train[0])
    y = train[1].values
    x_test = dataset.get_vectors(test[0])
    y_test = test[1].values
    n_weights = len(dataset.groups)

    best_iteration = 0  # consecutive iterations without improvement
    it = 0
    best_f1 = -1

    with open(performance_path, "w") as output:
        while best_iteration < max_iteration - 1:
            start_time = time.process_time()

            # total iteration
            print(f"Iteration: {it+1}")
            print("============================")

            # random weights initialization
            weights = np.array([random.uniform(0, 1) for _ in range(n_weights)])

            # svm with group kernel function
            clf = svm.SVC(kernel=kernel_function)
            clf.fit(x, y)
            preds = clf.predict(x_test)

            # scikit-learn metrics take (y_true, y_pred); the order decides
            # which value is reported as precision and which as recall.
            precision = precision_score(y_test, preds, average="macro")
            recall = recall_score(y_test, preds, average="macro")
            f1 = f1_score(y_test, preds, average="macro")

            print(f"Precision: {precision} Recall: {recall} F1: {f1}\nWeights: {weights}\n")
            # total time computation
            total_time = time.process_time() - start_time
            print(f"Total time execution: {total_time}")
            # write execution time to file
            with open(time_path, "a") as time_file:
                time_file.write(f"{total_time}\n")

            if f1 > best_f1:
                print("Best F1")
                best_iteration = 0
                best_f1 = f1
                # Trailing newline keeps successive records on separate lines.
                output.write(f"Precision: {precision} Recall: {recall} F1: {f1}\nWeights: {weights}\n")
                create_prediction_csv(data=test[0]["code"], preds=preds, g_truth=y_test, file_id=iteration_id)
            else:
                best_iteration += 1

            it += 1
            print(f"Best Results: {best_iteration+1}/{max_iteration}\n\n")



In [None]:
import datetime

# Five independent searches; output files are tagged "<today's date>_<run number>".
for i in range(5):
    run_tag = f"{datetime.datetime.now().date()}_{i}"
    random_search(max_iteration=20, iteration_id=run_tag)