## Import base modules

In [1]:
import numpy as np
import pandas as pd

In [2]:
df_horse = pd.read_csv('../data/horse.csv')

## Data

In [3]:
df_horse.head()

Unnamed: 0,surgery,age,hospital_number,rectal_temp,pulse,respiratory_rate,temp_of_extremities,peripheral_pulse,mucous_membrane,capillary_refill_time,...,packed_cell_volume,total_protein,abdomo_appearance,abdomo_protein,outcome,surgical_lesion,lesion_1,lesion_2,lesion_3,cp_data
0,no,adult,530101,38.5,66.0,28.0,cool,reduced,,more_3_sec,...,45.0,8.4,,,died,no,11300,0,0,no
1,yes,adult,534817,39.2,88.0,20.0,,,pale_cyanotic,less_3_sec,...,50.0,85.0,cloudy,2.0,euthanized,no,2208,0,0,no
2,no,adult,530334,38.3,40.0,24.0,normal,normal,pale_pink,less_3_sec,...,33.0,6.7,,,lived,no,0,0,0,yes
3,yes,young,5290409,39.1,164.0,84.0,cold,normal,dark_cyanotic,more_3_sec,...,48.0,7.2,serosanguious,5.3,died,yes,2208,0,0,yes
4,no,adult,530255,37.3,104.0,35.0,,,dark_cyanotic,more_3_sec,...,74.0,7.4,,,died,no,4300,0,0,no


## Count null

In [4]:
dict_col_null_count = {}

In [5]:
for column in df_horse.columns:
    dict_col_null_count[column] = len(df_horse) - df_horse[column].isnull().sum()

In [6]:
dict_col_null_count

{'surgery': 299,
 'age': 299,
 'hospital_number': 299,
 'rectal_temp': 239,
 'pulse': 275,
 'respiratory_rate': 241,
 'temp_of_extremities': 243,
 'peripheral_pulse': 230,
 'mucous_membrane': 252,
 'capillary_refill_time': 267,
 'pain': 244,
 'peristalsis': 255,
 'abdominal_distention': 243,
 'nasogastric_tube': 195,
 'nasogastric_reflux': 193,
 'nasogastric_reflux_ph': 53,
 'rectal_exam_feces': 197,
 'abdomen': 181,
 'packed_cell_volume': 270,
 'total_protein': 266,
 'abdomo_appearance': 134,
 'abdomo_protein': 101,
 'outcome': 299,
 'surgical_lesion': 299,
 'lesion_1': 299,
 'lesion_2': 299,
 'lesion_3': 299,
 'cp_data': 299}

## Drop columns

* hospital_number: Too many unique values (and the numeric order does not have any meaning)
* cp_data: Accroding to the data-description, this variable is of no significance

In [7]:
df_horse.drop(['hospital_number','cp_data'], axis = 1, inplace=True)

## lesion_1, 2, 3 (int to str)
* It should be understanded as a str

In [8]:
df_horse['lesion_1'] = df_horse['lesion_1'].astype(str)
df_horse['lesion_2'] = df_horse['lesion_2'].astype(str)
df_horse['lesion_3'] = df_horse['lesion_3'].astype(str)

## Correlation

### Make target (y) for calculating correlation

In [9]:
from sklearn import preprocessing

#### Converted y values
* Lived: 1
* euthanized: 0
* Died: -1

In [10]:
le = preprocessing.LabelEncoder()
le.fit(df_horse['outcome'])
y = le.transform(df_horse['outcome']) - 1
df_y = pd.DataFrame(y)

### To check whether column is numeric or string type

In [11]:
lst_str_columns = []
lst_num_columns = []

dict_column_median_mode = {}

for column in df_horse.columns:
    df_horse_drop_nan_row = df_horse[column].dropna()
    
    if isinstance(df_horse_drop_nan_row.values[0], str):
        lst_str_columns.append(column)
    else:
        lst_num_columns.append(column)

### Compute correlation 

In [12]:
dict_corr = {}

for column in df_horse.columns:
    if column in lst_num_columns:
        dict_corr[column] = df_horse[column].corr(df_y[0])
        
dict_corr

{'rectal_temp': 0.019362036095150942,
 'pulse': -0.36623524855643347,
 'respiratory_rate': -0.08995249757566348,
 'nasogastric_reflux_ph': -0.19609883441813056,
 'packed_cell_volume': -0.42170275805152185,
 'total_protein': 0.26320903881958424,
 'abdomo_protein': -0.22581153821768585}

In [13]:
import operator

dict_corr_sorted = sorted(dict_corr.items(), key = lambda kv: abs(kv[1]))
dict_corr_sorted

[('rectal_temp', 0.019362036095150942),
 ('respiratory_rate', -0.08995249757566348),
 ('nasogastric_reflux_ph', -0.19609883441813056),
 ('abdomo_protein', -0.22581153821768585),
 ('total_protein', 0.26320903881958424),
 ('pulse', -0.36623524855643347),
 ('packed_cell_volume', -0.42170275805152185)]

## Impute using KNN

### KNN Impute function

In [14]:
import numpy as np
import pandas as pd
from collections import defaultdict
from scipy.stats import hmean
from scipy.spatial.distance import cdist
from scipy import stats
import numbers


def weighted_hamming(data):
    """ Compute weighted hamming distance on categorical variables. For one variable, it is equal to 1 if
        the values between point A and point B are different, else it is equal the relative frequency of the
        distribution of the value across the variable. For multiple variables, the harmonic mean is computed
        up to a constant factor.
        @params:
            - data = a pandas data frame of categorical variables
        @returns:
            - distance_matrix = a distance matrix with pairwise distance for all attributes
    """
    categories_dist = []
    
    for category in data:
        X = pd.get_dummies(data[category])
        X_mean = X * X.mean()
        X_dot = X_mean.dot(X.transpose())
        X_np = np.asarray(X_dot.replace(0,1,inplace=False))
        categories_dist.append(X_np)
    categories_dist = np.array(categories_dist)
    distances = hmean(categories_dist, axis=0)
    return distances


def distance_matrix(data, numeric_distance = "euclidean", categorical_distance = "jaccard"):
    """ Compute the pairwise distance attribute by attribute in order to account for different variables type:
        - Continuous
        - Categorical
        For ordinal values, provide a numerical representation taking the order into account.
        Categorical variables are transformed into a set of binary ones.
        If both continuous and categorical distance are provided, a Gower-like distance is computed and the numeric
        variables are all normalized in the process.
        If there are missing values, the mean is computed for numerical attributes and the mode for categorical ones.
        
        Note: If weighted-hamming distance is chosen, the computation time increases a lot since it is not coded in C 
        like other distance metrics provided by scipy.
        @params:
            - data                  = pandas dataframe to compute distances on.
            - numeric_distances     = the metric to apply to continuous attributes.
                                      "euclidean" and "cityblock" available.
                                      Default = "euclidean"
            - categorical_distances = the metric to apply to binary attributes.
                                      "jaccard", "hamming", "weighted-hamming" and "euclidean"
                                      available. Default = "jaccard"
        @returns:
            - the distance matrix
    """
    possible_continuous_distances = ["euclidean", "cityblock"]
    possible_binary_distances = ["euclidean", "jaccard", "hamming", "weighted-hamming"]
    number_of_variables = data.shape[1]
    number_of_observations = data.shape[0]

    # Get the type of each attribute (Numeric or categorical)
    is_numeric = [all(isinstance(n, numbers.Number) for n in data.iloc[:, i]) for i, x in enumerate(data)]
    is_all_numeric = sum(is_numeric) == len(is_numeric)
    is_all_categorical = sum(is_numeric) == 0
    is_mixed_type = not is_all_categorical and not is_all_numeric

    # Check the content of the distances parameter
    if numeric_distance not in possible_continuous_distances:
        print ("The continuous distance " + numeric_distance + " is not supported.")
        return None
    elif categorical_distance not in possible_binary_distances:
        print ("The binary distance " + categorical_distance + " is not supported.")
        return None

    # Separate the data frame into categorical and numeric attributes and normalize numeric data
    if is_mixed_type:
        number_of_numeric_var = sum(is_numeric)
        number_of_categorical_var = number_of_variables - number_of_numeric_var
        data_numeric = data.iloc[:, is_numeric]
        data_numeric = (data_numeric - data_numeric.mean()) / (data_numeric.max() - data_numeric.min())
        data_categorical = data.iloc[:, [not x for x in is_numeric]]

    # Replace missing values with column mean for numeric values and mode for categorical ones. With the mode, it
    # triggers a warning: "SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame"
    # but the value are properly replaced
    if is_mixed_type:
        data_numeric.fillna(data_numeric.mean(), inplace=True)
        for x in data_categorical:
            data_categorical[x].fillna(data_categorical[x].mode()[0], inplace=True)
    elif is_all_numeric:
        data.fillna(data.mean(), inplace=True)
    else:
        for x in data:
            data[x].fillna(data[x].mode()[0], inplace=True)

    # "Dummifies" categorical variables in place
    if not is_all_numeric and not (categorical_distance == 'hamming' or categorical_distance == 'weighted-hamming'):
        if is_mixed_type:
            data_categorical = pd.get_dummies(data_categorical)
        else:
            data = pd.get_dummies(data)
    elif not is_all_numeric and categorical_distance == 'hamming':
        if is_mixed_type:
            data_categorical = pd.DataFrame([pd.factorize(data_categorical[x])[0] for x in data_categorical]).transpose()
        else:
            data = pd.DataFrame([pd.factorize(data[x])[0] for x in data]).transpose()

    if is_all_numeric:
        result_matrix = cdist(data, data, metric=numeric_distance)
    elif is_all_categorical:
        if categorical_distance == "weighted-hamming":
            result_matrix = weighted_hamming(data)
        else:
            result_matrix = cdist(data, data, metric=categorical_distance)
    else:
        result_numeric = cdist(data_numeric, data_numeric, metric=numeric_distance)
        if categorical_distance == "weighted-hamming":
            result_categorical = weighted_hamming(data_categorical)
        else:
            result_categorical = cdist(data_categorical, data_categorical, metric=categorical_distance)
        result_matrix = np.array([[1.0*(result_numeric[i, j] * number_of_numeric_var + result_categorical[i, j] *
                               number_of_categorical_var) / number_of_variables for j in range(number_of_observations)] for i in range(number_of_observations)])

    # Fill the diagonal with NaN values
    np.fill_diagonal(result_matrix, np.nan)

    return pd.DataFrame(result_matrix)


def knn_impute(target, attributes, k_neighbors, aggregation_method="mean", numeric_distance="euclidean",
               categorical_distance="jaccard", missing_neighbors_threshold = 0.5):
    """ Replace the missing values within the target variable based on its k nearest neighbors identified with the
        attributes variables. If more than 50% of its neighbors are also missing values, the value is not modified and
        remains missing. If there is a problem in the parameters provided, returns None.
        If to many neighbors also have missing values, leave the missing value of interest unchanged.
        @params:
            - target                        = a vector of n values with missing values that you want to impute. The length has
                                              to be at least n = 3.
            - attributes                    = a data frame of attributes with n rows to match the target variable
            - k_neighbors                   = the number of neighbors to look at to impute the missing values. It has to be a
                                              value between 1 and n.
            - aggregation_method            = how to aggregate the values from the nearest neighbors (mean, median, mode)
                                              Default = "mean"
            - numeric_distances             = the metric to apply to continuous attributes.
                                              "euclidean" and "cityblock" available.
                                              Default = "euclidean"
            - categorical_distances         = the metric to apply to binary attributes.
                                              "jaccard", "hamming", "weighted-hamming" and "euclidean"
                                              available. Default = "jaccard"
            - missing_neighbors_threshold   = minimum of neighbors among the k ones that are not also missing to infer
                                              the correct value. Default = 0.5
        @returns:
            target_completed        = the vector of target values with missing value replaced. If there is a problem
                                      in the parameters, return None
    """

    # Get useful variables
    possible_aggregation_method = ["mean", "median", "mode"]
    number_observations = len(target)
    is_target_numeric = all(isinstance(n, numbers.Number) for n in target)

    # Check for possible errors
    if number_observations < 3:
        print ("Not enough observations.")
        return None
    if attributes.shape[0] != number_observations:
        print ("The number of observations in the attributes variable is not matching the target variable length.")
        return None
    if k_neighbors > number_observations or k_neighbors < 1:
        print ("The range of the number of neighbors is incorrect.")
        return None
    if aggregation_method not in possible_aggregation_method:
        print ("The aggregation method is incorrect.")
        return None
    if not is_target_numeric and aggregation_method != "mode":
        print ("The only method allowed for categorical target variable is the mode.")
        return None

    # Make sure the data are in the right format
    target = pd.DataFrame(target)
    attributes = pd.DataFrame(attributes)

    # Get the distance matrix and check whether no error was triggered when computing it
    distances = distance_matrix(attributes, numeric_distance, categorical_distance)
    if distances is None:
        return None

    # Get the closest points and compute the correct aggregation method
    for i, value in enumerate(target.iloc[:, 0]):
        if pd.isnull(value):
            order = distances.iloc[i,:].values.argsort()[:k_neighbors]
            closest_to_target = target.iloc[order, :]
            missing_neighbors = [x for x  in closest_to_target.isnull().iloc[:, 0]]
            # Compute the right aggregation method if at least more than 50% of the closest neighbors are not missing
            if sum(missing_neighbors) >= missing_neighbors_threshold * k_neighbors:
                continue
            elif aggregation_method == "mean":
                target.iloc[i] = np.ma.mean(np.ma.masked_array(closest_to_target,np.isnan(closest_to_target)))
            elif aggregation_method == "median":
                target.iloc[i] = np.ma.median(np.ma.masked_array(closest_to_target,np.isnan(closest_to_target)))
            else:
                target.iloc[i] = stats.mode(closest_to_target, nan_policy='omit')[0][0]

    return target

### Impute two columns using KNN
#### Below columns have too many NaN but show considerable correlation
* nasogastric_reflux_ph: 53 / 299 (not nan data / all data)
* abdomo_protein: 101 / 299

In [15]:
df_horse['nasogastric_reflux_ph'] = knn_impute(target=df_horse['nasogastric_reflux_ph'], attributes=df_horse.drop(['nasogastric_reflux_ph', 'abdomo_protein'], 1),
                                    aggregation_method="median", k_neighbors=5, numeric_distance='euclidean',
                                    categorical_distance='hamming', missing_neighbors_threshold=5)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)


In [16]:
df_horse['abdomo_protein'] = knn_impute(target=df_horse['abdomo_protein'], attributes=df_horse.drop(['nasogastric_reflux_ph', 'abdomo_protein'], 1),
                                    aggregation_method="median", k_neighbors=5, numeric_distance='euclidean',
                                    categorical_distance='hamming', missing_neighbors_threshold=5)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)


## Get median & mode
* String: mode
* Numeric: median

In [17]:
df_horse.columns

Index(['surgery', 'age', 'rectal_temp', 'pulse', 'respiratory_rate',
       'temp_of_extremities', 'peripheral_pulse', 'mucous_membrane',
       'capillary_refill_time', 'pain', 'peristalsis', 'abdominal_distention',
       'nasogastric_tube', 'nasogastric_reflux', 'nasogastric_reflux_ph',
       'rectal_exam_feces', 'abdomen', 'packed_cell_volume', 'total_protein',
       'abdomo_appearance', 'abdomo_protein', 'outcome', 'surgical_lesion',
       'lesion_1', 'lesion_2', 'lesion_3'],
      dtype='object')

In [18]:
dict_column_median_mode = {}

for column in df_horse.columns:
    if column in lst_str_columns:
        dict_column_median_mode[column] = df_horse[column].mode()[0]
    else:
        dict_column_median_mode[column] = df_horse[column].median()

In [19]:
dict_column_median_mode

{'surgery': 'yes',
 'age': 'adult',
 'rectal_temp': 38.2,
 'pulse': 64.0,
 'respiratory_rate': 25.0,
 'temp_of_extremities': 'cool',
 'peripheral_pulse': 'normal',
 'mucous_membrane': 'normal_pink',
 'capillary_refill_time': 'less_3_sec',
 'pain': 'mild_pain',
 'peristalsis': 'hypomotile',
 'abdominal_distention': 'none',
 'nasogastric_tube': 'slight',
 'nasogastric_reflux': 'none',
 'nasogastric_reflux_ph': 0.0,
 'rectal_exam_feces': 'absent',
 'abdomen': 'distend_large',
 'packed_cell_volume': 45.0,
 'total_protein': 7.5,
 'abdomo_appearance': 'cloudy',
 'abdomo_protein': 2.5,
 'outcome': 'lived',
 'surgical_lesion': 'yes',
 'lesion_1': '0',
 'lesion_2': '0',
 'lesion_3': '0'}

## Fill Nan with median & mode
* String: mode
* Numeric: median

In [20]:
for column in df_horse.columns:
    df_horse[column].fillna(dict_column_median_mode[column], inplace = True)

In [21]:
df_horse.head(20)

Unnamed: 0,surgery,age,rectal_temp,pulse,respiratory_rate,temp_of_extremities,peripheral_pulse,mucous_membrane,capillary_refill_time,pain,...,abdomen,packed_cell_volume,total_protein,abdomo_appearance,abdomo_protein,outcome,surgical_lesion,lesion_1,lesion_2,lesion_3
0,no,adult,38.5,66.0,28.0,cool,reduced,normal_pink,more_3_sec,extreme_pain,...,distend_large,45.0,8.4,cloudy,2.6,died,no,11300,0,0
1,yes,adult,39.2,88.0,20.0,cool,normal,pale_cyanotic,less_3_sec,mild_pain,...,other,50.0,85.0,cloudy,2.0,euthanized,no,2208,0,0
2,no,adult,38.3,40.0,24.0,normal,normal,pale_pink,less_3_sec,mild_pain,...,normal,33.0,6.7,cloudy,7.0,lived,no,0,0,0
3,yes,young,39.1,164.0,84.0,cold,normal,dark_cyanotic,more_3_sec,depressed,...,distend_large,48.0,7.2,serosanguious,5.3,died,yes,2208,0,0
4,no,adult,37.3,104.0,35.0,cool,normal,dark_cyanotic,more_3_sec,mild_pain,...,distend_large,74.0,7.4,cloudy,0.0,died,no,4300,0,0
5,no,adult,38.2,64.0,25.0,warm,normal,pale_pink,less_3_sec,depressed,...,firm,45.0,7.5,cloudy,2.3,lived,no,0,0,0
6,yes,adult,37.9,48.0,16.0,normal,normal,normal_pink,less_3_sec,mild_pain,...,distend_large,37.0,7.0,cloudy,4.4,lived,yes,3124,0,0
7,yes,adult,38.2,60.0,25.0,cool,normal,normal_pink,less_3_sec,mild_pain,...,distend_small,44.0,8.3,cloudy,5.95,died,yes,2208,0,0
8,no,adult,38.2,80.0,36.0,cool,absent,pale_pink,less_3_sec,severe_pain,...,distend_large,38.0,6.2,cloudy,3.4,euthanized,yes,3205,0,0
9,no,young,38.3,90.0,25.0,normal,normal,normal_pink,less_3_sec,extreme_pain,...,distend_large,40.0,6.2,clear,2.2,lived,no,0,0,0


## Check if everything is not nan

In [22]:
dict_col_null_count = {}

In [23]:
for column in df_horse.columns:
    dict_col_null_count[column] = len(df_horse) - df_horse[column].isnull().sum()

In [24]:
dict_col_null_count

{'surgery': 299,
 'age': 299,
 'rectal_temp': 299,
 'pulse': 299,
 'respiratory_rate': 299,
 'temp_of_extremities': 299,
 'peripheral_pulse': 299,
 'mucous_membrane': 299,
 'capillary_refill_time': 299,
 'pain': 299,
 'peristalsis': 299,
 'abdominal_distention': 299,
 'nasogastric_tube': 299,
 'nasogastric_reflux': 299,
 'nasogastric_reflux_ph': 299,
 'rectal_exam_feces': 299,
 'abdomen': 299,
 'packed_cell_volume': 299,
 'total_protein': 299,
 'abdomo_appearance': 299,
 'abdomo_protein': 299,
 'outcome': 299,
 'surgical_lesion': 299,
 'lesion_1': 299,
 'lesion_2': 299,
 'lesion_3': 299}

## String to get_dummies (One-hot!)

In [25]:
df_horse_one_hot = pd.get_dummies(df_horse.drop(columns = ['outcome']))

In [26]:
df_horse_one_hot.head()

Unnamed: 0,rectal_temp,pulse,respiratory_rate,nasogastric_reflux_ph,packed_cell_volume,total_protein,abdomo_protein,surgery_no,surgery_yes,age_adult,...,lesion_1_9000,lesion_1_9400,lesion_2_0,lesion_2_1400,lesion_2_3111,lesion_2_3112,lesion_2_6112,lesion_2_7111,lesion_3_0,lesion_3_2209
0,38.5,66.0,28.0,0.0,45.0,8.4,2.6,1,0,1,...,0,0,1,0,0,0,0,0,1,0
1,39.2,88.0,20.0,0.0,50.0,85.0,2.0,0,1,1,...,0,0,1,0,0,0,0,0,1,0
2,38.3,40.0,24.0,0.0,33.0,6.7,7.0,1,0,1,...,0,0,1,0,0,0,0,0,1,0
3,39.1,164.0,84.0,5.0,48.0,7.2,5.3,0,1,0,...,0,0,1,0,0,0,0,0,1,0
4,37.3,104.0,35.0,0.0,74.0,7.4,0.0,1,0,1,...,0,0,1,0,0,0,0,0,1,0


## Z normalization for numeric columns

In [27]:
X = df_horse_one_hot

In [28]:
X[lst_num_columns] = (X[lst_num_columns] - X[lst_num_columns].mean()) / X[lst_num_columns].std()

In [29]:
X.head()

Unnamed: 0,rectal_temp,pulse,respiratory_rate,nasogastric_reflux_ph,packed_cell_volume,total_protein,abdomo_protein,surgery_no,surgery_yes,age_adult,...,lesion_1_9000,lesion_1_9400,lesion_2_0,lesion_2_1400,lesion_2_3111,lesion_2_3112,lesion_2_6112,lesion_2_7111,lesion_3_0,lesion_3_2209
0,0.495667,-0.194445,-0.087579,-0.711584,-0.11897,-0.532463,-0.113018,1,0,1,...,0,0,1,0,0,0,0,0,1,0
1,1.562983,0.603969,-0.587553,-0.711584,0.384885,2.376074,-0.377172,0,1,1,...,0,0,1,0,0,0,0,0,1,0
2,0.19072,-1.138025,-0.337566,-0.711584,-1.328222,-0.597012,1.824108,1,0,1,...,0,0,1,0,0,0,0,0,1,0
3,1.410509,3.362127,3.412241,1.508113,0.183343,-0.578027,1.075673,0,1,0,...,0,0,1,0,0,0,0,0,1,0
4,-1.334017,1.184634,0.349898,-0.711584,2.803388,-0.570433,-1.257683,1,0,1,...,0,0,1,0,0,0,0,0,1,0


## Run ML

In [30]:
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB

In [31]:
names = ["OneR", "Nearest Neighbors", "Linear SVM",
         "Decision Tree", "Random Forest", "Neural Net (MLP)", "AdaBoost"]

In [32]:
classifiers = [
    DecisionTreeClassifier(max_depth=1),
    KNeighborsClassifier(4),
    SVC(kernel="linear", C=0.025),
    DecisionTreeClassifier(max_depth=8),
    RandomForestClassifier(max_depth=8, n_estimators=300),
    MLPClassifier(alpha=1, hidden_layer_sizes = (100, 50)),
    AdaBoostClassifier(n_estimators = 500),
    ]

## CV

In [33]:
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import precision_recall_fscore_support

In [34]:
evaluations = []

for name, clf in zip(names, classifiers):
    scores = cross_validate(clf, X, y, cv = 10)
    y_pred = cross_val_predict(clf, X, y, cv = 10)
    precision_recall_f1 = precision_recall_fscore_support(y, y_pred, average='weighted')
    evaluations.append([name, scores['test_score'].mean(), precision_recall_f1[:3]])

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


## Result

In [35]:
evaluations.sort(key = lambda evaluations: evaluations[1])
evaluations.reverse()

In [36]:
for evaluation in evaluations:
    print("-----" * 5)
    print(evaluation[0])
    print("Accuracy: ", evaluation[1])
    if len(evaluation) > 2: print("Precision, Recall, F1", evaluation[2])
    print("-----" * 5, end = "\n\n")

-------------------------
Neural Net (MLP)
Accuracy:  0.7662002754383177
Precision, Recall, F1 (0.751902584687805, 0.7625418060200669, 0.7537836679525971)
-------------------------

-------------------------
Linear SVM
Accuracy:  0.7087814502886806
Precision, Recall, F1 (0.6031378187507929, 0.7090301003344481, 0.6515823720792664)
-------------------------

-------------------------
Nearest Neighbors
Accuracy:  0.7059012659568833
Precision, Recall, F1 (0.683583540666541, 0.705685618729097, 0.6851134705737078)
-------------------------

-------------------------
Random Forest
Accuracy:  0.7056780020128184
Precision, Recall, F1 (0.7160949880725477, 0.7157190635451505, 0.6844993501179396)
-------------------------

-------------------------
AdaBoost
Accuracy:  0.7006096721224642
Precision, Recall, F1 (0.6885532137929803, 0.6956521739130435, 0.6915480655525187)
-------------------------

-------------------------
Decision Tree
Accuracy:  0.6517919381323163
Precision, Recall, F1 (0.653621681