# Machine Learning and breast cancer
                                                  Nguyen Thi Huong An - Toulouse School of Economics
In this notebook, we explore the breast cancer data and carry out a classification using PySpark.

Breast cancer data source: https://archive.ics.uci.edu/ml/datasets/breast+cancer

I will concentrate on three Machine Learning algorithms to train on the data: Logistic Regression, Gradient Boosting and Random Forest.

An accuracy measurement will be carried out for each method. 

The result is that Random Forest gives the highest accuracy among these three methods.

In [2]:
from pyspark.sql.functions import *
from pyspark.sql import functions as fn
from pyspark.sql.functions import mean, col, stddev
from pyspark.mllib.util import MLUtils
from pyspark.mllib.stat import Statistics
import pyspark.mllib.linalg
from pyspark.mllib.linalg import Matrix, Matrices
from pyspark.mllib.linalg.distributed import *
from pyspark.mllib.linalg import Vectors
from pyspark.ml.stat import Correlation
from pyspark.mllib.stat import KernelDensity
from pyspark.mllib.linalg import Matrices, Vectors
from pyspark.mllib.regression import LabeledPoint
from numpy import array
from pyspark.mllib.tree import DecisionTree
from pyspark.mllib.linalg import DenseVector
from pyspark.mllib.linalg import SparseVector
from pyspark.mllib.classification import LogisticRegressionWithLBFGS, LogisticRegressionModel


import pandas as pd
import matplotlib.pyplot as plt
import pylab
import seaborn as sns
from scipy.stats import norm
import scipy.stats as stats
import numpy as np
import statsmodels.api as sm
from string import ascii_letters
import scipy.stats as ss

from pyspark.ml import Pipeline
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.feature import PCA, VectorAssembler, StringIndexer
from pyspark.ml.feature import Normalizer

from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.feature import HashingTF, Tokenizer
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import RegressionEvaluator

from pyspark.mllib.classification import LogisticRegressionWithLBFGS, LogisticRegressionModel, LogisticRegressionWithSGD
from pyspark.mllib.regression import LabeledPoint
from pyspark.ml.classification import GBTClassifier
from pyspark.ml.feature import PCA
from pyspark.ml.linalg import Vectors
from pyspark.ml import Pipeline


from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.model_selection import KFold
from sklearn import model_selection
from sklearn.ensemble import  GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics

## Loading the breast cancer dataset

In [4]:
# Load the UCI breast-cancer file (https://archive.ics.uci.edu/ml/datasets/breast+cancer).
# The header option is off, so Spark assigns the default names _c0 ... _c9, and with
# schema inference disabled every column is read as a string.
csv_reader = spark.read.format('csv').options(header="false", inferSchema="false")
breastcancer = csv_reader.load('/FileStore/tables/breastcancer.csv')
display(breastcancer)

_c0,_c1,_c2,_c3,_c4,_c5,_c6,_c7,_c8,_c9
no-recurrence-events,30-39,premeno,30-34,0-2,no,3,left,left_low,no
no-recurrence-events,40-49,premeno,20-24,0-2,no,2,right,right_up,no
no-recurrence-events,40-49,premeno,20-24,0-2,no,2,left,left_low,no
no-recurrence-events,60-69,ge40,15-19,0-2,no,2,right,left_up,no
no-recurrence-events,40-49,premeno,0-4,0-2,no,2,right,right_low,no
no-recurrence-events,60-69,ge40,15-19,0-2,no,2,left,left_low,no
no-recurrence-events,50-59,premeno,25-29,0-2,no,2,left,left_low,no
no-recurrence-events,60-69,ge40,20-24,0-2,no,1,left,left_low,no
no-recurrence-events,40-49,premeno,50-54,0-2,no,2,left,left_low,no
no-recurrence-events,40-49,premeno,20-24,0-2,no,2,right,left_up,no


##### Columns Rename

In [6]:
# Give the positional columns (_c0 ... _c9) meaningful names; _c0 becomes the target "label".
column_names = ["label", "age", "menopause", "tumor_size", "inv_nodes",
                "node_caps", "deg_malig", "breast", "breast_quad", "irradiat"]
for position, new_name in enumerate(column_names):
    breastcancer = breastcancer.withColumnRenamed("_c%d" % position, new_name)

##### Let's see the structure of the dataset

In [8]:
# Print column names and types; schema inference was disabled at load time,
# so every column is expected to be a string.
breastcancer.printSchema()

##### Checking for abnormal values in the categorical variables:

In [10]:
# List the distinct values of every column to spot placeholders such as '?'.
categorical = ['age', 'menopause', 'tumor_size', 'inv_nodes', 'node_caps', 'deg_malig', 'breast', 'breast_quad', 'irradiat', 'label']
# Collect to pandas once: every toPandas() call runs a full Spark job, so calling it
# inside the loop repeated the same collection for each column.
breastcancer_pdf = breastcancer.toPandas()
for c in categorical:
  print(c, set(breastcancer_pdf[c]))
# Alternative that stays in Spark: breastcancer.describe([column_name]).show() per column.

In [11]:
# Scratch example (kept for reference): removing rows that contain a '?' placeholder
# data = sqlContext.createDataFrame([("Alberto", 2, '&'), ("?", 2,'?')], ["Name", "askdaosdka", "symbole"])
# data.drop("?").show()
# df_filtered=data.filter(data.Name != '?')
# df_filtered.show(5)

In [12]:
# Remove the rows whose node_caps or breast_quad holds the '?' placeholder.
# The conditions use unbound column references (fn.col) and are applied in a single
# filter, instead of filtering `brcancer` through a column object that belongs to
# the parent DataFrame `breastcancer`.
has_node_caps = fn.col('node_caps') != '?'
has_breast_quad = fn.col('breast_quad') != '?'
brcancer = breastcancer.filter(has_node_caps & has_breast_quad)

In [13]:
# Report both dimensions explicitly: a notebook cell only displays its last
# expression, so a bare brcancer.count() followed by another expression is never shown.
n_rows = brcancer.count()            # 277 observations after removing '?' rows
n_columns = len(brcancer.columns)    # 10 columns: 9 predictors + the label
print("observations:", n_rows, "| columns:", n_columns)

In [14]:
# TODO: generic removal of '?' across all columns; this attempt does not work correctly yet.
# listOfRelevantAbnormal = breastcancer.columns
# breastcancern = ' and '.join('(%s != "?")' % col_name for col_name in listOfRelevantAbnormal)
# breastcancer.filter(breastcancern)
# type(breastcancern)


In [15]:
# Verify that no '?' placeholder is left in the predictors after filtering.
categorical = ['age', 'menopause', 'tumor_size', 'inv_nodes', 'node_caps', 'deg_malig', 'breast', 'breast_quad', 'irradiat']
# Collect once instead of once per column (each toPandas() call is a Spark job).
brcancer_pdf = brcancer.toPandas()
for c in categorical:
  print(c, set(brcancer_pdf[c]))

##### I would like to see the percentage of patients with recurrence and without recurrence

In [17]:
# CLASS: share of patients with / without recurrence
import pyspark.sql.functions as fn
from pyspark.sql.window import Window

# An empty partitionBy() places every row in one window, so the windowed sum is the
# grand total of the per-class counts. The resulting column is a fraction in [0, 1].
grand_total = fn.sum('count').over(Window.partitionBy())
class_counts = brcancer.groupBy('label').count()
nb_class = class_counts.withColumn("percent_recurrence", fn.col('count') / grand_total)
nb_class.show()
# About 70% of the patients have no recurrence and about 30% have a recurrence.


##### Structure of variables

In [19]:
# AGE: distribution of patients across age brackets
all_rows = Window.partitionBy()  # empty partition spec: a single window over the whole result
age_counts = brcancer.groupBy('age').count()
nb_age = age_counts.withColumn("percent_age", fn.col('count') / fn.sum('count').over(all_rows))
nb_age.show()
# Observed: most patients are in the [50-59] bracket, followed by [40-49]; [20-29] is rare.
# The share of patients rises considerably from the age of 30 onwards.

In [20]:
# TUMOR_SIZE: distribution of patients across tumor-size brackets
size_total = fn.sum('count').over(Window.partitionBy())  # grand total over all groups
nb_tumor_size = (
    brcancer
    .groupBy('tumor_size')
    .count()
    .withColumn("percent_tumor_size", fn.col('count') / size_total)
)
nb_tumor_size.show()
# Observed: the most frequent tumor size is 30-34 (21%), then 25-29 (18.9%), then 20-24 (17.5%).
# Small (5-9) and large (45-49) tumors are rare.

In [21]:
# BREAST: left vs. right side
side_counts = brcancer.groupBy('breast').count()
side_share = fn.col('count') / fn.sum('count').over(Window.partitionBy())
nb_breast = side_counts.withColumn("percent_breast", side_share)
nb_breast.show()
# Observed: 53% of the cases are on the left side, which is not very different from the right side.

In [22]:
# BREAST_QUAD: quadrant of the breast where the tumor is located
quad_window = Window.partitionBy()  # one window spanning every quadrant group
nb_breast_quad = (
    brcancer.groupBy('breast_quad').count()
    .withColumn("percent_breast_quad", fn.col('count') / fn.sum('count').over(quad_window))
)
nb_breast_quad.show()
# Observed: 38% of the cases are in the lower-left quadrant and 34% in the upper-left one.
# The central position has the lowest share (7.3%).

In [23]:
# DEG_MALIG: degree of malignancy
malig_counts = brcancer.groupBy('deg_malig').count()
malig_total = fn.sum('count').over(Window.partitionBy())  # total number of patients
nb_deg_malig = malig_counts.withColumn("percent_deg_malig", fn.col('count') / malig_total)
nb_deg_malig.show()

In [24]:
# IRRADIAT: whether the patient received radiotherapy
irradiat_share = fn.col('count') / fn.sum('count').over(Window.partitionBy())
nb_irradiat = (
    brcancer
    .groupBy('irradiat')
    .count()
    .withColumn("percent_irradiat", irradiat_share)
)
nb_irradiat.show()
# Observed: 23.7% of the patients were irradiated and 76.3% were not.

In [25]:
# NODE_CAPS: distribution of the node_caps variable
caps_counts = brcancer.groupBy('node_caps').count()
caps_window = Window.partitionBy()  # empty spec: the sum runs over all groups
nb_node_caps = caps_counts.withColumn("percent_node_caps", fn.col('count') / fn.sum('count').over(caps_window))
nb_node_caps.show()
# NOTE(review): the original remark here said "there are 8 missing values". The rows with
# node_caps == '?' are filtered out when `brcancer` is built, so that figure presumably
# refers to the raw `breastcancer` data -- confirm.

##### Preparing Data for Machine Learning

In [27]:
from pyspark.ml.feature import OneHotEncoderEstimator, StringIndexer, VectorAssembler
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline, PipelineModel

##### From the data description, we can treat this as either a regression or a classification problem.
##### One option is to study the impact of (age, breast_quad, breast, tumor_size, recurrence, deg_malig) on irradiation.
##### Alternatively, we can model the impact of (age, breast_quad, breast, tumor_size, deg_malig, irradiation) on recurrence.

In [29]:
# Headcount and percentage of each target class, exposed as a SQL view.
# count() already returns a Python int, so it is computed once and reused.
n_patients = brcancer.count()

res_target = (
    brcancer.groupBy("label")
    .agg(
        fn.count('label').alias('Headcounts'),
        (fn.count('label') / n_patients * 100).alias('Proportions'),
    )
)

res_target = res_target.withColumn("Proportions", fn.round(res_target["Proportions"], 2))
# createOrReplaceTempView replaces registerTempTable, which has been deprecated since
# Spark 2.0; it is also safe to re-run because it overwrites an existing view.
res_target.createOrReplaceTempView("res_target")

display(sqlContext.sql("SELECT * FROM res_target"))

label,Headcounts,Proportions
no-recurrence-events,196,70.76
recurrence-events,81,29.24


In [30]:
# Pie chart of the target distribution.
labels = 'no-recurrence-events', 'recurrence-events'
# Look the headcounts up by label: the row order of a groupBy result is not guaranteed,
# so positional access ([0], [1]) could attach a slice to the wrong legend entry.
# The frame is also collected once instead of once per slice.
headcounts = res_target.toPandas().set_index('label')['Headcounts']
sizes = [headcounts[name] for name in labels]
colors = ['indianred', 'tomato']
explode = (0, 0.1)

# Draw on the explicit Axes instead of rebinding `ax` to the return value of plt.pie.
fig, ax = plt.subplots()
ax.pie(sizes, explode=explode, colors=colors,
       autopct='%1.1f%%', shadow=True, startangle=140)
ax.axis('equal')  # equal aspect ratio so the pie is drawn as a circle
ax.set_title('Repartition of the recurrence-events and no-recurrence-events patients')
ax.legend(labels)

display(fig)

In [31]:
# Bring the cleaned data to the driver as a pandas DataFrame for the correlation analysis.
pandas_df = brcancer.select('*').toPandas()

categorical = [
    'age', 'menopause', 'tumor_size', 'inv_nodes',
    'node_caps', 'deg_malig', 'breast', 'breast_quad',
    'irradiat', 'label',
]
# Every column renamed earlier appears in `categorical`, so the difference is empty
# and `numerical` ends up holding only the target column.
numerical = pandas_df.columns.difference(categorical).tolist() + ['label']

numerical_df = pandas_df.loc[:, numerical]
categorical_df = pandas_df.loc[:, categorical]

We compute the correlations between the target variable and the categorical variables thanks to 'pd.factorize'.

In [33]:
def _factorize_codes(column):
    """Return the integer codes pd.factorize assigns to the values of one column."""
    codes, _uniques = pd.factorize(column)
    return codes


# pd.factorize numbers the categories in order of first appearance, so the sign and
# size of a Pearson coefficient on nominal variables depend on that arbitrary coding.
encoded_df = categorical_df.apply(_factorize_codes)
corr_cat = pd.DataFrame(encoded_df.corr(method='pearson', min_periods=1))

display(corr_cat)

age,menopause,tumor_size,inv_nodes,node_caps,deg_malig,breast,breast_quad,irradiat,label
1.0,0.5133684827972832,0.0745981662598067,-0.0600674418059776,-0.0119575920979294,0.0477937614097615,-0.0315024556259821,-0.0026619116402183,-0.1501490923274081,-0.1310788819204957
0.5133684827972832,1.0,-0.031069132625936,-0.0451812898075387,-0.0485690657785434,-0.0400203221846636,-0.0592174561921106,0.0235310309048088,-0.0934806263333815,-0.0874801507157749
0.0745981662598067,-0.031069132625936,1.0,-0.0871336306494342,-0.012161180908766,0.0546081842868766,-0.0014290245236046,-0.0916105570320319,0.0416115150471217,-0.1058757911222572
-0.0600674418059776,-0.0451812898075387,-0.0871336306494342,1.0,0.589656551135381,-0.323837626049929,0.0409922047898005,-0.0636217234451898,0.322762979978297,0.2944392316826555
-0.0119575920979294,-0.0485690657785434,-0.012161180908766,0.589656551135381,1.0,-0.3301808787343766,-0.0123450395005517,-0.0255446772819173,0.3335371681385202,0.289002465467917
0.0477937614097615,-0.0400203221846636,0.0546081842868766,-0.323837626049929,-0.3301808787343766,1.0,0.0458741099010686,0.0452507655365366,-0.2308213396338516,-0.3302528117153746
-0.0315024556259821,-0.0592174561921106,-0.0014290245236046,0.0409922047898005,-0.0123450395005517,0.0458741099010686,1.0,0.1523519029139105,0.0252301019088686,-0.0413038927733004
-0.0026619116402183,0.0235310309048088,-0.0916105570320319,-0.0636217234451898,-0.0255446772819173,0.0452507655365366,0.1523519029139105,1.0,-0.0389387472552224,-0.077246818485431
-0.1501490923274081,-0.0934806263333815,0.0416115150471217,0.322762979978297,0.3335371681385202,-0.2308213396338516,0.0252301019088686,-0.0389387472552224,1.0,0.226019645304459
-0.1310788819204957,-0.0874801507157749,-0.1058757911222572,0.2944392316826555,0.289002465467917,-0.3302528117153746,-0.0413038927733004,-0.077246818485431,0.226019645304459,1.0


If the correlation is equal to or greater than 0.5 in absolute value, the variables are strongly linked. This matrix is not really readable, so we plot the results:

In [35]:
# Heatmap of the lower triangle of the correlation matrix.
fig, ax = plt.subplots()

cmap = sns.diverging_palette(220, 10, as_cmap=True)
# Boolean mask hiding the upper triangle and the diagonal. The builtin `bool` is used
# because the `np.bool` alias was deprecated in NumPy 1.20 and removed in 1.24.
mask = np.triu(np.ones_like(corr_cat, dtype=bool))

# Pass the Axes explicitly instead of relying on the "current axes" state and
# rebinding `ax` to the heatmap's return value.
sns.heatmap(corr_cat, cmap=cmap, vmax=.3, center=0, mask=mask,
            square=True, linewidths=.5, cbar_kws={"shrink": .5}, ax=ax)
ax.set_title('Correlation matrix between the target and the categorical variables')

display(fig)

The following variables are positively correlated with the target:
  - inv_nodes
  - node_caps
  - irradiat
  
  
The following variables are negatively correlated with the target:
  - menopause
  - age
  - deg_malig
  - breast_quad
  - tumor_size

## I. ** Logistic Regression Model **

### ** 1) Encoding of Categorical variable **

Our dataframe contains only categorical features. In order to feed them to a machine learning model, we have to transform every categorical attribute into a numeric one by indexing it. This applies both to the input features and to the label column of the model.

For the input features of our model, name the categorical features and transformed them:

In [39]:
# Method 1 for transforming categorical variables: StringIndexer.
# Reference for converting a pandas DataFrame to a Spark DataFrame:
# https://dataplatform.cloud.ibm.com/exchange/public/entry/view/5ad1c820f57809ddec9a040e37b2bd55
spark_df = sqlContext.createDataFrame(categorical_df)

# Index the target: by default StringIndexer gives index 0.0 to the most frequent label.
indexer = StringIndexer(inputCol="label", outputCol="labelIndex")
label_indexer_model = indexer.fit(brcancer)
indexed = label_indexer_model.transform(brcancer)

indexed.show(5)
indexed.select('labelIndex').distinct().show()

In [40]:
# Index every categorical predictor into a numeric "<name>Index" column.
cat_col_list=['age', 'menopause', 'tumor_size', 'inv_nodes', 'node_caps',  'deg_malig', 'breast', 'breast_quad', 'irradiat']
# The loop variable is deliberately not called `col`: that name is the
# pyspark.sql.functions.col helper imported at the top of the notebook, and rebinding
# it to a string would break any col(...) call executed afterwards.
for cat_col in cat_col_list:
    indexer = StringIndexer(inputCol=cat_col, outputCol=cat_col + "Index")
    indexed = indexer.fit(indexed).transform(indexed)
indexed.show(5)

In the above lines of code, we just name those features that are categorical and transformed them into numeric variables. Remember, that we didn’t overwrite the features, instead, we created new attributes by concatenating the name of previous features and the string “Index”. So that we can input only those features that we need for the training of model and keep the real one intact.

### ** 2) Typecasting of Features **

In PySpark dataframe, we have to mention the data types of the continuous feature attribute. For all the numeric variable that are not discrete, we have to typecast them to later input them in a machine learning model. However, in this model, we do not have any numeric variable.

### ** 3) Assembling of Input Features **

In this step, we actually assemble all the features we need to input in a model. We have to provide the list of those transformed categorical attributes and make a vectored feature.

In [44]:
# AsssemblerVector or OneHotEncoder ?
# An example to understand how to assemble all features
# df = spark.createDataFrame([("a","France"), ("b", "US"), ("c", "UK")], ["age","Country"])
# indexers = [StringIndexer(inputCol = col, outputCol = col+"Index") for col in df.columns]
# pipeline = Pipeline(stages = indexers)
# in_df = pipeline.fit(df).transform(df)
# in_df.show()
# feature_listT = ['age', 'Country']
# vectorAssemblerT = VectorAssembler(inputCols=feature_listT, outputCol="features")                                 
# features_vectorizedT = vectorAssemblerT.transform(in_df)
# features_vectorizedT.show()

In [45]:
# try to delete columns of categorical
# categorical_no_label = ['age', 'menopause', 'tumor_size', 'inv_nodes', 'node_caps',  'deg_malig', 'breast', 'breast_quad', 'irradiat']
# ind_df_del = ind_df
# df = ind_df_del.drop(*categorical) # df contains on ly variableIndex
# df.show(5)

In [46]:
# VectorAssembler (pyspark.ml.feature) packs the indexed columns into one vector column.
# NOTE(review): the StringIndexer codes are used directly as numeric inputs, which imposes
# an ordering on nominal variables; one-hot encoding may suit the linear model better -- confirm.
features_list = [
    'ageIndex',
    'menopauseIndex',
    'tumor_sizeIndex',
    'inv_nodesIndex',
    'node_capsIndex',
    'deg_maligIndex',
    'breastIndex',
    'breast_quadIndex',
    'irradiatIndex',
]
vectorAssembler = VectorAssembler(outputCol="features", inputCols=features_list)

features_vectorized = vectorAssembler.transform(indexed)
features_vectorized.show(5)
# To inspect only the assembled vectors: features_vectorized.select("features").show()

### ** 4) Normalization of Input Features **

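Our input features are not all on the same scale, so the recommended approach is to normalize them first and then feed them into the model for better results. Note that the `Normalizer` used below rescales each row's feature vector to unit L1 norm (p = 1); it does not rescale each feature column.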

In [48]:
from pyspark.ml.feature import Normalizer

# Normalizer works row by row: with p=1.0 each feature vector is divided by its L1 norm.
# NOTE(review): this does not bring the individual features onto a common scale; if
# per-feature scaling is the goal, a column-wise scaler is probably intended -- confirm.
normalizer = Normalizer(p=1.0, inputCol="features", outputCol="features_norm")
l1NormData = normalizer.transform(features_vectorized)
l1NormData.show(5)

In [49]:
# Drop the per-variable index columns (labelIndex is kept); their values are already
# packed into the feature vectors.
# drop(*names) removes them in one call and avoids a `for col in ...` loop, which
# rebinds the pyspark.sql.functions.col helper to a plain string.
df_combine = l1NormData.drop(*features_list)

df_combine.show(5)

In [50]:
# Drop the original string-valued predictors.
# A single drop(*names) call replaces the `for col in ...` loop, which rebinds the
# pyspark.sql.functions.col helper to a plain string.
df_combine = df_combine.drop(*cat_col_list)

df_combine.show(5)

In [51]:
# Remove the un-normalized feature vector and the string label in one call,
# leaving labelIndex and features_norm for modelling.
unused_columns = ("features", "label")
df_combine = df_combine.drop(*unused_columns)
df_combine.show(5)

### ** 5) Distribution of Dataset **

As we prepared our input features PySpark dataframe, now it is the right time to define our training and testing dataset to train our model on sufficient training dataset and then use unseen or test dataset to evaluate the performance of our Logistic Regression model later.

In [53]:
# Split the dataset into training (80%) and testing (20%) sets.
# A fixed seed makes the split -- and therefore every accuracy reported below --
# reproducible across runs; without it each execution yields a different split.
SPLIT_SEED = 42
splits = df_combine.randomSplit([0.8, 0.2], seed=SPLIT_SEED)
df_train = splits[0]
df_test = splits[1]

### ** 6) Configuration of the Logistic Regression Model **

Before building the machine learning pipeline, we have to make some configuration of our machine learning model using PySpark MLlib to define the structure of Logistic Regression with some initial model parameter. It’s the important step before establishing a machine learning pipeline.

In [55]:
# LogisticRegression comes from pyspark.ml.classification (imported earlier).
# regParam=0.3 with elasticNetParam=0.8 is an elastic-net penalty mixing L1 and L2,
# weighted 0.8 towards L1.
lr = LogisticRegression(
    featuresCol='features_norm',
    labelCol='labelIndex',
    maxIter=10,
    regParam=0.3,
    elasticNetParam=0.8,
)

### ** 7) Definition of Machine Learning pipeline **

In the step of defining machine learning pipeline, we basically roll out all the stages we have prepared for the establishment of machine learning pipeline. For this purpose, we will take the instantiated logistic regression model and put this in our configured machine learning pipeline.

In [57]:
# Import the Spark Pipeline explicitly: the first import cell also imports
# sklearn.pipeline.Pipeline under the same name, so which class `Pipeline` refers to
# would otherwise depend on the order in which cells were executed.
from pyspark.ml import Pipeline

# Single-stage pipeline wrapping the configured logistic regression.
pipeline = Pipeline(stages=[lr])

### ** 8) Train Logistic Regression Model **

To train our model, we call the ‘fit’ function of the configured pipeline and feed it the training dataset as an argument. Remember that this training dataset is different from the testing dataset, as we have already split our dataset.

In [59]:
# Fit the logistic-regression pipeline on the training split only.
model = pipeline.fit(df_train)

### ** 9) Prediction via Logistic Regression Model **

Once our logistic regression model is trained, we can predict on the same training dataset to see how well the model performs on the data it was fitted on. To predict, we call the ‘transform’ function of the trained model, which returns the model's predictions.

In [61]:
# Predict on the training split (in-sample predictions).
prediction = model.transform(df_train)

In [62]:
# Inspect the columns of the prediction DataFrame returned by transform().
prediction.printSchema()

In [63]:
# Preview the first five rows of predictions.
prediction.show(5)

In [64]:
# Accuracy of the logistic regression on the training split.
# For reference, the class table above shows 70.76% no-recurrence cases, so always
# predicting the majority class would already reach about 0.71 on the full dataset.
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

binEval = MulticlassClassificationEvaluator(
    predictionCol="prediction",
    labelCol="labelIndex",
    metricName="accuracy",
)
binEval.evaluate(prediction)

### ** 10) Evaluation of Testing Data **

We have trained our model and obtained predictions on the training dataset. Now it is time to evaluate the model on the testing data, which tells us how well it generalizes to unseen data.

In [66]:
# Score the held-out test split with the fitted logistic-regression pipeline.
prediction = model.transform(df_test)

In [67]:
# Accuracy of the logistic regression on the test split.
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

binEval = MulticlassClassificationEvaluator(
    metricName="accuracy",
    labelCol="labelIndex",
    predictionCol="prediction",
)
binEval.evaluate(prediction)

## II. ** Gradient Boosting **

The second model we will perform is a Gradient Boosting. This is an aggregation model and we will see if it is better than the logistic regression.

In [69]:
from pyspark.ml.classification import GBTClassifier

# Gradient-boosted trees on the same label/feature columns as the logistic regression,
# limited to 10 boosting iterations.
gbt = GBTClassifier(
    featuresCol="features_norm",
    labelCol="labelIndex",
    maxIter=10,
)


In [70]:
# Re-import the Spark Pipeline so the name cannot resolve to sklearn's Pipeline,
# which the first import cell also brings in under the same name.
from pyspark.ml import Pipeline
# Single-stage pipeline wrapping the GBT classifier; rebinds `pipeline`.
pipeline = Pipeline(stages=[gbt])

In [71]:
# Fit the GBT pipeline on the training split; `model` now refers to the GBT model.
model = pipeline.fit(df_train)

In [72]:
# In-sample predictions of the GBT model on the training split.
prediction = model.transform(df_train)

In [73]:
# Inspect the columns of the GBT prediction DataFrame.
prediction.printSchema()

In [74]:
# Preview the GBT predictions (show() prints the first 20 rows by default).
prediction.show()

In [75]:
# Accuracy of the GBT model on the training split.
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

binEval = MulticlassClassificationEvaluator(
    predictionCol="prediction",
    labelCol="labelIndex",
    metricName="accuracy",
)
binEval.evaluate(prediction)

In [76]:
# Score the held-out test split with the fitted GBT pipeline and report its accuracy.
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

prediction = model.transform(df_test)

binEval = MulticlassClassificationEvaluator(
    metricName="accuracy",
    labelCol="labelIndex",
    predictionCol="prediction",
)
binEval.evaluate(prediction)

## III. ** Random Forest **

In [78]:
from pyspark.mllib.tree import RandomForest
from time import *

In [79]:
start_time = time()

# Configure the random-forest estimator (pyspark.ml API). An earlier attempt with the
# RDD-based pyspark.mllib RandomForest.trainClassifier was abandoned in favour of this one.
# Only the estimator object is created here: no data is touched until
# pipeline.fit(df_train) runs in a later cell.
from pyspark.ml.classification import RandomForestClassifier
rf = RandomForestClassifier(labelCol="labelIndex", featuresCol="features_norm")


end_time = time()
elapsed_time = end_time - start_time
# The timed span covers estimator construction only, so the message says so
# instead of claiming to report training time.
print("Time to configure model: %.3f seconds" % elapsed_time)

In [80]:
# Single-stage pipeline wrapping the random forest; rebinds `pipeline`.
# Relies on `Pipeline` being the pyspark.ml class imported in the GBT section.
pipeline = Pipeline(stages=[rf])

In [81]:
# Fit the random-forest pipeline on the training split; this is where training happens.
rf_model = pipeline.fit(df_train)

In [82]:
# In-sample predictions of the random forest on the training split.
# Note the plural name `predictions`; the other sections use `prediction`.
predictions = rf_model.transform(df_train)

In [83]:
# Inspect the columns of the random-forest prediction DataFrame.
predictions.printSchema()

In [84]:
# Preview the random-forest predictions (show() prints the first 20 rows by default).
predictions.show()

In [85]:
# Accuracy of the random forest on the training split.
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
binEval = MulticlassClassificationEvaluator().setMetricName("accuracy") .setPredictionCol("prediction").setLabelCol("labelIndex")
# Evaluate `predictions` (random forest on df_train). The previous code passed
# `prediction`, which at this point still holds the GBT predictions on the test split,
# so the figure reported here did not belong to the random forest at all.
binEval.evaluate(predictions)

In [86]:
# Score the held-out test split with the fitted random-forest pipeline
# (rebinds `prediction`, which previously held the GBT test predictions).
prediction = rf_model.transform(df_test)

Accuracy on testing data

In [88]:
# Accuracy of the random forest on the test split.
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

binEval = MulticlassClassificationEvaluator(
    predictionCol="prediction",
    labelCol="labelIndex",
    metricName="accuracy",
)
binEval.evaluate(prediction)

# Conclusion:

We ran three different models: Logistic model, Gradient Boosting and Random Forest.

We got that Random Forest gives the highest accuracy for these data.