## 1. Load the data into Spark

In [2]:
from pyspark.sql.types import StructType, StructField
from pyspark.sql.types import DoubleType, IntegerType
from pyspark.ml.feature import VectorAssembler
from pyspark.sql import SQLContext
from pyspark.ml.linalg import VectorUDT,Vectors

def toFloat(x, missing_value=5.0):
    """Convert one raw field to a float, imputing missing entries.

    Parameters
    ----------
    x : str or number
        Raw field value. The marker ``'?'`` (optionally surrounded by
        whitespace) denotes a missing entry.
    missing_value : float, optional
        Value returned for a missing entry. Defaults to 5.0.

    Returns
    -------
    float
        ``missing_value`` for a missing entry, otherwise ``float(x)``.

    Raises
    ------
    ValueError
        If ``x`` is a string that is neither the missing marker nor
        parseable by ``float``.
    """
    # Only strings can carry the '?' marker; numbers go straight to float().
    if isinstance(x, str) and x.strip() == '?':
        return float(missing_value)
    return float(x)

def doLine(l):
    """Parse one comma-separated record into a ``(features, label)`` pair.

    Fields 1 through 9 are converted with ``toFloat`` and packed into a dense
    vector; field 0 is not used. Field 10 holds the class code: ``'2'`` maps
    to label 0 and any other value maps to label 1.

    Parameters
    ----------
    l : str
        One line of the raw data file.

    Returns
    -------
    tuple
        ``(DenseVector of 9 floats, int label)``.
    """
    fields = l.split(",")
    # Resolve the label first so a too-short line fails on the class field.
    label = 0 if fields[10] == '2' else 1
    features = Vectors.dense(list(map(toFloat, fields[1:10])))
    return (features, label)
  
# Schema of the parsed records: a feature vector column and an integer
# label column, both nullable.
schema = StructType([
    StructField("features", VectorUDT(), True),
    StructField("label", IntegerType(), True),
])

# Read the raw file as an RDD of text lines, parse each line with doLine,
# and wrap the resulting (features, label) pairs in a DataFrame.
path = "/FileStore/tables/"
raw_data = sc.textFile(path+"/breast_cancer_wisconsin-2f6e5.data")
data = SQLContext(sc).createDataFrame(raw_data.map(doLine), schema)

### Explain what the provided code does.
The provided code first reads the file as an RDD of text lines. For each line, it packs the 9 feature columns into a dense vector, replacing missing values (marked `?`) with 5.0, and converts the two original class codes *2* and *4* into the labels *0* and *1*. Finally, it converts the result into a DataFrame with the columns `features` and `label`.

### What does the data look like?

In [5]:
# Print a preview of the DataFrame's rows (feature vector and label).
data.show()

### What is the schema of the *data*?
*data* has two columns: `features`, a vector holding the 9 features, and `label`, an integer. Each of its *n* rows holds one feature vector and the corresponding label.

In [7]:
# Print the column names, types and nullability of the DataFrame.
data.printSchema()

### In our dataset, how many tumors are benign? Malignant?

In [9]:
# Count the rows for each distinct label value and print the result.
data.groupBy("label").count().show()

So, 458 tumors are benign and 241 tumors are malignant.

## 2 Splitting into training and testing

In [12]:
# Split the data 90% / 10% into training and test sets. The seed is fixed so
# the split, and every metric computed from it, is reproducible across re-runs.
train, test = data.randomSplit([0.9, 0.1], seed=42)

In [13]:
# Number of rows in the training split.
train.count()

In [14]:
# Number of rows in the test split.
test.count()

We can see from the sizes that the dataset was split as expected (roughly 90% for training and 10% for testing).

## 3 Training your model

### Include the relevant packages with the following code snippet :

In [18]:
from pyspark.ml.classification import DecisionTreeClassifier

### Train a model called bc_model with your training data !

In [20]:
# Decision tree with default hyperparameters, reading the feature vector from
# the "features" column and the class from the "label" column.
dt = DecisionTreeClassifier(
    featuresCol="features",
    labelCol="label",
)

# Fit the tree on the training split.
bc_model = dt.fit(train)

## 4 Testing your model

In [22]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Score the held-out test split with the trained tree.
predictions = bc_model.transform(test)

# Area under the ROC curve, computed from the "rawPrediction" column against
# the "label" column.
evaluator = BinaryClassificationEvaluator(
    labelCol="label",
    rawPredictionCol="rawPrediction",
    metricName='areaUnderROC',
)
result = evaluator.evaluate(predictions)
print(result)

### What does the DataFrame prediction contain?

In [24]:
# Print a preview of the test rows together with the columns added by the model.
predictions.show()

The `predictions` DataFrame contains the original columns of the **test** set, plus three columns added by the trained model: **rawPrediction**, which holds the raw score (weight) computed for each class; **probability**, the per-class probability derived from rawPrediction; and **prediction**, the final predicted label.

### What is the area under ROC of our classifier?

In [27]:
# Area under the ROC curve computed by `evaluator` on `predictions`.
print(result)

### What is the accuracy of our classifier?

In [29]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Accuracy compares the "prediction" column with the "label" column on the
# test-set predictions.
evaluator1 = MulticlassClassificationEvaluator(
    predictionCol="prediction",
    labelCol="label",
    metricName="accuracy",
)
accuracy = evaluator1.evaluate(predictions)
print(accuracy)

## 5 Improving the model

### Use train-validation to determine a good set of parameters.

In [32]:
# ParamGridBuilder is imported together with TrainValidationSplit so that this
# cell runs on a fresh kernel without relying on imports from other cells.
from pyspark.ml.tuning import ParamGridBuilder, TrainValidationSplit

# Candidate hyperparameters: every combination of the listed maxBins and
# maxDepth values (4 x 4 = 16 settings).
paramGrid = ParamGridBuilder() \
    .addGrid(dt.maxBins, [10, 32, 50, 100]) \
    .addGrid(dt.maxDepth, [3, 5, 10, 20]) \
    .build()

# Train on 80% of the training split and validate each candidate on the
# remaining 20%, scoring with a default BinaryClassificationEvaluator.
tvs = TrainValidationSplit(estimator=dt,
                          estimatorParamMaps=paramGrid,
                          evaluator=BinaryClassificationEvaluator(),
                          trainRatio=0.8)

# Fit every candidate and keep the best one in model.bestModel.
model = tvs.fit(train)

In [33]:
# maxDepth of the best model found by the train-validation split.
# NOTE(review): reads through the private `_java_obj` attribute — confirm
# whether the Spark version in use exposes a public getter instead.
model.bestModel._java_obj.getMaxDepth()

In [34]:
# maxBins of the best model found by the train-validation split.
# NOTE(review): reads through the private `_java_obj` attribute — confirm
# whether the Spark version in use exposes a public getter instead.
model.bestModel._java_obj.getMaxBins()

### Use cross-validation to determine a good set of parameters.

In [36]:
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

# Build the hyperparameter grid step by step: maxBins first, then maxDepth,
# giving every combination of the listed values.
grid_builder = ParamGridBuilder()
grid_builder = grid_builder.addGrid(dt.maxBins, [10, 32, 50, 100])
grid_builder = grid_builder.addGrid(dt.maxDepth, [3, 5, 10, 20])
paramGrid = grid_builder.build()

# 5-fold cross-validation over the grid, scored with a default
# BinaryClassificationEvaluator.
crossval = CrossValidator(
    estimator=dt,
    estimatorParamMaps=paramGrid,
    evaluator=BinaryClassificationEvaluator(),
    numFolds=5,
)

# Fit all candidates on the training split and show the selected model.
cvModel = crossval.fit(train)
print(cvModel.bestModel)

In [37]:
# maxDepth of the best model found by cross-validation.
# NOTE(review): reads through the private `_java_obj` attribute — confirm
# whether the Spark version in use exposes a public getter instead.
cvModel.bestModel._java_obj.getMaxDepth()

In [38]:
# maxBins of the best model found by cross-validation.
# NOTE(review): reads through the private `_java_obj` attribute — confirm
# whether the Spark version in use exposes a public getter instead.
cvModel.bestModel._java_obj.getMaxBins()

So, among the parameter values I tested, the best parameter set is `(maxDepth=10, maxBins=10)`.

### What is the area under *ROC* of your model now?

In [41]:
# Take the hyperparameters from the model selected by cross-validation rather
# than hard-coding values copied from one particular run; the selected values
# can change whenever the train/test split changes.
best_tree = cvModel.bestModel._java_obj
dt1 = DecisionTreeClassifier(labelCol="label", featuresCol="features",
                             maxDepth=best_tree.getMaxDepth(),
                             maxBins=best_tree.getMaxBins())

# Refit on the full training split and measure area under ROC on the test split.
bc_model1 = dt1.fit(train)
evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction", labelCol="label", metricName='areaUnderROC')
prediction_c = bc_model1.transform(test)
result_c = evaluator.evaluate(prediction_c)
print(result_c)

The *area under ROC* is now 0.9153488372093023, which is larger than the 0.8851162790697674 obtained in the previous section.

## 6 OPTIONAL

### Different models for the Breast Cancer task

Logistic Regression

In [46]:
from pyspark.ml.classification import LogisticRegression

# Logistic regression limited to 10 iterations, with regularisation strength
# 0.3 and elastic-net mixing parameter 0.8.
lr = LogisticRegression(
    maxIter=10,
    regParam=0.3,
    elasticNetParam=0.8,
)
lrModel = lr.fit(train)

# Score the test split and evaluate with the `evaluator` defined earlier.
prediction_l = lrModel.transform(test)
result_l = evaluator.evaluate(prediction_l)
print(result_l)

Random Forest

In [48]:
from pyspark.ml.classification import RandomForestClassifier

# Random forest of 10 trees over the same feature and label columns.
rf = RandomForestClassifier(
    featuresCol="features",
    labelCol="label",
    numTrees=10,
)
rfModel = rf.fit(train)

# Score the test split and evaluate with the `evaluator` defined earlier.
prediction_r = rfModel.transform(test)
result_r = evaluator.evaluate(prediction_r)
print(result_r)

Gradient-boosted tree classifier

In [50]:
from pyspark.ml.classification import GBTClassifier

# Gradient-boosted trees limited to 10 boosting iterations.
gbt = GBTClassifier(
    featuresCol="features",
    labelCol="label",
    maxIter=10,
)
gbtModel = gbt.fit(train)

# Score the test split and evaluate with the `evaluator` defined earlier.
prediction_g = gbtModel.transform(test)
result_g = evaluator.evaluate(prediction_g)
print(result_g)

We can see that the results of these models are all better than those of the simple Decision Tree classifier.