In [None]:
from pyspark.sql import SparkSession
from pyspark.conf import SparkConf
from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.mllib.classification import LogisticRegressionWithLBFGS
from pyspark.mllib.evaluation import BinaryClassificationMetrics
from pyspark.mllib.util import MLUtils
from pyspark.ml.feature import VectorAssembler
from sklearn.metrics import confusion_matrix
from pyspark.sql.functions import when
import pandas as pd 

In [None]:
# Start (or reuse) a Spark session named "test" on the "local" master,
# configured from a default, otherwise-empty SparkConf.
config = SparkConf()
spark = (
    SparkSession.builder
    .master("local")
    .appName("test")
    .config(conf=config)
    .getOrCreate()
)

In [None]:
# Prepare the training data.
# Load the CSV, treating its first row as the header. The path uses a forward
# slash so it resolves on Linux/macOS as well as on Windows (a raw
# "data\train.csv" is only a directory path on Windows).
train = spark.read.option("header", "true").csv("data/train.csv")
# Filter out the rows whose Age is recorded as the literal string "NA".
train = train.filter(train.Age != "NA")
# Encode Sex as a string code: "1" for male, "2" for female.
# Any other value gets null because the when-chain has no otherwise().
train = train.withColumn("Gender", when(train.Sex == "male", "1").when(train.Sex == "female", "2"))
# Drop the columns that are not used in the rest of the notebook.
cols = ('SibSp', 'Parch', 'Fare', 'Ticket', 'Cabin', 'Embarked', 'Name', 'Sex')
train = train.drop(*cols)

train.show(10)

In [None]:
from pyspark.sql.types import IntegerType
from pyspark.sql.types import FloatType

# The CSV was read without schema inference, so every column is a string.
# Cast the columns kept so far to float so they can be used numerically.
for column_name in ("PassengerId", "Survived", "Pclass", "Age", "Gender"):
    train = train.withColumn(column_name, train[column_name].cast("float"))
train.printSchema()

# Assemble the predictors into a single vector column named "features".
# "Survived" is the target and "PassengerId" is only a row identifier, so
# neither belongs in the feature vector: including the target leaks the
# answer to the model and makes any evaluation meaningless.
features = ['Pclass', 'Age', 'Gender']
va = VectorAssembler(inputCols=features, outputCol='features')
va_df = va.transform(train)
va_df.show(3)

In [None]:
# 80/20 train/test split. The seed is fixed so that the split, and therefore
# every metric computed from it, is reproducible across kernel restarts.
(train_, test) = va_df.randomSplit([0.8, 0.2], seed=42)

In [None]:
from pyspark.ml.feature import StringIndexer

# Train the decision tree on the real target column. The previous version
# indexed "PassengerId" and used it as the label, i.e. it tried to predict the
# row identifier instead of survival.
# "Survived" has already been cast to float, so it is used as the label
# directly (assumes it only holds 0/1 -- TODO confirm against the data).
dtc = DecisionTreeClassifier(featuresCol="features", labelCol="Survived")
dtc_model = dtc.fit(train_)

# `prediction` now holds the scored hold-out rows (a DataFrame with the
# "prediction" column added by the model) rather than the fitted model itself.
prediction = dtc_model.transform(test)
prediction.select("Survived", "prediction").show(5)

In [None]:
from pyspark.mllib.regression import LabeledPoint


def _to_labeled_points(df):
    """Turn a DataFrame with "Survived" and "features" columns into an RDD of LabeledPoint.

    The RDD-based pyspark.mllib API cannot consume DataFrames or the
    pyspark.ml vector type, so the vector column is converted first and each
    row is wrapped as LabeledPoint(label=Survived, features=features).
    """
    mllib_df = MLUtils.convertVectorColumnsFromML(df.select("Survived", "features"), "features")
    return mllib_df.rdd.map(lambda row: LabeledPoint(row["Survived"], row["features"]))


# Run training algorithm to build the model.
# Train on the training split only (`train_`); the full `train` DataFrame has
# no "features" column and would also include the rows held out for testing.
model = LogisticRegressionWithLBFGS.train(_to_labeled_points(train_))

# Clear the 0.5 decision threshold so predict() returns raw scores instead of
# hard 0/1 classes; the ROC and PR curves need scores to be meaningful.
model.clearThreshold()

# Compute raw scores on the test set as (score, label) pairs
predictionAndLabels = _to_labeled_points(test).map(
    lambda lp: (float(model.predict(lp.features)), lp.label)
)

# Instantiate metrics object
metrics = BinaryClassificationMetrics(predictionAndLabels)

# Area under precision-recall curve
print("Area under PR = %s" % metrics.areaUnderPR)

# Area under ROC curve
print("Area under ROC = %s" % metrics.areaUnderROC)

# Release the Spark session now that the analysis is finished.
spark.stop()

