In [51]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

In [52]:
training = spark.read.option("inferSchema","true").csv("Planet_Training.csv",header=True)
testing = spark.read.option("inferSchema","true").csv("Planet_Testing.csv",header=True)

In [54]:
training = training.select("Temperature","Water","Atmosphere Color","Habitable")
testing = testing.select("Temperature","Water","Atmosphere Color","Habitable")

In [55]:
training = training.na.drop()
testing = testing.na.drop()

In [57]:
from pyspark.sql.functions import when

training = training.withColumn("Water",when(training["Water"] == "Low",0)
                                                .when(training["Water"] == "Medium",1)
                                                .otherwise(2))

testing = testing.withColumn("Water",when(testing["Water"] == "Low",0)
                                                .when(testing["Water"] == "Medium",1)
                                                .otherwise(2))

training = training.withColumn("Atmosphere Color",when(training["Atmosphere Color"] == "Red",0)
                                                .when(training["Atmosphere Color"] == "Blue",1)
                                                .otherwise(2))

testing = testing.withColumn("Atmosphere Color",when(testing["Atmosphere Color"] == "Red",0)
                                                .when(testing["Atmosphere Color"] == "Blue",1)
                                                .otherwise(2))

In [58]:
from pyspark.ml.feature import VectorAssembler, StandardScaler

cols = training.columns
cols.remove("Habitable")

training = VectorAssembler(inputCols=cols, outputCol="Features").transform(training)
testing = VectorAssembler(inputCols=cols, outputCol="Features").transform(testing)

In [59]:
scaler = StandardScaler(inputCol="Features",outputCol="Scaled Features")
training = scaler.fit(training).transform(training)
testing = scaler.fit(testing).transform(testing)

In [60]:
from pyspark.ml.classification import LogisticRegression

model = LogisticRegression(featuresCol="Scaled Features",labelCol="Habitable",maxIter=10).fit(training)
prediction = model.transform(testing)

In [61]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction",labelCol="Habitable")
result = evaluator.evaluate(prediction)
print("Accuracy: {}%".format(result * 100))

Accuracy: 91.71043337232418%
