In [19]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
spark = SparkSession.builder.getOrCreate()

In [25]:
train_df = spark.read.option("inferschema", "true").csv("BigFootTraining.csv", header=True)
test_df = spark.read.option("inferschema", "true").csv("BigFootTesting.csv", header=True)

train_df = train_df.select("fur_color", "weight", "eye_color", "prehistoric")
test_df = test_df.select("fur_color", "weight", "eye_color", "prehistoric")

train_df= train_df.na.drop()
test_df= test_df.na.drop()

train_df =  train_df.withColumn("fur_color",when (train_df["fur_color"] == "orange", 0).
                                            when (train_df["fur_color"] == "red", 1).
                                            when (train_df["fur_color"] == "calico", 2).
                                            when (train_df["fur_color"] == "black", 3).
                                            when (train_df["fur_color"] == "white", 4).
                                            when (train_df["fur_color"] == "grey", 5).
                                            when (train_df["fur_color"] == "brown", 6).
                                            when (train_df["fur_color"] == "dark grey", 7))

train_df =  train_df.withColumn("eye_color",when (train_df["eye_color"] == "dark brown", 0).
                                            when (train_df["eye_color"] == "green", 1).
                                            when (train_df["eye_color"] == "blue", 2).
                                            when (train_df["eye_color"] == "brown", 3).
                                            when (train_df["eye_color"] == "black", 4)) 

train_df =  train_df.withColumn("prehistoric",when (train_df["prehistoric"] == "True", 0).
                                            when (train_df["prehistoric"] == "False", 1))

train_df = train_df.withColumn("weight", regexp_replace(train_df["weight"], " kg", "").cast("int"))

test_df =  test_df.withColumn("fur_color",when (test_df["fur_color"] == "orange", 0).
                                            when (test_df["fur_color"] == "red", 1).
                                            when (test_df["fur_color"] == "calico", 2).
                                            when (test_df["fur_color"] == "black", 3).
                                            when (test_df["fur_color"] == "white", 4).
                                            when (test_df["fur_color"] == "grey", 5).
                                            when (test_df["fur_color"] == "brown", 6).
                                            when (test_df["fur_color"] == "dark grey", 7))

test_df =  test_df.withColumn("eye_color",when (test_df["eye_color"] == "dark brown", 0).
                                            when (test_df["eye_color"] == "green", 1).
                                            when (test_df["eye_color"] == "blue", 2).
                                            when (test_df["eye_color"] == "brown", 3).
                                            when (test_df["eye_color"] == "black", 4)) 

test_df =  test_df.withColumn("prehistoric",when (test_df["prehistoric"] == "True", 0).
                                            when (test_df["prehistoric"] == "False", 1))

test_df = test_df.withColumn("weight", regexp_replace(test_df["weight"], " kg", "").cast("int"))

col = train_df.columns
col.remove("prehistoric")

train_df = VectorAssembler(inputCols=col, outputCol="features").transform(train_df)
train_df = StandardScaler(inputCol="features", outputCol="scaled_feature").fit(train_df).transform(train_df)

test_df = VectorAssembler(inputCols=col, outputCol="features").transform(test_df)
test_df = StandardScaler(inputCol="features", outputCol="scaled_feature").fit(test_df).transform(test_df)

model = LogisticRegression(featuresCol="scaled_feature", labelCol="prehistoric", maxIter=10).fit(train_df)

prediction = model.transform(test_df)
prediction.select("prehistoric", "prediction").show(10)

evaluator = BinaryClassificationEvaluator(labelCol="prehistoric")
print("Accuracy = {}%".format(evaluator.evaluate("prediction")* 100))

+-----------+----------+
|prehistoric|prediction|
+-----------+----------+
|          1|       1.0|
|          0|       0.0|
|          0|       0.0|
|          1|       1.0|
|          0|       0.0|
|          1|       1.0|
|          0|       0.0|
|          1|       1.0|
|          1|       0.0|
|          1|       0.0|
+-----------+----------+
only showing top 10 rows



AttributeError: 'str' object has no attribute '_jdf'

In [14]:
train_df.groupBy("fur_color").count().show()
train_df.groupBy("eye_color").count().show()

+---------+-----+
|fur_color|count|
+---------+-----+
|   orange|  843|
|     grey| 1063|
|   calico|  907|
|    white| 1061|
|      red|  864|
|    black| 1058|
|    brown| 1071|
|dark grey| 1101|
+---------+-----+

+----------+-----+
| eye_color|count|
+----------+-----+
|dark brown|  912|
|     green| 1648|
|     black| 1918|
|     brown| 1819|
|      blue| 1671|
+----------+-----+



In [18]:
train_df.show(5)
test_df.show(5)

+---------+------+---------+-----------+---------------+--------------------+
|fur_color|weight|eye_color|prehistoric|       features|      scaled_feature|
+---------+------+---------+-----------+---------------+--------------------+
|        5|   122|        1|          0|[5.0,122.0,1.0]|[2.21692727549965...|
|        1|   135|        2|          0|[1.0,135.0,2.0]|[0.44338545509993...|
|        4|    94|        4|          0| [4.0,94.0,4.0]|[1.77354182039972...|
|        5|    69|        3|          1| [5.0,69.0,3.0]|[2.21692727549965...|
|        6|   126|        2|          0|[6.0,126.0,2.0]|[2.66031273059959...|
+---------+------+---------+-----------+---------------+--------------------+
only showing top 5 rows

+---------+------+---------+-----------+---------------+--------------------+
|fur_color|weight|eye_color|prehistoric|       features|      scaled_feature|
+---------+------+---------+-----------+---------------+--------------------+
|        2|    69|        3|          1