In [0]:
import pyspark
from pyspark.sql import SparkSession

In [0]:
# Create (or reuse) the Spark session used by the rest of the notebook.
spark = (
    SparkSession.builder
    .appName('log_reg_titanic')
    .getOrCreate()
)

In [0]:
# Load the Titanic CSV: the first row is the header and column types are inferred.
df = spark.read.csv(
    '/FileStore/tables/titanic.csv',
    header=True,
    inferSchema=True,
)

In [0]:
# Preview the first rows of the loaded DataFrame.
df.show()

In [0]:
# Print the column names together with the types inferred at load time.
df.printSchema()

In [0]:
# List the available column names before choosing the ones to model on.
df.columns

In [0]:
# Keep the label ('Survived') plus the columns that feed the feature vector.
my_cols = df.select(
    'Survived',
    'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked',
)

In [0]:
# Drop every row that has a null in any of the selected columns.
my_final_data = my_cols.dropna()

In [0]:
from pyspark.ml.feature import VectorAssembler, VectorIndexer, StringIndexer, OneHotEncoder

In [0]:
# Two-step categorical encoding for the 'Sex' column.
#
# Step 1 - indexing: every distinct category is mapped to a number.
#   Conceptually:   X  Y  Z
#                   1  2  3
#
# Step 2 - one-hot encoding: the index becomes an indicator vector.
#   Conceptually, with categories X Y Z, the value Y becomes [0, 1, 0].
#
# (Illustration only: the actual index values and vector width follow the
#  StringIndexer / OneHotEncoder defaults.)
gender_indexer = StringIndexer(inputCol='Sex', outputCol='SexIndex')
gender_encoder = OneHotEncoder(inputCol='SexIndex', outputCol='SexVector')

In [0]:
# Same index-then-one-hot encoding, applied to the 'Embarked' column.
embarked_indexer = StringIndexer(
    inputCol='Embarked',
    outputCol='EmbarkedIndex',
)
embarked_encoder = OneHotEncoder(
    inputCol='EmbarkedIndex',
    outputCol='EmbarkedVector',
)

In [0]:
# Combine the numeric columns and the one-hot vectors into a single
# 'features' vector column for the classifier.
assembler = VectorAssembler(
    inputCols=[
        'Pclass',
        'SexVector',
        'EmbarkedVector',
        'Age',
        'SibSp',
        'Parch',
        'Fare',
    ],
    outputCol='features',
)

In [0]:
from pyspark.ml.classification import LogisticRegression

In [0]:
from pyspark.ml import Pipeline

In [0]:
# Logistic regression that reads the 'features' vector and predicts 'Survived'.
log_reg_titanic = LogisticRegression(
    labelCol='Survived',
    featuresCol='features',
)

In [0]:
# Chain the encoders, the assembler and the classifier in execution order.
pipeline_titanic = Pipeline(
    stages=[
        gender_indexer,
        gender_encoder,
        embarked_indexer,
        embarked_encoder,
        assembler,
        log_reg_titanic,
    ]
)

In [0]:
# 70/30 train/test split. The fixed seed makes the random split repeatable,
# so the fitted model and its evaluation metric can be reproduced on a
# rerun instead of changing every time this cell executes.
train_data, test_data = my_final_data.randomSplit([0.7, 0.3], seed=42)

In [0]:
# Fit all pipeline stages (indexers, encoders, assembler, classifier) on the
# training split only.
fit_model = pipeline_titanic.fit(train_data)

In [0]:
# Run the fitted pipeline over the held-out split to obtain predictions.
results = fit_model.transform(test_data)

In [0]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [0]:
# Evaluate with the continuous 'rawPrediction' scores emitted by the
# logistic-regression stage rather than the thresholded 0/1 'prediction'
# column. Feeding hard class labels to the evaluator collapses the ROC curve
# to a single operating point, so the resulting "AUC" does not measure how
# well the model ranks passengers.
my_evaluation = BinaryClassificationEvaluator(
    rawPredictionCol='rawPrediction',
    labelCol='Survived',
    metricName='areaUnderROC',  # the evaluator's default metric, stated explicitly
)

In [0]:
# Show the true label next to the predicted class for a quick visual check.
results.select('Survived', 'prediction').show()

In [0]:
# Area under the curve for the test-set predictions held in `results`.
AUC = my_evaluation.evaluate(results)

In [0]:
# Display the AUC value computed in the previous cell.
AUC