# Logistic Regression Code Along

In [1]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('LogReg').getOrCreate()

In [4]:
data = spark.read.format('libsvm').load('sample_libsvm_data.txt')

In [5]:
from pyspark.ml.classification import LogisticRegression

In [6]:
data.show()

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|(692,[127,128,129...|
|  1.0|(692,[158,159,160...|
|  1.0|(692,[124,125,126...|
|  1.0|(692,[152,153,154...|
|  1.0|(692,[151,152,153...|
|  0.0|(692,[129,130,131...|
|  1.0|(692,[158,159,160...|
|  1.0|(692,[99,100,101,...|
|  0.0|(692,[154,155,156...|
|  0.0|(692,[127,128,129...|
|  1.0|(692,[154,155,156...|
|  0.0|(692,[153,154,155...|
|  0.0|(692,[151,152,153...|
|  1.0|(692,[129,130,131...|
|  0.0|(692,[154,155,156...|
|  1.0|(692,[150,151,152...|
|  0.0|(692,[124,125,126...|
|  0.0|(692,[152,153,154...|
|  1.0|(692,[97,98,99,12...|
|  1.0|(692,[124,125,126...|
+-----+--------------------+
only showing top 20 rows



In [7]:
log_model = LogisticRegression()

In [8]:
fitted_logreg = log_model.fit(data)

In [9]:
log_summary = fitted_logreg.summary

In [11]:
log_summary.predictions.show()

+-----+--------------------+--------------------+--------------------+----------+
|label|            features|       rawPrediction|         probability|prediction|
+-----+--------------------+--------------------+--------------------+----------+
|  0.0|(692,[127,128,129...|[20.3777627514872...|[0.99999999858729...|       0.0|
|  1.0|(692,[158,159,160...|[-21.114014198868...|[6.76550380000472...|       1.0|
|  1.0|(692,[124,125,126...|[-23.743613234676...|[4.87842678716177...|       1.0|
|  1.0|(692,[152,153,154...|[-19.192574012720...|[4.62137287298144...|       1.0|
|  1.0|(692,[151,152,153...|[-20.125398874699...|[1.81823629113068...|       1.0|
|  0.0|(692,[129,130,131...|[20.4890549504196...|[0.99999999873608...|       0.0|
|  1.0|(692,[158,159,160...|[-21.082940212814...|[6.97903542823766...|       1.0|
|  1.0|(692,[99,100,101,...|[-19.622713503550...|[3.00582577446132...|       1.0|
|  0.0|(692,[154,155,156...|[21.1594863606582...|[0.99999999935352...|       0.0|
|  0.0|(692,[127

In [12]:
train_data, test_data = data.randomSplit([0.7, 0.3])

In [13]:
final_model = LogisticRegression()

In [14]:
fit_final = final_model.fit(train_data)

In [15]:
prediction_labels = fit_final.evaluate(test_data)

In [17]:
prediction_labels.predictions.show()

+-----+--------------------+--------------------+--------------------+----------+
|label|            features|       rawPrediction|         probability|prediction|
+-----+--------------------+--------------------+--------------------+----------+
|  0.0|(692,[95,96,97,12...|[20.4414932268445...|[0.99999999867452...|       0.0|
|  0.0|(692,[98,99,100,1...|[34.9503004580098...|[0.99999999999999...|       0.0|
|  0.0|(692,[121,122,123...|[15.8019208442183...|[0.99999986281301...|       0.0|
|  0.0|(692,[122,123,148...|[22.2952606382751...|[0.99999999979236...|       0.0|
|  0.0|(692,[124,125,126...|[40.9367690866527...|           [1.0,0.0]|       0.0|
|  0.0|(692,[124,125,126...|[25.4303377671945...|[0.99999999999096...|       0.0|
|  0.0|(692,[125,126,127...|[43.8364588270309...|           [1.0,0.0]|       0.0|
|  0.0|(692,[126,127,128...|[39.2868857626798...|           [1.0,0.0]|       0.0|
|  0.0|(692,[126,127,128...|[19.1836010014094...|[0.99999999533697...|       0.0|
|  0.0|(692,[126

In [18]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator

In [19]:
my_eval = BinaryClassificationEvaluator()

In [23]:
my_final_roc = my_eval.evaluate(prediction_labels.predictions)

In [24]:
my_final_roc

1.0

# More Realisitic Example

In [25]:
titanic_data = spark.read.csv('titanic.csv', inferSchema=True, header=True)

In [26]:
titanic_data.printSchema()

root
 |-- PassengerId: integer (nullable = true)
 |-- Survived: integer (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- SibSp: integer (nullable = true)
 |-- Parch: integer (nullable = true)
 |-- Ticket: string (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Cabin: string (nullable = true)
 |-- Embarked: string (nullable = true)



In [27]:
titanic_data.columns

['PassengerId',
 'Survived',
 'Pclass',
 'Name',
 'Sex',
 'Age',
 'SibSp',
 'Parch',
 'Ticket',
 'Fare',
 'Cabin',
 'Embarked']

In [28]:
cols = titanic_data.select(['Survived',
                            'Pclass',
                            'Sex',
                            'Age',
                            'SibSp',
                            'Parch',
                            'Fare',
                            'Embarked'
                            ])

In [29]:
my_data = cols.na.drop()

In [30]:
from pyspark.ml.feature import VectorAssembler, VectorIndexer, OneHotEncoder, StringIndexer

In [31]:
gender_indexer = StringIndexer(inputCol='Sex', outputCol='SexIndex')
gender_encoder = OneHotEncoder(inputCol='SexIndex', outputCol='SexVec')

In [32]:
embark_indexer = StringIndexer(inputCol='Embarked', outputCol='EmbarkIndex')
embark_encoder = OneHotEncoder(inputCol='EmbarkIndex', outputCol='EmbarkVec')

In [33]:
assembler = VectorAssembler(inputCols=['Pclass',
                                       'SexVec',
                                       'EmbarkVec',
                                       'Age',
                                       'SibSp',
                                       'Parch',
                                       'Fare'
                                       ],
                                       outputCol='features')


In [34]:
from pyspark.ml import Pipeline

In [35]:
log_reg_titantic = LogisticRegression(featuresCol='features', labelCol='Survived')

In [36]:
pipeline = Pipeline(stages=[gender_indexer, embark_indexer, gender_encoder, embark_encoder, assembler, log_reg_titantic])


In [37]:
titanic_train, titanic_test = my_data.randomSplit([0.7,0.3])

In [39]:
fitted_model = pipeline.fit(titanic_train)

In [40]:
results = fitted_model.transform(titanic_test)

In [41]:
titanic_eval = BinaryClassificationEvaluator(rawPredictionCol='prediction', labelCol='Survived')

In [42]:
results.select('Survived', 'prediction').show()

+--------+----------+
|Survived|prediction|
+--------+----------+
|       0|       1.0|
|       0|       1.0|
|       0|       1.0|
|       0|       0.0|
|       0|       1.0|
|       0|       1.0|
|       0|       1.0|
|       0|       1.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
+--------+----------+
only showing top 20 rows



In [43]:
AUC = titanic_eval.evaluate(results)

In [44]:
AUC

0.7681725543478262