In [None]:
!pip install pyspark==3.0.1 py4j==0.10.9

Collecting pyspark==3.0.1
  Downloading pyspark-3.0.1.tar.gz (204.2 MB)
[K     |████████████████████████████████| 204.2 MB 35 kB/s 
[?25hCollecting py4j==0.10.9
  Downloading py4j-0.10.9-py2.py3-none-any.whl (198 kB)
[K     |████████████████████████████████| 198 kB 55.5 MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.0.1-py2.py3-none-any.whl size=204612243 sha256=783f8b0e0731cb6c31753a3d7020aa08979aad8171050e2e4d4c6b5a8bb0106f
  Stored in directory: /root/.cache/pip/wheels/5e/34/fa/b37b5cef503fc5148b478b2495043ba61b079120b7ff379f9b
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9 pyspark-3.0.1


In [None]:
from pyspark.sql import SparkSession

# Start (or reuse) the Spark session for the Boston housing example.
spark = (
    SparkSession.builder
    .appName('Boston Housing Linear Regression example')
    .getOrCreate()
)

In [None]:
# Display the active SparkSession object as the cell output.
spark

### 보스턴 주택 가격 예측 모델 제작

#### data 불러오기

In [None]:
!wget https://s3-geospatial.s3-us-west-2.amazonaws.com/boston_housing.csv

--2022-03-07 14:05:23--  https://s3-geospatial.s3-us-west-2.amazonaws.com/boston_housing.csv
Resolving s3-geospatial.s3-us-west-2.amazonaws.com (s3-geospatial.s3-us-west-2.amazonaws.com)... 52.218.208.25
Connecting to s3-geospatial.s3-us-west-2.amazonaws.com (s3-geospatial.s3-us-west-2.amazonaws.com)|52.218.208.25|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 36240 (35K) [text/csv]
Saving to: ‘boston_housing.csv’


2022-03-07 14:05:23 (2.33 MB/s) - ‘boston_housing.csv’ saved [36240/36240]



In [None]:
# Long listing of the working directory, sorted by modification time.
!ls -tl

total 40
drwxr-xr-x 1 root root  4096 Feb 18 14:33 sample_data
-rw-r--r-- 1 root root 36240 Jan 31  2021 boston_housing.csv


In [None]:
# Read the CSV with Spark. header=True takes the column names from the first row;
# inferSchema=True lets Spark infer each column's type (i.e. the table structure).
data = spark.read.csv('./boston_housing.csv', header=True, inferSchema=True)

In [None]:
# Print the column names and the types Spark inferred for them.
data.printSchema()

root
 |-- crim: double (nullable = true)
 |-- zn: double (nullable = true)
 |-- indus: double (nullable = true)
 |-- chas: integer (nullable = true)
 |-- nox: double (nullable = true)
 |-- rm: double (nullable = true)
 |-- age: double (nullable = true)
 |-- dis: double (nullable = true)
 |-- rad: integer (nullable = true)
 |-- tax: integer (nullable = true)
 |-- ptratio: double (nullable = true)
 |-- b: double (nullable = true)
 |-- lstat: double (nullable = true)
 |-- medv: double (nullable = true)



In [None]:
# Preview the first rows of the DataFrame.
data.show()

+-------+----+-----+----+-----+-----+-----+------+---+---+-------+------+-----+----+
|   crim|  zn|indus|chas|  nox|   rm|  age|   dis|rad|tax|ptratio|     b|lstat|medv|
+-------+----+-----+----+-----+-----+-----+------+---+---+-------+------+-----+----+
|0.00632|18.0| 2.31|   0|0.538|6.575| 65.2|  4.09|  1|296|   15.3| 396.9| 4.98|24.0|
|0.02731| 0.0| 7.07|   0|0.469|6.421| 78.9|4.9671|  2|242|   17.8| 396.9| 9.14|21.6|
|0.02729| 0.0| 7.07|   0|0.469|7.185| 61.1|4.9671|  2|242|   17.8|392.83| 4.03|34.7|
|0.03237| 0.0| 2.18|   0|0.458|6.998| 45.8|6.0622|  3|222|   18.7|394.63| 2.94|33.4|
|0.06905| 0.0| 2.18|   0|0.458|7.147| 54.2|6.0622|  3|222|   18.7| 396.9| 5.33|36.2|
|0.02985| 0.0| 2.18|   0|0.458| 6.43| 58.7|6.0622|  3|222|   18.7|394.12| 5.21|28.7|
|0.08829|12.5| 7.87|   0|0.524|6.012| 66.6|5.5605|  5|311|   15.2| 395.6|12.43|22.9|
|0.14455|12.5| 7.87|   0|0.524|6.172| 96.1|5.9505|  5|311|   15.2| 396.9|19.15|27.1|
|0.21124|12.5| 7.87|   0|0.524|5.631|100.0|6.0821|  5|311|   15.2

#### 피쳐 벡터 만들기

In [None]:
from pyspark.ml.feature import VectorAssembler

# Every column except the label ('medv') is a model input. Filtering by name, instead of
# slicing off the last column, stays correct even if the CSV column order changes.
feature_columns = [column for column in data.columns if column != 'medv']
print(feature_columns)

# Packs the individual feature columns into one vector column named 'features'.
assembler = VectorAssembler(inputCols=feature_columns, outputCol='features')

['crim', 'zn', 'indus', 'chas', 'nox', 'rm', 'age', 'dis', 'rad', 'tax', 'ptratio', 'b', 'lstat']


In [None]:
# Add one 'features' vector column (built from the feature columns) to the existing data.
data_2 = assembler.transform(data)

In [None]:
# Preview the data with the new 'features' column appended.
data_2.show()

+-------+----+-----+----+-----+-----+-----+------+---+---+-------+------+-----+----+--------------------+
|   crim|  zn|indus|chas|  nox|   rm|  age|   dis|rad|tax|ptratio|     b|lstat|medv|            features|
+-------+----+-----+----+-----+-----+-----+------+---+---+-------+------+-----+----+--------------------+
|0.00632|18.0| 2.31|   0|0.538|6.575| 65.2|  4.09|  1|296|   15.3| 396.9| 4.98|24.0|[0.00632,18.0,2.3...|
|0.02731| 0.0| 7.07|   0|0.469|6.421| 78.9|4.9671|  2|242|   17.8| 396.9| 9.14|21.6|[0.02731,0.0,7.07...|
|0.02729| 0.0| 7.07|   0|0.469|7.185| 61.1|4.9671|  2|242|   17.8|392.83| 4.03|34.7|[0.02729,0.0,7.07...|
|0.03237| 0.0| 2.18|   0|0.458|6.998| 45.8|6.0622|  3|222|   18.7|394.63| 2.94|33.4|[0.03237,0.0,2.18...|
|0.06905| 0.0| 2.18|   0|0.458|7.147| 54.2|6.0622|  3|222|   18.7| 396.9| 5.33|36.2|[0.06905,0.0,2.18...|
|0.02985| 0.0| 2.18|   0|0.458| 6.43| 58.7|6.0622|  3|222|   18.7|394.12| 5.21|28.7|[0.02985,0.0,2.18...|
|0.08829|12.5| 7.87|   0|0.524|6.012| 66.6|5.5

#### 훈련용/테스트용 데이터를 나누고 선형 회귀 모형 하나 만들어보기

In [None]:
# 70/30 train/test split. A fixed seed keeps the split, and therefore the metrics reported
# below, reproducible across notebook runs.
train, test = data_2.randomSplit([0.7, 0.3], seed=42)

In [None]:
from pyspark.ml.regression import LinearRegression

# Linear regression that predicts 'medv' from the assembled 'features' vector.
algo = LinearRegression(
    featuresCol='features',
    labelCol='medv',
)
model = algo.fit(train)  # fit on the training split only

#### 모델 성능 측정

In [None]:
# Evaluate the fitted model on the held-out test split; the result is a regression summary object.
evaluation_summary = model.evaluate(test)

In [None]:
# Display the summary object returned by evaluate().
evaluation_summary

<pyspark.ml.regression.LinearRegressionSummary at 0x7fdba9493d50>

In [None]:
# Root mean squared error of the model on the test split.
evaluation_summary.rootMeanSquaredError

4.359218466480736

#### model 예측값 살펴보기

In [None]:
# Run the model on the test split; the result carries an added 'prediction' column.
predictions = model.transform(test)

In [None]:
# Preview the test rows together with their predictions.
predictions.show()

+-------+----+-----+----+------+-----+----+-------+---+---+-------+------+-----+----+--------------------+------------------+
|   crim|  zn|indus|chas|   nox|   rm| age|    dis|rad|tax|ptratio|     b|lstat|medv|            features|        prediction|
+-------+----+-----+----+------+-----+----+-------+---+---+-------+------+-----+----+--------------------+------------------+
|0.01301|35.0| 1.52|   0| 0.442|7.241|49.3| 7.0379|  1|284|   15.5|394.74| 5.49|32.7|[0.01301,35.0,1.5...|29.873900593630974|
|0.01381|80.0| 0.46|   0| 0.422|7.875|32.0| 5.6484|  4|255|   14.4|394.23| 2.97|50.0|[0.01381,80.0,0.4...|  40.3686112207802|
|0.01538|90.0| 3.75|   0| 0.394|7.454|34.2| 6.3361|  3|244|   15.9|386.34| 3.11|44.0|[0.01538,90.0,3.7...|36.770086200606244|
|0.01709|90.0| 2.02|   0|  0.41|6.728|36.1|12.1265|  5|187|   17.0|384.46|  4.5|30.1|[0.01709,90.0,2.0...|24.570904476902797|
|0.01951|17.5| 1.38|   0|0.4161|7.104|59.5| 9.2229|  3|216|   18.6|393.24| 8.05|33.0|[0.01951,17.5,1.3...|23.182889227

In [None]:
# Compare the label with the model output. Columns are selected by name rather than by
# position (columns[-3:]), so the cell keeps working if columns are added or reordered.
predictions.select('medv', 'features', 'prediction').show()

+----+--------------------+------------------+
|medv|            features|        prediction|
+----+--------------------+------------------+
|32.7|[0.01301,35.0,1.5...|29.873900593630974|
|50.0|[0.01381,80.0,0.4...|  40.3686112207802|
|44.0|[0.01538,90.0,3.7...|36.770086200606244|
|30.1|[0.01709,90.0,2.0...|24.570904476902797|
|33.0|[0.01951,17.5,1.3...|23.182889227314572|
|20.1|[0.01965,80.0,1.7...| 18.75170016716872|
|50.0|[0.02009,95.0,2.6...| 42.57856869478566|
|34.7|[0.02729,0.0,7.07...|30.618238477436186|
|26.6|[0.02899,40.0,1.2...|21.322846557907404|
|28.7|[0.02985,0.0,2.18...| 25.47071521961968|
|34.9|[0.0315,95.0,1.47...| 28.96034981581137|
|20.9|[0.03548,80.0,3.6...|20.432226919129775|
|35.4|[0.03705,20.0,3.3...| 34.48611935028715|
|23.2|[0.03871,52.5,5.3...|26.709768397119014|
|20.6|[0.04294,28.0,15....|26.902257200910377|
|20.5|[0.04337,21.0,5.6...|23.926125562579315|
|20.6|[0.04527,0.0,11.9...|22.232853603622903|
|23.3|[0.0456,0.0,13.89...|  26.0111093631068|
|28.2|[0.0493

In [None]:
# Persist the fitted model to the working directory.
# write().overwrite() replaces an existing model directory; plain save() raises when the
# path already exists, which breaks re-running the notebook.
model.write().overwrite().save('boston_housing_model')

In [None]:
# Inspect the contents of the saved model directory.
!ls -tl boston_housing_model

total 8
drwxr-xr-x 2 root root 4096 Mar  7 14:58 data
drwxr-xr-x 2 root root 4096 Mar  7 14:58 metadata


In [None]:
# Mount Google Drive into the Colab filesystem; the mount point is /content/gdrive.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
# Save a copy of the model to Google Drive.
# Drive is mounted at /content/gdrive, so the path has to start there; a path under
# '/content/drive/...' would only create an ordinary local folder, not a Drive folder.
model_save_name = 'boston_housing_model'
path = f"/content/gdrive/MyDrive/{model_save_name}"
# overwrite() lets this cell be re-run when the model directory already exists.
model.write().overwrite().save(path)

In [None]:
# Reload the persisted model from `path`.
from pyspark.ml.regression import LinearRegressionModel

loaded_model = LinearRegressionModel.load(path)

In [None]:
# Print the schema of the test split (it includes the 'features' vector column).
test.printSchema()

root
 |-- crim: double (nullable = true)
 |-- zn: double (nullable = true)
 |-- indus: double (nullable = true)
 |-- chas: integer (nullable = true)
 |-- nox: double (nullable = true)
 |-- rm: double (nullable = true)
 |-- age: double (nullable = true)
 |-- dis: double (nullable = true)
 |-- rad: integer (nullable = true)
 |-- tax: integer (nullable = true)
 |-- ptratio: double (nullable = true)
 |-- b: double (nullable = true)
 |-- lstat: double (nullable = true)
 |-- medv: double (nullable = true)
 |-- features: vector (nullable = true)



In [None]:
# Show only the assembled feature vectors of the test split.
test.select('features').show()

+--------------------+
|            features|
+--------------------+
|[0.01301,35.0,1.5...|
|[0.01381,80.0,0.4...|
|[0.01538,90.0,3.7...|
|[0.01709,90.0,2.0...|
|[0.01951,17.5,1.3...|
|[0.01965,80.0,1.7...|
|[0.02009,95.0,2.6...|
|[0.02729,0.0,7.07...|
|[0.02899,40.0,1.2...|
|[0.02985,0.0,2.18...|
|[0.0315,95.0,1.47...|
|[0.03548,80.0,3.6...|
|[0.03705,20.0,3.3...|
|[0.03871,52.5,5.3...|
|[0.04294,28.0,15....|
|[0.04337,21.0,5.6...|
|[0.04527,0.0,11.9...|
|[0.0456,0.0,13.89...|
|[0.04932,33.0,2.1...|
|[0.05302,0.0,3.41...|
+--------------------+
only showing top 20 rows



In [None]:
# Predict on the same test split with the reloaded model.
predictions2 = loaded_model.transform(test)

In [None]:
# Label vs. prediction from the reloaded model. Columns are selected by name rather than
# by position (columns[-3:]), so the cell keeps working if columns are added or reordered.
predictions2.select('medv', 'features', 'prediction').show()

+----+--------------------+------------------+
|medv|            features|        prediction|
+----+--------------------+------------------+
|32.7|[0.01301,35.0,1.5...|29.873900593630974|
|50.0|[0.01381,80.0,0.4...|  40.3686112207802|
|44.0|[0.01538,90.0,3.7...|36.770086200606244|
|30.1|[0.01709,90.0,2.0...|24.570904476902797|
|33.0|[0.01951,17.5,1.3...|23.182889227314572|
|20.1|[0.01965,80.0,1.7...| 18.75170016716872|
|50.0|[0.02009,95.0,2.6...| 42.57856869478566|
|34.7|[0.02729,0.0,7.07...|30.618238477436186|
|26.6|[0.02899,40.0,1.2...|21.322846557907404|
|28.7|[0.02985,0.0,2.18...| 25.47071521961968|
|34.9|[0.0315,95.0,1.47...| 28.96034981581137|
|20.9|[0.03548,80.0,3.6...|20.432226919129775|
|35.4|[0.03705,20.0,3.3...| 34.48611935028715|
|23.2|[0.03871,52.5,5.3...|26.709768397119014|
|20.6|[0.04294,28.0,15....|26.902257200910377|
|20.5|[0.04337,21.0,5.6...|23.926125562579315|
|20.6|[0.04527,0.0,11.9...|22.232853603622903|
|23.3|[0.0456,0.0,13.89...|  26.0111093631068|
|28.2|[0.0493

### 타이타닉 생존 예측 모델 만들기

In [None]:
from pyspark.sql import SparkSession

# NOTE: getOrCreate() returns the session already running in this kernel if there is one,
# so this cell mainly matters when the Titanic section is run on its own.
spark = (
    SparkSession.builder
    .appName("Titanic Binary Classification example")  # app name typo fixed (was "Titinic")
    .getOrCreate()
)

In [None]:
# Display the active SparkSession object as the cell output.
spark

In [None]:
!wget https://s3-geospatial.s3-us-west-2.amazonaws.com/titanic.csv

--2022-03-07 15:23:12--  https://s3-geospatial.s3-us-west-2.amazonaws.com/titanic.csv
Resolving s3-geospatial.s3-us-west-2.amazonaws.com (s3-geospatial.s3-us-west-2.amazonaws.com)... 52.218.137.129
Connecting to s3-geospatial.s3-us-west-2.amazonaws.com (s3-geospatial.s3-us-west-2.amazonaws.com)|52.218.137.129|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 61197 (60K) [text/csv]
Saving to: ‘titanic.csv’


2022-03-07 15:23:12 (2.29 MB/s) - ‘titanic.csv’ saved [61197/61197]



In [None]:
# List the working directory.
!ls

boston_housing.csv    drive   sample_data
boston_housing_model  gdrive  titanic.csv


In [None]:
# Load the Titanic CSV (header row, inferred column types).
# NOTE(review): this rebinds `data`, which held the Boston housing DataFrame earlier in the notebook.
data = spark.read.csv("titanic.csv", header=True, inferSchema=True)

In [None]:
# Print the column names and the types Spark inferred for them.
data.printSchema()

root
 |-- PassengerId: integer (nullable = true)
 |-- Survived: integer (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- SibSp: integer (nullable = true)
 |-- Parch: integer (nullable = true)
 |-- Ticket: string (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Cabin: string (nullable = true)
 |-- Embarked: string (nullable = true)



In [None]:
# Preview the first rows of the Titanic data.
data.show()

+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|PassengerId|Survived|Pclass|                Name|Gender| Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|          1|       0|     3|Braund, Mr. Owen ...|  male|22.0|    1|    0|       A/5 21171|   7.25| null|       S|
|          2|       1|     1|Cumings, Mrs. Joh...|female|38.0|    1|    0|        PC 17599|71.2833|  C85|       C|
|          3|       1|     3|Heikkinen, Miss. ...|female|26.0|    0|    0|STON/O2. 3101282|  7.925| null|       S|
|          4|       1|     1|Futrelle, Mrs. Ja...|female|35.0|    1|    0|          113803|   53.1| C123|       S|
|          5|       0|     3|Allen, Mr. Willia...|  male|35.0|    0|    0|          373450|   8.05| null|       S|
|          6|       0|     3|    Moran, Mr. James|  male|null|    0|    0|      

In [None]:
# Summary statistics for every column; the per-column count reveals missing values.
data.describe().show()

+-------+-----------------+-------------------+------------------+--------------------+------+------------------+------------------+-------------------+------------------+-----------------+-----+--------+
|summary|      PassengerId|           Survived|            Pclass|                Name|Gender|               Age|             SibSp|              Parch|            Ticket|             Fare|Cabin|Embarked|
+-------+-----------------+-------------------+------------------+--------------------+------+------------------+------------------+-------------------+------------------+-----------------+-----+--------+
|  count|              891|                891|               891|                 891|   891|               714|               891|                891|               891|              891|  204|     889|
|   mean|            446.0| 0.3838383838383838| 2.308641975308642|                null|  null| 29.69911764705882|0.5230078563411896|0.38159371492704824|260318.54916792738| 32.20420

#### data cleanup
- PassengerId, Name, Ticket, Embarked 제거
- Cabin도 결측치가 많아 사용하지 않을 예정
- Age는 중요한 정보인데 비어 있는 레코드가 있어서 디폴트 값을 채워줄 예정
- Gender는 카테고리 정보라 숫자로 인코딩할 예정


In [None]:
# Keep only the label ('Survived') and the columns that will be used as model inputs.
final_data = data.select(
    'Survived', 'Pclass', 'Gender', 'Age', 'SibSp', 'Parch', 'Fare'
)

In [None]:
# Preview the reduced column set.
final_data.show()

+--------+------+------+----+-----+-----+-------+
|Survived|Pclass|Gender| Age|SibSp|Parch|   Fare|
+--------+------+------+----+-----+-----+-------+
|       0|     3|  male|22.0|    1|    0|   7.25|
|       1|     1|female|38.0|    1|    0|71.2833|
|       1|     3|female|26.0|    0|    0|  7.925|
|       1|     1|female|35.0|    1|    0|   53.1|
|       0|     3|  male|35.0|    0|    0|   8.05|
|       0|     3|  male|null|    0|    0| 8.4583|
|       0|     1|  male|54.0|    0|    0|51.8625|
|       0|     3|  male| 2.0|    3|    1| 21.075|
|       1|     3|female|27.0|    0|    2|11.1333|
|       1|     2|female|14.0|    1|    0|30.0708|
|       1|     3|female| 4.0|    1|    1|   16.7|
|       1|     1|female|58.0|    0|    0|  26.55|
|       0|     3|  male|20.0|    0|    0|   8.05|
|       0|     3|  male|39.0|    1|    5| 31.275|
|       0|     3|female|14.0|    0|    0| 7.8542|
|       1|     2|female|55.0|    0|    0|   16.0|
|       0|     3|  male| 2.0|    4|    1| 29.125|


#### 1. Age를 평균으로 채운다.

In [None]:
from pyspark.ml.feature import Imputer

# Fill missing 'Age' values with the column mean, written to a new 'AgeImputed' column.
imputer = Imputer(
    strategy='mean',
    inputCols=['Age'],
    outputCols=['AgeImputed'],
)
imputer_model = imputer.fit(final_data)
final_data = imputer_model.transform(final_data)

In [None]:
# Compare the original 'Age' with 'AgeImputed' side by side.
final_data.select('Age', 'AgeImputed').show()

+----+-----------------+
| Age|       AgeImputed|
+----+-----------------+
|22.0|             22.0|
|38.0|             38.0|
|26.0|             26.0|
|35.0|             35.0|
|35.0|             35.0|
|null|29.69911764705882|
|54.0|             54.0|
| 2.0|              2.0|
|27.0|             27.0|
|14.0|             14.0|
| 4.0|              4.0|
|58.0|             58.0|
|20.0|             20.0|
|39.0|             39.0|
|14.0|             14.0|
|55.0|             55.0|
| 2.0|              2.0|
|null|29.69911764705882|
|31.0|             31.0|
|null|29.69911764705882|
+----+-----------------+
only showing top 20 rows



#### 성별 정보 인덱싱

In [None]:
from pyspark.ml.feature import StringIndexer

# Encode the string 'Gender' column as a numeric 'GenderIndexed' column.
gender_indexer = StringIndexer(
    inputCols=['Gender'],
    outputCols=['GenderIndexed'],
)
gender_indexer_model = gender_indexer.fit(final_data)
final_data = gender_indexer_model.transform(final_data)

# Show the original value next to its numeric index.
final_data.select('Gender', 'GenderIndexed').show()

+------+-------------+
|Gender|GenderIndexed|
+------+-------------+
|  male|          0.0|
|female|          1.0|
|female|          1.0|
|female|          1.0|
|  male|          0.0|
|  male|          0.0|
|  male|          0.0|
|  male|          0.0|
|female|          1.0|
|female|          1.0|
|female|          1.0|
|female|          1.0|
|  male|          0.0|
|  male|          0.0|
|female|          1.0|
|female|          1.0|
|  male|          0.0|
|  male|          0.0|
|female|          1.0|
|female|          1.0|
+------+-------------+
only showing top 20 rows



### 피쳐 벡터 만들기

In [None]:
from pyspark.ml.feature import VectorAssembler

# Combine the numeric inputs (using the imputed age and indexed gender) into one
# 'Features' vector column.
assembler = VectorAssembler(
    inputCols=[
        'Pclass',
        'GenderIndexed',
        'AgeImputed',
        'SibSp',
        'Parch',
        'Fare',
    ],
    outputCol='Features',
)
data_vec = assembler.transform(final_data)

In [None]:
# Preview the data with the assembled 'Features' column.
data_vec.show()

+--------+------+------+----+-----+-----+-------+-----------------+-------------+--------------------+
|Survived|Pclass|Gender| Age|SibSp|Parch|   Fare|       AgeImputed|GenderIndexed|            Features|
+--------+------+------+----+-----+-----+-------+-----------------+-------------+--------------------+
|       0|     3|  male|22.0|    1|    0|   7.25|             22.0|          0.0|[3.0,0.0,22.0,1.0...|
|       1|     1|female|38.0|    1|    0|71.2833|             38.0|          1.0|[1.0,1.0,38.0,1.0...|
|       1|     3|female|26.0|    0|    0|  7.925|             26.0|          1.0|[3.0,1.0,26.0,0.0...|
|       1|     1|female|35.0|    1|    0|   53.1|             35.0|          1.0|[1.0,1.0,35.0,1.0...|
|       0|     3|  male|35.0|    0|    0|   8.05|             35.0|          0.0|[3.0,0.0,35.0,0.0...|
|       0|     3|  male|null|    0|    0| 8.4583|29.69911764705882|          0.0|[3.0,0.0,29.69911...|
|       0|     1|  male|54.0|    0|    0|51.8625|             54.0|      

assembler는 제일 마지막에 하는 작업이다.

#### 훈련용/테스트용 데이터를 나누고 바이너리 클래스 분류 모델을 하나 만든다.

In [None]:
# 70/30 train/test split. A fixed seed keeps the split, and therefore the evaluation
# result, reproducible across notebook runs.
train, test = data_vec.randomSplit([0.7, 0.3], seed=42)

In [None]:
from pyspark.ml.classification import LogisticRegression

# Binary classifier that predicts 'Survived' from the 'Features' vector.
algo = LogisticRegression(
    featuresCol='Features',
    labelCol='Survived',
)
model = algo.fit(train)  # fit on the training split only

In [None]:
# Apply the fitted classifier to the test split.
predictions = model.transform(test)

In [None]:
# Preview the test rows with the added rawPrediction, probability and prediction columns.
predictions.show()

+--------+------+------+----+-----+-----+-------+-----------------+-------------+--------------------+--------------------+--------------------+----------+
|Survived|Pclass|Gender| Age|SibSp|Parch|   Fare|       AgeImputed|GenderIndexed|            Features|       rawPrediction|         probability|prediction|
+--------+------+------+----+-----+-----+-------+-----------------+-------------+--------------------+--------------------+--------------------+----------+
|       0|     1|female|50.0|    0|    0|28.7125|             50.0|          1.0|[1.0,1.0,50.0,0.0...|[-2.1063041162026...|[0.10848560273159...|       1.0|
|       0|     1|  male|null|    0|    0|    0.0|29.69911764705882|          0.0|(6,[0,2],[1.0,29....|[-0.0729411473265...|[0.48177279380416...|       1.0|
|       0|     1|  male|null|    0|    0| 25.925|29.69911764705882|          0.0|[1.0,0.0,29.69911...|[-0.0638365811813...|[0.48404627210186...|       1.0|
|       0|     1|  male|null|    0|    0|   26.0|29.699117647058

In [None]:
# Compare the true label with the predicted class and the class probabilities.
# The column is spelled 'prediction', exactly as the model emits it; 'Prediction' only
# resolves while Spark's column lookup is case-insensitive (the default setting).
predictions.select('Survived', 'prediction', 'probability').show()

+--------+----------+--------------------+
|Survived|Prediction|         probability|
+--------+----------+--------------------+
|       0|       1.0|[0.10848560273159...|
|       0|       1.0|[0.48177279380416...|
|       0|       1.0|[0.48404627210186...|
|       0|       1.0|[0.48405285018822...|
|       0|       1.0|[0.48420377931497...|
|       0|       0.0|[0.68231428314977...|
|       0|       1.0|[0.44105413428493...|
|       0|       0.0|[0.51475551564118...|
|       0|       0.0|[0.62033220521631...|
|       0|       0.0|[0.62746731170902...|
|       0|       0.0|[0.57297306682432...|
|       0|       0.0|[0.58496235063323...|
|       0|       0.0|[0.74990679192955...|
|       0|       0.0|[0.73792630844369...|
|       0|       0.0|[0.71931574055615...|
|       0|       0.0|[0.72620353743468...|
|       0|       0.0|[0.75616999943994...|
|       0|       0.0|[0.77042238318069...|
|       0|       1.0|[0.11883838448550...|
|       0|       1.0|[0.18538625411278...|
+--------+-

In [None]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Area under the ROC curve on the test predictions. rawPredictionCol and metricName are
# spelled out for readability; both are the evaluator's default values.
evaluator = BinaryClassificationEvaluator(
    rawPredictionCol='rawPrediction',
    labelCol='Survived',
    metricName='areaUnderROC',
)
evaluator.evaluate(predictions)

NameError: ignored