# APACHE SPARK

### Dzień 2

#### Spark SQL + Spark ML

<br><br>
**ML**
- Transformer - algorytm przekształcający wejściowy DF w inny DF, np. wytrenowany model ML tworzący nowy DF zawierający predykcje (transform)
- Estymator - algorytm, który na podstawie DF tworzy transformer (fit)
- Pipeline - szeregowe połączenie transformerów i estymatorów w celu utworzenia przepływu (workflow)

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as f

In [2]:
# Create (or reuse) the Spark session for this notebook, running with the
# "local[*]" master under the application name 'my_app'.
spark = (
    SparkSession.builder
    .appName('my_app')
    .master("local[*]")
    .getOrCreate()
)

### Wektory

In [3]:
from pyspark.ml.linalg import Vectors
from pyspark.ml import feature

**Dwa typy wektorów:**
- sparse - większość wartości to zera, więc w celu optymalizacji zajmowanej pamięci podawane są tylko indeksy (wraz z wartościami), gdzie wartość != 0
- dense - podane są wszystkie wartości

In [4]:
# Five sample rows: a 4-element feature vector (given in sparse or dense
# form), a categorical string and an integer.
data = [
    (Vectors.sparse(4, [(0, 1.0), (3, -2.0)]), "A", 1),
    (Vectors.dense([4.0, 5.0, 0.0, 3.0]), "B", 6),
    (Vectors.dense([6.0, 7.0, 0.0, 8.0]), "A", 3),
    (Vectors.sparse(4, [(0, 9.0), (3, 1.0)]), "B", 2),
    (Vectors.sparse(4, [(1, 1.0), (2, 2.0)]), "C", 4),
]
dummy_df = spark.createDataFrame(
    data,
    schema=["features", "categ", "num"],
)
dummy_df.show()

+--------------------+-----+---+
|            features|categ|num|
+--------------------+-----+---+
|(4,[0,3],[1.0,-2.0])|    A|  1|
|   [4.0,5.0,0.0,3.0]|    B|  6|
|   [6.0,7.0,0.0,8.0]|    A|  3|
| (4,[0,3],[9.0,1.0])|    B|  2|
| (4,[1,2],[1.0,2.0])|    C|  4|
+--------------------+-----+---+



In [5]:
# Print the column names and data types of dummy_df.
dummy_df.printSchema()

root
 |-- features: vector (nullable = true)
 |-- categ: string (nullable = true)
 |-- num: long (nullable = true)



**Przechodzenie od kolumny kategorycznej do wektora**

In [6]:
# Estimator that encodes the string column "categ" as a numeric index
# stored in "categIndex".
indexer = feature.StringIndexer(
    inputCol="categ",
    outputCol="categIndex",
)

In [7]:
# Fit the indexer on dummy_df; the result is a transformer (fitted model).
IDXmodel = indexer.fit(dummy_df)

In [8]:
# Apply the fitted indexer: adds the "categIndex" column to the data.
dummy_df1 = IDXmodel.transform(dummy_df)
dummy_df1.show()

+--------------------+-----+---+----------+
|            features|categ|num|categIndex|
+--------------------+-----+---+----------+
|(4,[0,3],[1.0,-2.0])|    A|  1|       0.0|
|   [4.0,5.0,0.0,3.0]|    B|  6|       1.0|
|   [6.0,7.0,0.0,8.0]|    A|  3|       0.0|
| (4,[0,3],[9.0,1.0])|    B|  2|       1.0|
| (4,[1,2],[1.0,2.0])|    C|  4|       2.0|
+--------------------+-----+---+----------+



In [9]:
# One-hot encoder for the indexed category column.
# (In Spark 2.x the equivalent class was named OneHotEncoderEstimator.)
OHencoder = feature.OneHotEncoder(
    inputCols=["categIndex"],
    outputCols=["categVect"],
)

In [10]:
# Fit the one-hot encoder on the indexed data.
OHmodel = OHencoder.fit(dummy_df1)

In [11]:
# Apply the fitted encoder: adds the one-hot vector column "categVect".
dummy_df2 = OHmodel.transform(dummy_df1)
dummy_df2.show()

+--------------------+-----+---+----------+-------------+
|            features|categ|num|categIndex|    categVect|
+--------------------+-----+---+----------+-------------+
|(4,[0,3],[1.0,-2.0])|    A|  1|       0.0|(2,[0],[1.0])|
|   [4.0,5.0,0.0,3.0]|    B|  6|       1.0|(2,[1],[1.0])|
|   [6.0,7.0,0.0,8.0]|    A|  3|       0.0|(2,[0],[1.0])|
| (4,[0,3],[9.0,1.0])|    B|  2|       1.0|(2,[1],[1.0])|
| (4,[1,2],[1.0,2.0])|    C|  4|       2.0|    (2,[],[])|
+--------------------+-----+---+----------+-------------+



**Łączenie zmiennych w wektory**

In [12]:
# Concatenate the original vector, the one-hot vector and the numeric column
# (in that order) into a single vector column "featuresFull".
vectAssembler = feature.VectorAssembler(
    inputCols=["features", "categVect", "num"],
    outputCol="featuresFull",
)
dummy_df3 = vectAssembler.transform(dummy_df2)
dummy_df3.show(truncate=False)

+--------------------+-----+---+----------+-------------+------------------------------+
|features            |categ|num|categIndex|categVect    |featuresFull                  |
+--------------------+-----+---+----------+-------------+------------------------------+
|(4,[0,3],[1.0,-2.0])|A    |1  |0.0       |(2,[0],[1.0])|[1.0,0.0,0.0,-2.0,1.0,0.0,1.0]|
|[4.0,5.0,0.0,3.0]   |B    |6  |1.0       |(2,[1],[1.0])|[4.0,5.0,0.0,3.0,0.0,1.0,6.0] |
|[6.0,7.0,0.0,8.0]   |A    |3  |0.0       |(2,[0],[1.0])|[6.0,7.0,0.0,8.0,1.0,0.0,3.0] |
|(4,[0,3],[9.0,1.0]) |B    |2  |1.0       |(2,[1],[1.0])|[9.0,0.0,0.0,1.0,0.0,1.0,2.0] |
|(4,[1,2],[1.0,2.0]) |C    |4  |2.0       |(2,[],[])    |(7,[1,2,6],[1.0,2.0,4.0])     |
+--------------------+-----+---+----------+-------------+------------------------------+



**Alternatywne podejście**

In [13]:
# Alternative to the indexer/encoder/assembler steps: an R-style formula
# without a label side, writing the assembled vector to "featuresFull".
rf = feature.RFormula(formula="~ features + categ + num", featuresCol='featuresFull')

In [14]:
# Fit the formula on the raw dummy_df (no prior indexing/encoding needed).
rfModel = rf.fit(dummy_df)

In [15]:
# Show the transformed data with untruncated cell contents.
rfModel.transform(dummy_df).show(truncate=False)

+--------------------+-----+---+------------------------------+
|features            |categ|num|featuresFull                  |
+--------------------+-----+---+------------------------------+
|(4,[0,3],[1.0,-2.0])|A    |1  |[1.0,0.0,0.0,-2.0,1.0,0.0,1.0]|
|[4.0,5.0,0.0,3.0]   |B    |6  |[4.0,5.0,0.0,3.0,0.0,1.0,6.0] |
|[6.0,7.0,0.0,8.0]   |A    |3  |[6.0,7.0,0.0,8.0,1.0,0.0,3.0] |
|(4,[0,3],[9.0,1.0]) |B    |2  |[9.0,0.0,0.0,1.0,0.0,1.0,2.0] |
|(4,[1,2],[1.0,2.0]) |C    |4  |(7,[1,2,6],[1.0,2.0,4.0])     |
+--------------------+-----+---+------------------------------+



**Skalowanie zmiennych**

In [16]:
# Standard scaler reading "featuresFull" and writing "featuresScal".
scaler = feature.StandardScaler(inputCol="featuresFull", outputCol="featuresScal")

In [17]:
# Fit the scaler on the assembled features.
scalerModel = scaler.fit(dummy_df3)

In [18]:
# Show the input vectors, then the scaled vectors, for comparison.
dummy_df3.select("featuresFull").show(truncate=False)
scalerModel.transform(dummy_df3).select("featuresScal").show(truncate=False)

+------------------------------+
|featuresFull                  |
+------------------------------+
|[1.0,0.0,0.0,-2.0,1.0,0.0,1.0]|
|[4.0,5.0,0.0,3.0,0.0,1.0,6.0] |
|[6.0,7.0,0.0,8.0,1.0,0.0,3.0] |
|[9.0,0.0,0.0,1.0,0.0,1.0,2.0] |
|(7,[1,2,6],[1.0,2.0,4.0])     |
+------------------------------+

+-------------------------------------------------------------------------------------------------------+
|featuresScal                                                                                           |
+-------------------------------------------------------------------------------------------------------+
|[0.2721655269759087,0.0,0.0,-0.5252257314388902,1.8257418583505538,0.0,0.5198752449100363]             |
|[1.0886621079036347,1.5579423821243896,0.0,0.7878385971583353,0.0,1.8257418583505538,3.119251469460218]|
|[1.632993161855452,2.1811193349741456,0.0,2.1009029257555607,1.8257418583505538,0.0,1.559625734730109] |
|[2.449489742783178,0.0,0.0,0.2626128657194451,0.0,1.8257418583505

Skalowanie min-max *(zera nie są zachowywane, więc wektor rzadki może po transformacji stać się gęsty — por. ostatni wiersz wyniku)*

In [19]:
# Min-max scaler reading "featuresFull" and writing "featuresScal".
MMscaler = feature.MinMaxScaler(inputCol="featuresFull", outputCol="featuresScal")

In [20]:
# Fit the min-max scaler on the assembled features.
MMscalerModel = MMscaler.fit(dummy_df3)

In [21]:
# Show the input vectors, then the min-max scaled vectors, for comparison.
dummy_df3.select("featuresFull").show(truncate=False)
MMscalerModel.transform(dummy_df3).select("featuresScal").show(truncate=False)

+------------------------------+
|featuresFull                  |
+------------------------------+
|[1.0,0.0,0.0,-2.0,1.0,0.0,1.0]|
|[4.0,5.0,0.0,3.0,0.0,1.0,6.0] |
|[6.0,7.0,0.0,8.0,1.0,0.0,3.0] |
|[9.0,0.0,0.0,1.0,0.0,1.0,2.0] |
|(7,[1,2,6],[1.0,2.0,4.0])     |
+------------------------------+

+------------------------------------------------------------+
|featuresScal                                                |
+------------------------------------------------------------+
|(7,[0,4],[0.1111111111111111,1.0])                          |
|[0.4444444444444444,0.7142857142857142,0.0,0.5,0.0,1.0,1.0] |
|[0.6666666666666666,1.0,0.0,1.0,1.0,0.0,0.4]                |
|[1.0,0.0,0.0,0.30000000000000004,0.0,1.0,0.2]               |
|[0.0,0.14285714285714285,1.0,0.2,0.0,0.0,0.6000000000000001]|
+------------------------------------------------------------+



**PCA**

In [22]:
# PCA estimator projecting "featuresFull" onto k=3 components ("featuresPCA").
pca = feature.PCA(k=3, inputCol="featuresFull", outputCol="featuresPCA")

In [23]:
# Fit PCA on the assembled features.
PCAmodel = pca.fit(dummy_df3)

In [24]:
# Show each input vector next to its 3-component projection.
PCAmodel.transform(dummy_df3).select("featuresFull", "featuresPCA").show(truncate=False)

+------------------------------+--------------------------------------------------------------+
|featuresFull                  |featuresPCA                                                   |
+------------------------------+--------------------------------------------------------------+
|[1.0,0.0,0.0,-2.0,1.0,0.0,1.0]|[0.8308626363574695,0.8263675323478626,-1.3110493279251592]   |
|[4.0,5.0,0.0,3.0,0.0,1.0,6.0] |[-7.369784494423485,-0.85243993577948,-5.302300671552502]     |
|[6.0,7.0,0.0,8.0,1.0,0.0,3.0] |[-12.433855979366028,0.006784898917441828,-1.2633329470730525]|
|[9.0,0.0,0.0,1.0,0.0,1.0,2.0] |[-4.867266422112706,6.862745746174966,-3.6581085992457143]    |
|(7,[1,2,6],[1.0,2.0,4.0])     |[-0.9319103583071204,-1.8364015478262747,-3.568130080877007]  |
+------------------------------+--------------------------------------------------------------+



In [25]:
# Display the explained-variance vector of the fitted PCA model
# (one entry per component).
PCAmodel.explainedVariance

DenseVector([0.6419, 0.27, 0.0687])

### Pipeline

In [26]:
from pyspark.ml import Pipeline

In [27]:
# Prepare the estimators/transformers that will become pipeline stages.
# Note the assembler order here is features, num, categVect.
indexer = feature.StringIndexer(
    inputCol="categ",
    outputCol="categIndex",
)
OHencoder = feature.OneHotEncoder(
    inputCols=["categIndex"],
    outputCols=["categVect"],
)
vectAssembler = feature.VectorAssembler(
    inputCols=["features", "num", "categVect"],
    outputCol="featuresFull",
)
scaler = feature.StandardScaler(
    inputCol="featuresFull",
    outputCol="featuresScal",
)

In [28]:
# Create the estimator: a pipeline chaining the four stages in order.
pipeline = Pipeline(stages=[indexer, OHencoder, vectAssembler, scaler])

In [29]:
# Create the transformer: fitting the pipeline yields a fitted pipeline model.
pipelineModel = pipeline.fit(dummy_df)

In [30]:
# Transformation: run all fitted stages and show the scaled feature vectors.
pipelineModel.transform(dummy_df).select("featuresScal").show(truncate=False)

+-------------------------------------------------------------------------------------------------------+
|featuresScal                                                                                           |
+-------------------------------------------------------------------------------------------------------+
|[0.2721655269759087,0.0,0.0,-0.5252257314388902,0.5198752449100363,1.8257418583505538,0.0]             |
|[1.0886621079036347,1.5579423821243896,0.0,0.7878385971583353,3.119251469460218,0.0,1.8257418583505538]|
|[1.632993161855452,2.1811193349741456,0.0,2.1009029257555607,1.559625734730109,1.8257418583505538,0.0] |
|[2.449489742783178,0.0,0.0,0.2626128657194451,1.0397504898200727,0.0,1.8257418583505538]               |
|(7,[1,2,4],[0.3115884764248779,2.23606797749979,2.0795009796401454])                                   |
+-------------------------------------------------------------------------------------------------------+



In [31]:
# Persist the fitted pipeline under the path "pipe".
# write().overwrite() is used instead of a plain save() so that re-running
# this cell does not fail when the "pipe" directory already exists.
pipelineModel.write().overwrite().save("pipe")

In [32]:
from pyspark.ml import PipelineModel

In [33]:
# Load the previously saved fitted pipeline from the path "pipe".
pip = PipelineModel.load("pipe")

In [34]:
# Apply the reloaded pipeline; the output should match the in-memory model.
pip.transform(dummy_df).select("featuresScal").show(truncate=False)

+-------------------------------------------------------------------------------------------------------+
|featuresScal                                                                                           |
+-------------------------------------------------------------------------------------------------------+
|[0.2721655269759087,0.0,0.0,-0.5252257314388902,0.5198752449100363,1.8257418583505538,0.0]             |
|[1.0886621079036347,1.5579423821243896,0.0,0.7878385971583353,3.119251469460218,0.0,1.8257418583505538]|
|[1.632993161855452,2.1811193349741456,0.0,2.1009029257555607,1.559625734730109,1.8257418583505538,0.0] |
|[2.449489742783178,0.0,0.0,0.2626128657194451,1.0397504898200727,0.0,1.8257418583505538]               |
|(7,[1,2,4],[0.3115884764248779,2.23606797749979,2.0795009796401454])                                   |
+-------------------------------------------------------------------------------------------------------+



> **ZADANIE:**
- przygotuj poniższe dane
- usuń wiersze zawierające braki danych
- stwórz kolumnę zawierającą miesiąc wyciągnięty z kolumny `start_time`
- stwórz kolumnę zawierającą informację o godzinie, o której wystąpiło wypożyczenie
- stwórz kolumnę zawierającą informacje o przedziale wiekowym wypożyczającego (przedziały: <20, 20-40, 40-60, 60<)
- zaokrąglij do jednego miejsca po przecinku wartości w kolumnach `start_station_longitude` oraz `start_station_latitude`
- usuń kolumny: `start_time`, `end_time`, `start_station_name`, `start_station_id`, `end_station_id`, `end_station_name`, `end_station_latitude`, `end_station_longitude`, `member_birth_year`, `bike_id`
- zmień nazwę kolumny `duration_sec` na `label`
- z pozostałych zmiennych stwórz kolumnę `features` zawierającą wektory
- wynikowemu DataFrameowi nadaj nazwę `goBike_processed`

In [35]:
# Load the trip data CSV; the first row is a header and column types are
# inferred from the data.
goBike = spark.read.csv("./2017-fordgobike-tripdata.csv", header=True, inferSchema=True)

In [36]:
# Print the inferred column names and types.
goBike.printSchema()

root
 |-- duration_sec: integer (nullable = true)
 |-- start_time: string (nullable = true)
 |-- end_time: string (nullable = true)
 |-- start_station_id: integer (nullable = true)
 |-- start_station_name: string (nullable = true)
 |-- start_station_latitude: double (nullable = true)
 |-- start_station_longitude: double (nullable = true)
 |-- end_station_id: integer (nullable = true)
 |-- end_station_name: string (nullable = true)
 |-- end_station_latitude: double (nullable = true)
 |-- end_station_longitude: double (nullable = true)
 |-- bike_id: integer (nullable = true)
 |-- user_type: string (nullable = true)
 |-- member_birth_year: integer (nullable = true)
 |-- member_gender: string (nullable = true)



In [37]:
# Drop every row that has a null in any column.
goBike_processed = goBike.dropna("any")

In [38]:
# Derive the month and the hour of the rental from "start_time".
goBike_processed = (
    goBike_processed
    .withColumn("month", f.month("start_time"))
    .withColumn("hour", f.hour("start_time"))
)

In [39]:
# Age expression: 2017 minus the member's birth year.
age = 2017 - f.col("member_birth_year")

# Label each row with an age bucket. The conditions are checked in order, so
# age 20 falls into "20-40", and ages 40 and 60 fall into the lower of the
# two adjacent buckets.
goBike_processed = goBike_processed.withColumn(
    "age_bucket",
    f.when(age < 20, "<20")
    .when(age <= 40, "20-40")
    .when(age <= 60, "40-60")
    .otherwise("60<"),
)

In [40]:
# Round the start-station coordinates to one decimal place, in place.
goBike_processed = (
    goBike_processed
    .withColumn("start_station_longitude", f.round("start_station_longitude", 1))
    .withColumn("start_station_latitude", f.round("start_station_latitude", 1))
)

In [41]:
# Remove the columns that are not used as model inputs.
goBike_processed = goBike_processed.drop(
    *[
        "start_time",
        "end_time",
        "start_station_name",
        "start_station_id",
        "end_station_id",
        "end_station_name",
        "end_station_latitude",
        "end_station_longitude",
        "member_birth_year",
        "bike_id",
    ]
)

In [42]:
# Preview the prepared data.
goBike_processed.show()

+------------+----------------------+-----------------------+----------+-------------+-----+----+----------+
|duration_sec|start_station_latitude|start_station_longitude| user_type|member_gender|month|hour|age_bucket|
+------------+----------------------+-----------------------+----------+-------------+-----+----+----------+
|       80110|                  37.8|                 -122.4|  Customer|         Male|   12|  16|     20-40|
|       78800|                  37.8|                 -122.4|  Customer|       Female|   12|  15|     40-60|
|       43603|                  37.9|                 -122.3|Subscriber|       Female|   12|  14|     20-40|
|        4507|                  37.9|                 -122.3|  Customer|       Female|   12|  23|     20-40|
|        2183|                  37.8|                 -122.4|Subscriber|         Male|   12|  23|     20-40|
|        2170|                  37.8|                 -122.4|Subscriber|         Male|   12|  23|     20-40|
|        1544|     

In [43]:
# Cast the rounded coordinates, month and hour to strings; the other selected
# columns are passed through unchanged. Presumably this makes RFormula treat
# them as categorical - verify against the RFormula documentation.
goBike_processed = goBike_processed.select(
    "duration_sec",
    f.col("start_station_latitude").cast("string"),
    f.col("start_station_longitude").cast("string"),
    "user_type",
    "member_gender",
    f.col("month").cast("string"),
    f.col("hour").cast("string"),
    "age_bucket",
)

In [44]:
# Build the R-style formula: duration_sec is the response, the joined terms
# are the predictors.
formula = "duration_sec ~ " + " + ".join(
    [
        "user_type",
        "member_gender",
        "age_bucket",
        "start_station_latitude",
        "start_station_longitude",
        "month",
        "hour",
    ]
)
rf = feature.RFormula(formula=formula)

In [45]:
# Fit the formula on the prepared trip data.
rfModel = rf.fit(goBike_processed)

In [46]:
# Show the resulting "label" and "features" columns without truncation.
rfModel.transform(goBike_processed).select("label", "features").show(truncate=False)

+-------+-----------------------------------------------------+
|label  |features                                             |
+-------+-----------------------------------------------------+
|80110.0|(42,[1,3,6,10,16,23],[1.0,1.0,1.0,1.0,1.0,1.0])      |
|78800.0|(42,[2,4,6,10,16,26],[1.0,1.0,1.0,1.0,1.0,1.0])      |
|43603.0|(42,[0,2,3,8,11,16,31],[1.0,1.0,1.0,1.0,1.0,1.0,1.0])|
|4507.0 |(42,[2,3,8,11,16,36],[1.0,1.0,1.0,1.0,1.0,1.0])      |
|2183.0 |(42,[0,1,3,6,10,16,36],[1.0,1.0,1.0,1.0,1.0,1.0,1.0])|
|2170.0 |(42,[0,1,3,6,10,16,36],[1.0,1.0,1.0,1.0,1.0,1.0,1.0])|
|1544.0 |(42,[0,2,3,6,10,16,36],[1.0,1.0,1.0,1.0,1.0,1.0,1.0])|
|1474.0 |(42,[0,1,3,6,10,16,36],[1.0,1.0,1.0,1.0,1.0,1.0,1.0])|
|1532.0 |(42,[0,3,6,10,16,36],[1.0,1.0,1.0,1.0,1.0,1.0])      |
|1216.0 |(42,[0,1,4,6,10,16,36],[1.0,1.0,1.0,1.0,1.0,1.0,1.0])|
|386.0  |(42,[0,1,3,7,12,16,36],[1.0,1.0,1.0,1.0,1.0,1.0,1.0])|
|422.0  |(42,[0,1,3,6,10,16,36],[1.0,1.0,1.0,1.0,1.0,1.0,1.0])|
|871.0  |(42,[0,1,3,6,10,16,36],[1.0,1.0

----------

In [47]:
# The same preparation expressed as one SQL statement for SQLTransformer;
# __THIS__ is the placeholder for the input DataFrame.
# The '40-60' bucket uses "<= 60" so that age 60 is bucketed exactly as in
# the DataFrame-API version above (the task's last bucket is "60<", i.e.
# strictly older than 60). The previous "< 60" put age 60 into '60<'.
q = ("select duration_sec, cast(month(start_time) as string) month, cast(hour(start_time) as string) hour, "
     "case when 2017 - member_birth_year < 20 then '<20' when 2017 - member_birth_year <= 40 then '20-40' "
     "when 2017 - member_birth_year <= 60 then '40-60' else '60<' end age_bucket, "
     "cast(round(start_station_latitude, 1) as string) start_station_latitude, "
     "cast(round(start_station_longitude, 1) as string) start_station_longitude, "
     "user_type, member_gender "
     "from __THIS__ "
     # Only these two columns are null-filtered here, unlike dropna("any").
     "where member_birth_year is not null and member_gender is not null")
sqlt = feature.SQLTransformer(statement=q)

In [48]:
# R-style formula: duration_sec is the response, the remaining columns
# produced by the SQL statement are the predictors.
formula = (
    "duration_sec ~ "
    "user_type + member_gender + age_bucket + "
    "start_station_latitude + start_station_longitude + "
    "month + hour"
)
rf = feature.RFormula(formula=formula)

In [49]:
# Chain the SQL preparation and the formula into one pipeline and fit it
# directly on the raw trip data.
pipe = Pipeline(stages=[sqlt, rf])
pipeM = pipe.fit(goBike)

In [50]:
# Apply the fitted pipeline to the raw data and preview label/features.
pipeM.transform(goBike).select("label", "features").show()

+-------+--------------------+
|  label|            features|
+-------+--------------------+
|80110.0|(43,[1,3,6,10,17,...|
|78800.0|(43,[2,4,6,10,17,...|
|43603.0|(43,[0,2,3,8,12,1...|
| 4507.0|(43,[2,3,8,12,17,...|
| 2183.0|(43,[0,1,3,6,10,1...|
| 2170.0|(43,[0,1,3,6,10,1...|
| 1544.0|(43,[0,2,3,6,10,1...|
| 1474.0|(43,[0,1,3,6,10,1...|
| 1532.0|(43,[0,3,6,10,17,...|
| 1216.0|(43,[0,1,4,6,10,1...|
|  386.0|(43,[0,1,3,7,11,1...|
|  422.0|(43,[0,1,3,6,10,1...|
|  871.0|(43,[0,1,3,6,10,1...|
|  733.0|(43,[0,2,3,6,10,1...|
|  781.0|(43,[2,3,8,12,17,...|
|  475.0|(43,[0,1,3,6,10,1...|
|  152.0|(43,[0,1,3,6,10,1...|
|  249.0|(43,[0,1,3,7,11,1...|
|  243.0|(43,[0,1,3,6,10,1...|
|  833.0|(43,[0,1,3,6,10,1...|
+-------+--------------------+
only showing top 20 rows



### Klasyfikacja

In [51]:
from pyspark.ml import classification

#### Dane

https://archive.ics.uci.edu/ml/datasets/adult

In [52]:
# Column names for the header-less adult data file, in file order.
col_names = [
    "age",
    "workclass",
    "fnlwgt",
    "education",
    "education-num",
    "marital-status",
    "occupation",
    "relationship",
    "race",
    "sex",
    "capital-gain",
    "capital-loss",
    "hours-per-week",
    "native-country",
    "earnings",
]

In [53]:
# Load the header-less adult data file with inferred column types, ignoring
# leading whitespace in each field.
# NOTE(review): the UCI adult data reportedly marks missing values with "?"
# rather than leaving them empty; if so they are read as ordinary strings and
# a later dropna() will not remove them - confirm, and consider nullValue="?".
df = spark.read.csv("./adult.data", header=False, inferSchema=True, ignoreLeadingWhiteSpace=True)

In [54]:
# Rename all columns positionally to col_names, drop the "fnlwgt" column and
# remove rows that contain a null in any column.
# toDF() replaces the hand-written zip/alias rename and raises if the number
# of names does not match the number of columns instead of silently
# truncating.
# NOTE(review): the train/eval counts shown further down sum to 32561 rows;
# if that is the full file, dropna() removed nothing (missing values in this
# data set are presumably encoded as "?") - confirm.
df = df.toDF(*col_names).drop("fnlwgt").dropna("any")

In [55]:
# Preview the first three records, one field per line.
df.show(3, vertical=True)

-RECORD 0----------------------------
 age            | 39                 
 workclass      | State-gov          
 education      | Bachelors          
 education-num  | 13                 
 marital-status | Never-married      
 occupation     | Adm-clerical       
 relationship   | Not-in-family      
 race           | White              
 sex            | Male               
 capital-gain   | 2174               
 capital-loss   | 0                  
 hours-per-week | 40                 
 native-country | United-States      
 earnings       | <=50K              
-RECORD 1----------------------------
 age            | 50                 
 workclass      | Self-emp-not-inc   
 education      | Bachelors          
 education-num  | 13                 
 marital-status | Married-civ-spouse 
 occupation     | Exec-managerial    
 relationship   | Husband            
 race           | White              
 sex            | Male               
 capital-gain   | 0                  
 capital-los

> **ZADANIE:**
- przygotuj dane
- podziel `df` na zbiór treningowy i ewaluacyjny
- na podstawie kolumny `earnings` stwórz zmienną celu `label` z wartościami zakodowanymi jako 0 i 1
- stwórz (przeskalowaną) kolumnę `features` zawierającą wektory powstałe na podstawie pozostałych kolumn
- wynikowe DFy nazwij `df_train` i `df_eval`

In [56]:
# Random 70/30 split into training and evaluation parts; the fixed seed
# keeps the split reproducible.
df_t, df_e = df.randomSplit([0.7, 0.3], seed=42)

In [57]:
# Formula with "earnings" as the response and every other column as a
# predictor; the unscaled vector goes to "featuresRaw".
# The formula is fitted on the training part only.
rf = feature.RFormula(
    formula="earnings ~ .",
    featuresCol='featuresRaw',
)
rfModel = rf.fit(df_t)

In [58]:
# Apply the fitted formula to both parts of the split.
df_train, df_eval = (rfModel.transform(part) for part in (df_t, df_e))

In [59]:
# Scale "featuresRaw" into "features". The scaler is fitted on the training
# part only and then applied to both parts.
scaler = feature.StandardScaler(
    inputCol="featuresRaw",
    outputCol="features",
)
scal_mod = scaler.fit(df_train)
df_train, df_eval = (
    scal_mod.transform(part) for part in (df_train, df_eval)
)

**Ostatnie przygotowania**

In [64]:
# Keep only the two columns the classifiers below are trained on.
df_train, df_eval = (
    part.select("label", "features") for part in (df_train, df_eval)
)

In [65]:
# Mark both DataFrames for caching; they are reused by every model below.
df_train.cache()
df_eval.cache()

DataFrame[label: double, features: vector]

In [66]:
# Print the number of rows per label value in each part of the split.
for _title, _frame in (("Train:", df_train), ("Eval:", df_eval)):
    print(_title)
    _frame.groupBy("label").count().show()

Train:
+-----+-----+
|label|count|
+-----+-----+
|  0.0|17414|
|  1.0| 5461|
+-----+-----+

Eval:
+-----+-----+
|label|count|
+-----+-----+
|  0.0| 7306|
|  1.0| 2380|
+-----+-----+



#### Regresja logistyczna

In [67]:
# Logistic regression estimator limited to 100 iterations.
lr = classification.LogisticRegression(maxIter=100)

In [68]:
# Train the logistic regression on the training set.
lrModel = lr.fit(df_train)

In [69]:
# Display the fitted coefficient vector.
lrModel.coefficients

DenseVector([0.3396, 1.228, 2.2866, 0.261, 0.3952, -0.9359, -0.665, -0.5509, -0.1002, -0.452, -0.333, -0.2747, -0.2223, 0.4265, 0.3536, 0.067, -0.0029, 0.1003, 0.2135, 0.0044, 0.2636, 0.2917, -0.0026, 0.2465, 0.1333, -0.038, 0.3165, 0.2322, -0.5753, -1.804, -1.1603, -0.617, -0.5888, -0.4037, -0.2159, 0.0001, -0.0874, -0.2346, -0.168, -0.4998, -0.2313, -0.6899, -0.205, -0.3164, -0.3172, -0.0365, -0.0494, -0.7145, 0.3619, 0.5864, 0.0227, 0.3736, 0.4245, 0.2002, 0.1286, 0.1182, -0.0081, 0.4113, -1.4262, -0.757, -0.6814, -0.3164, -0.3038, -0.2765, -0.3077, -0.2575, -0.2958, -0.2605, -0.2854, -0.3181, -0.2302, -0.273, -0.2852, -0.2807, -0.1979, -0.1842, -0.2828, -0.2797, -0.1969, -0.1972, -0.1796, -0.1788, -0.1952, -0.1665, -0.1413, -0.1636, -0.3715, -0.1156, -0.1163, -0.1565, -0.1119, -0.1255, -0.1401, -0.1342, -0.1106, -0.097, -0.2555, -0.1059, -0.2135])

In [70]:
# Display the fitted intercept.
lrModel.intercept

-2.2624443297205974

In [71]:
# Get the training summary of the fitted model and display its type.
trainingSummary = lrModel.summary
type(trainingSummary)

pyspark.ml.classification.BinaryLogisticRegressionTrainingSummary

In [72]:
# Show up to 120 rows of the ROC curve (columns FPR and TPR).
trainingSummary.roc.show(120)

+--------------------+--------------------+
|                 FPR|                 TPR|
+--------------------+--------------------+
|                 0.0|                 0.0|
|1.148501205926266...|0.040835011902581944|
|4.019754220741932E-4| 0.07782457425379967|
| 9.18800964741013E-4|  0.1138985533785021|
|0.001722751808889...|  0.1488738326313862|
|0.003502928678075112| 0.18165171214063358|
|0.005168255426668198|  0.2142464750045779|
|0.007005857356150224| 0.24684123786852225|
|0.009819685310669577| 0.27797106757004214|
|0.012461238084299988|  0.3094671305621681|
| 0.01579189158148616| 0.34078007690899104|
|0.019524520500746524|  0.3713605566746017|
|0.023371999540599517| 0.40102545321369715|
|0.027851154243711956|  0.4284929500091558|
| 0.03238773400712071| 0.45522798022340233|
| 0.03761341449408522|  0.4814136605017396|
| 0.04335592052371655|  0.5057681743270463|
|0.049041001493051566|   0.526826588536898|
| 0.05501320776386815|   0.548800585973265|
| 0.06035373837142529|  0.572971

In [73]:
# Convert the ROC curve to a pandas DataFrame for display.
trainingSummary.roc.toPandas()

Unnamed: 0,FPR,TPR
0,0.000000,0.000000
1,0.000115,0.040835
2,0.000402,0.077825
3,0.000919,0.113899
4,0.001723,0.148874
...,...,...
98,0.971977,1.000000
99,0.985529,1.000000
100,0.998105,1.000000
101,1.000000,1.000000


In [74]:
# Show up to 120 rows of the precision-recall curve.
trainingSummary.pr.show(120)

+--------------------+-------------------+
|              recall|          precision|
+--------------------+-------------------+
|                 0.0| 0.9911111111111112|
|0.040835011902581944| 0.9911111111111112|
| 0.07782457425379967| 0.9837962962962963|
|  0.1138985533785021| 0.9749216300940439|
|  0.1488738326313862| 0.9644128113879004|
| 0.18165171214063358| 0.9420702754036088|
|  0.2142464750045779| 0.9285714285714286|
| 0.24684123786852225| 0.9170068027210885|
| 0.27797106757004214| 0.8987566607460036|
|  0.3094671305621681| 0.8862087047718931|
| 0.34078007690899104| 0.8712546816479401|
|  0.3713605566746017|  0.856418918918919|
| 0.40102545321369715| 0.8432807085098191|
|  0.4284929500091558| 0.8283185840707965|
| 0.45522798022340233| 0.8150819672131148|
|  0.4814136605017396| 0.8005481120584653|
|  0.5057681743270463| 0.7853284048905317|
|   0.526826588536898| 0.7711069418386491|
|   0.548800585973265| 0.7577749683944375|
|  0.5729719831532686| 0.7485645933014354|
|  0.592931

In [75]:
# Display the area under the ROC curve from the training summary.
trainingSummary.areaUnderROC

0.9081972974910664

In [76]:
# Display the accuracy from the training summary.
trainingSummary.accuracy

0.851672131147541

In [77]:
# Preview the predictions stored in the training summary.
trainingSummary.predictions.show()

+-----+--------------------+--------------------+--------------------+----------+
|label|            features|       rawPrediction|         probability|prediction|
+-----+--------------------+--------------------+--------------------+----------+
|  0.0|(99,[0,1,4,8,20,2...|[7.66561280375108...|[0.99953155008331...|       0.0|
|  0.0|(99,[0,1,4,8,20,2...|[7.47337212749488...|[0.99943231278906...|       0.0|
|  0.0|(99,[0,1,4,8,20,2...|[7.28643371071673...|[0.99931570280709...|       0.0|
|  0.0|(99,[0,1,4,8,20,2...|[8.59414014278455...|[0.99981484649374...|       0.0|
|  0.0|(99,[0,1,4,8,20,2...|[8.53952172467252...|[0.99980445446485...|       0.0|
|  0.0|(99,[0,1,4,8,20,2...|[8.18708048486948...|[0.99972185240590...|       0.0|
|  0.0|(99,[0,1,4,8,20,2...|[8.02687992132264...|[0.99967354107904...|       0.0|
|  0.0|(99,[0,1,4,8,20,2...|[8.02687992132264...|[0.99967354107904...|       0.0|
|  0.0|(99,[0,1,4,8,20,2...|[7.3860776671353,...|[0.99938056160037...|       0.0|
|  0.0|(99,[0,1,

In [78]:
# Predictions of the logistic regression on the evaluation set.
lrModel.transform(df_eval).show()

+-----+--------------------+--------------------+--------------------+----------+
|label|            features|       rawPrediction|         probability|prediction|
+-----+--------------------+--------------------+--------------------+----------+
|  0.0|(99,[0,1,4,8,20,2...|[7.51770177557047...|[0.99945691516043...|       0.0|
|  0.0|(99,[0,1,4,8,20,2...|[8.18708048486948...|[0.99972185240590...|       0.0|
|  0.0|(99,[0,1,4,8,20,2...|[8.12300025945074...|[0.99970345063257...|       0.0|
|  0.0|(99,[0,1,4,8,20,2...|[8.02687992132264...|[0.99967354107904...|       0.0|
|  0.0|(99,[0,1,4,8,20,2...|[7.3860776671353,...|[0.99938056160037...|       0.0|
|  0.0|(99,[0,1,2,4,8,20...|[-2.6753926780805...|[0.06444108784158...|       1.0|
|  0.0|(99,[0,1,4,8,20,2...|[7.53929095479397...|[0.99946850809351...|       0.0|
|  0.0|(99,[0,1,4,8,20,2...|[6.83440847518789...|[0.99892505308527...|       0.0|
|  0.0|(99,[0,1,4,8,20,2...|[6.83440847518789...|[0.99892505308527...|       0.0|
|  0.0|(99,[0,1,

#### SVM

In [79]:
# Linear SVM estimator limited to 100 iterations.
svm = classification.LinearSVC(maxIter=100)

In [80]:
# Train the linear SVM on the training set.
svmModel = svm.fit(df_train)

In [81]:
# Display the fitted coefficient vector.
svmModel.coefficients

DenseVector([0.1037, -0.1115, 1.8404, 0.1956, 0.158, -0.0962, -0.1217, -0.0696, -0.0692, -0.0671, 0.0092, 0.0041, -0.0106, -0.2992, -0.1671, 0.2428, 0.1792, -0.0653, -0.1582, -0.0117, -0.1695, -0.1963, 0.1324, -0.1192, -0.0856, 0.1232, -0.1036, -0.0802, 0.0385, -0.4856, -0.2681, -0.1325, -0.1281, -0.1036, -0.1047, 0.1409, 0.0867, -0.0885, -0.0471, -0.1859, -0.0955, -0.098, -0.0887, -0.1029, -0.1267, 0.0521, -0.0222, -0.1544, 0.1035, -0.1573, -0.2192, -0.13, 0.1345, -0.1181, -0.1006, -0.0554, -0.0622, 0.139, -0.1386, -0.1356, -0.119, -0.0239, -0.0326, -0.0159, -0.0385, -0.0279, -0.0712, -0.0271, -0.0404, -0.0922, -0.0142, -0.0443, -0.0778, -0.0613, -0.0171, -0.0025, -0.0718, -0.0283, -0.0192, -0.0185, -0.0198, -0.0177, -0.0393, -0.0381, -0.0101, -0.0366, -0.0473, 0.0075, -0.0101, -0.0256, -0.0209, -0.0099, -0.0186, -0.0244, -0.0106, -0.0134, -0.0257, -0.007, -0.009])

In [82]:
# Display the fitted intercept.
svmModel.intercept

-0.08248804569322007

In [83]:
# Predictions of the linear SVM on the evaluation set.
svmModel.transform(df_eval).show()

+-----+--------------------+--------------------+----------+
|label|            features|       rawPrediction|prediction|
+-----+--------------------+--------------------+----------+
|  0.0|(99,[0,1,4,8,20,2...|[3.86316672283258...|       0.0|
|  0.0|(99,[0,1,4,8,20,2...|[4.17709814396431...|       0.0|
|  0.0|(99,[0,1,4,8,20,2...|[4.15148704181428...|       0.0|
|  0.0|(99,[0,1,4,8,20,2...|[4.11307038858925...|       0.0|
|  0.0|(99,[0,1,4,8,20,2...|[3.85695936708899...|       0.0|
|  0.0|(99,[0,1,2,4,8,20...|[-4.3449015711089...|       1.0|
|  0.0|(99,[0,1,4,8,20,2...|[3.97208784267967...|       0.0|
|  0.0|(99,[0,1,4,8,20,2...|[3.6903657190294,...|       0.0|
|  0.0|(99,[0,1,4,8,20,2...|[3.6903657190294,...|       0.0|
|  0.0|(99,[0,1,4,8,20,2...|[3.56231020827927...|       0.0|
|  0.0|(99,[0,1,4,8,20,2...|[3.56231020827927...|       0.0|
|  0.0|(99,[0,1,4,8,20,2...|[3.56231020827927...|       0.0|
|  0.0|(99,[0,1,4,8,20,2...|[3.56231020827927...|       0.0|
|  0.0|(99,[0,1,2,4,8,20

#### Drzewo decyzyjne

In [84]:
# Decision tree classifier with default parameters.
tree = classification.DecisionTreeClassifier()

In [85]:
# Train the decision tree on the training set.
treeModel = tree.fit(df_train)

In [86]:
# Display the depth of the fitted tree.
treeModel.depth

5

In [87]:
# Display the number of nodes in the fitted tree.
treeModel.numNodes

35

In [88]:
# Print the textual description of the tree's split rules.
print(treeModel.toDebugString)

DecisionTreeClassificationModel: uid=DecisionTreeClassifier_54a9d00f5438, depth=5, numNodes=35, numClasses=2, numFeatures=99
  If (feature 28 <= 1.0035900941414533)
   If (feature 2 <= 1.0012372970722383)
    If (feature 1 <= 4.873830618810881)
     If (feature 4 <= 3.6074818888073166)
      If (feature 48 <= 1.0192656067064028)
       Predict: 0.0
      Else (feature 48 > 1.0192656067064028)
       Predict: 1.0
     Else (feature 4 > 3.6074818888073166)
      Predict: 0.0
    Else (feature 1 > 4.873830618810881)
     Predict: 0.0
   Else (feature 2 > 1.0012372970722383)
    If (feature 0 <= 1.4959581701914946)
     Predict: 0.0
    Else (feature 0 > 1.4959581701914946)
     Predict: 1.0
  Else (feature 28 > 1.0035900941414533)
   If (feature 1 <= 4.873830618810881)
    If (feature 2 <= 1.0012372970722383)
     If (feature 1 <= 3.7041112702962695)
      If (feature 3 <= 4.522820166787641)
       Predict: 0.0
      Else (feature 3 > 4.522820166787641)
       Predict: 1.0
     Else (feat

In [89]:
# Predictions of the decision tree on the evaluation set.
treeModel.transform(df_eval).show()

+-----+--------------------+--------------+--------------------+----------+
|label|            features| rawPrediction|         probability|prediction|
+-----+--------------------+--------------+--------------------+----------+
|  0.0|(99,[0,1,4,8,20,2...|[8135.0,119.0]|[0.98558274775866...|       0.0|
|  0.0|(99,[0,1,4,8,20,2...|[8135.0,119.0]|[0.98558274775866...|       0.0|
|  0.0|(99,[0,1,4,8,20,2...|[8135.0,119.0]|[0.98558274775866...|       0.0|
|  0.0|(99,[0,1,4,8,20,2...|[8135.0,119.0]|[0.98558274775866...|       0.0|
|  0.0|(99,[0,1,4,8,20,2...|[8135.0,119.0]|[0.98558274775866...|       0.0|
|  0.0|(99,[0,1,2,4,8,20...|     [3.0,0.0]|           [1.0,0.0]|       0.0|
|  0.0|(99,[0,1,4,8,20,2...|[8135.0,119.0]|[0.98558274775866...|       0.0|
|  0.0|(99,[0,1,4,8,20,2...|[8135.0,119.0]|[0.98558274775866...|       0.0|
|  0.0|(99,[0,1,4,8,20,2...|[8135.0,119.0]|[0.98558274775866...|       0.0|
|  0.0|(99,[0,1,4,8,20,2...|[8135.0,119.0]|[0.98558274775866...|       0.0|
|  0.0|(99,[

#### Las losowy

In [90]:
forest = classification.RandomForestClassifier()

In [91]:
forestModel = forest.fit(df_train)

In [92]:
forestModel.featureImportances

SparseVector(99, {0: 0.0583, 1: 0.1556, 2: 0.1219, 3: 0.0223, 4: 0.0337, 5: 0.0, 6: 0.0002, 7: 0.0002, 8: 0.0009, 9: 0.0001, 10: 0.0026, 11: 0.0, 13: 0.0148, 14: 0.0025, 15: 0.012, 16: 0.0073, 17: 0.0001, 19: 0.0006, 20: 0.0002, 21: 0.0, 22: 0.0084, 23: 0.0008, 24: 0.0002, 25: 0.003, 28: 0.2017, 29: 0.081, 30: 0.0021, 32: 0.0, 34: 0.0023, 35: 0.0152, 36: 0.0226, 37: 0.0004, 39: 0.0041, 40: 0.0009, 41: 0.0002, 42: 0.0, 43: 0.0021, 44: 0.0008, 46: 0.0, 48: 0.1264, 49: 0.0128, 50: 0.0086, 51: 0.0032, 52: 0.0252, 53: 0.0017, 54: 0.0004, 55: 0.0001, 56: 0.0001, 57: 0.0412, 58: 0.0002, 59: 0.0001, 60: 0.0001, 61: 0.0001, 64: 0.0, 65: 0.0, 67: 0.0002, 68: 0.0001, 69: 0.0, 71: 0.0, 74: 0.0001, 79: 0.0, 90: 0.0001})

In [93]:
print(forestModel.toDebugString)

RandomForestClassificationModel: uid=RandomForestClassifier_989af717601e, numTrees=20, numClasses=2, numFeatures=99
  Tree 0 (weight 1.0):
    If (feature 0 <= 2.0797467244125656)
     If (feature 52 <= 2.350163942221254)
      If (feature 67 <= 9.11743149204179)
       Predict: 0.0
      Else (feature 67 > 9.11743149204179)
       If (feature 28 <= 1.0035900941414533)
        If (feature 29 <= 1.0625686720834735)
         Predict: 1.0
        Else (feature 29 > 1.0625686720834735)
         Predict: 0.0
       Else (feature 28 > 1.0035900941414533)
        Predict: 0.0
     Else (feature 52 > 2.350163942221254)
      If (feature 16 <= 2.241715919048179)
       If (feature 15 <= 1.3537338657788678)
        If (feature 3 <= 3.9022099397086936)
         Predict: 0.0
        Else (feature 3 > 3.9022099397086936)
         Predict: 1.0
       Else (feature 15 > 1.3537338657788678)
        Predict: 1.0
      Else (feature 16 > 2.241715919048179)
       Predict: 1.0
    Else (feature 0 > 2.079

In [94]:
# predykcje
forestModel.transform(df_eval).show()

+-----+--------------------+--------------------+--------------------+----------+
|label|            features|       rawPrediction|         probability|prediction|
+-----+--------------------+--------------------+--------------------+----------+
|  0.0|(99,[0,1,4,8,20,2...|[19.0363024947125...|[0.95181512473562...|       0.0|
|  0.0|(99,[0,1,4,8,20,2...|[19.0363024947125...|[0.95181512473562...|       0.0|
|  0.0|(99,[0,1,4,8,20,2...|[19.0363024947125...|[0.95181512473562...|       0.0|
|  0.0|(99,[0,1,4,8,20,2...|[19.0363024947125...|[0.95181512473562...|       0.0|
|  0.0|(99,[0,1,4,8,20,2...|[19.0363024947125...|[0.95181512473562...|       0.0|
|  0.0|(99,[0,1,2,4,8,20...|[12.2276871876838...|[0.61138435938419...|       0.0|
|  0.0|(99,[0,1,4,8,20,2...|[18.5230349121901...|[0.92615174560950...|       0.0|
|  0.0|(99,[0,1,4,8,20,2...|[18.5230349121901...|[0.92615174560950...|       0.0|
|  0.0|(99,[0,1,4,8,20,2...|[18.5230349121901...|[0.92615174560950...|       0.0|
|  0.0|(99,[0,1,

#### Gradient-Boosted Trees

In [95]:
gbt = classification.GBTClassifier()

In [96]:
gbtModel = gbt.fit(df_train)

In [97]:
gbtModel.featureImportances

SparseVector(99, {0: 0.1402, 1: 0.14, 2: 0.0998, 3: 0.0561, 4: 0.0794, 5: 0.0, 6: 0.0175, 7: 0.0021, 8: 0.0007, 9: 0.0007, 10: 0.0059, 11: 0.0063, 13: 0.0105, 14: 0.0055, 15: 0.003, 16: 0.001, 17: 0.0015, 19: 0.0002, 20: 0.0011, 22: 0.0015, 24: 0.0003, 26: 0.0004, 28: 0.2426, 29: 0.0004, 30: 0.0012, 31: 0.002, 34: 0.0023, 35: 0.0478, 36: 0.0137, 37: 0.001, 38: 0.0029, 39: 0.0199, 40: 0.0004, 42: 0.0015, 43: 0.0062, 44: 0.0198, 45: 0.0038, 46: 0.0005, 48: 0.0065, 49: 0.0032, 50: 0.0016, 51: 0.0004, 52: 0.0122, 53: 0.0016, 54: 0.0006, 55: 0.0013, 56: 0.0004, 57: 0.0115, 58: 0.0068, 59: 0.0003, 60: 0.0012, 61: 0.0016, 62: 0.0007, 63: 0.0008, 64: 0.0003, 65: 0.0006, 67: 0.0003, 69: 0.0017, 71: 0.0004, 75: 0.0004, 78: 0.0004, 79: 0.0006, 81: 0.0001, 84: 0.0009, 87: 0.0013, 88: 0.0008, 89: 0.0005, 93: 0.001})

In [98]:
print(gbtModel.toDebugString)

GBTClassificationModel: uid = GBTClassifier_b57650227722, numTrees=20, numClasses=2, numFeatures=99
  Tree 0 (weight 1.0):
    If (feature 28 <= 1.0035900941414533)
     If (feature 2 <= 1.0012372970722383)
      If (feature 1 <= 4.873830618810881)
       If (feature 4 <= 3.6074818888073166)
        If (feature 48 <= 1.0192656067064028)
         Predict: -0.9711654955173249
        Else (feature 48 > 1.0192656067064028)
         Predict: 0.3333333333333333
       Else (feature 4 > 3.6074818888073166)
        If (feature 0 <= 3.1013766942994403)
         Predict: -0.9072164948453608
        Else (feature 0 > 3.1013766942994403)
         Predict: -0.7106598984771574
      Else (feature 1 > 4.873830618810881)
       If (feature 4 <= 3.5264148800700736)
        If (feature 0 <= 2.444614570800735)
         Predict: -0.9536019536019537
        Else (feature 0 > 2.444614570800735)
         Predict: -0.6885644768856448
       Else (feature 4 > 3.5264148800700736)
        If (feature 0 <= 2.298

In [99]:
# predykcje
gbtModel.transform(df_eval).show()

+-----+--------------------+--------------------+--------------------+----------+
|label|            features|       rawPrediction|         probability|prediction|
+-----+--------------------+--------------------+--------------------+----------+
|  0.0|(99,[0,1,4,8,20,2...|[1.51172134930845...|[0.95362202392527...|       0.0|
|  0.0|(99,[0,1,4,8,20,2...|[1.51172134930845...|[0.95362202392527...|       0.0|
|  0.0|(99,[0,1,4,8,20,2...|[1.51172134930845...|[0.95362202392527...|       0.0|
|  0.0|(99,[0,1,4,8,20,2...|[1.51172134930845...|[0.95362202392527...|       0.0|
|  0.0|(99,[0,1,4,8,20,2...|[1.51172134930845...|[0.95362202392527...|       0.0|
|  0.0|(99,[0,1,2,4,8,20...|[1.12717109496054...|[0.90502442593822...|       0.0|
|  0.0|(99,[0,1,4,8,20,2...|[1.51172134930845...|[0.95362202392527...|       0.0|
|  0.0|(99,[0,1,4,8,20,2...|[1.51172134930845...|[0.95362202392527...|       0.0|
|  0.0|(99,[0,1,4,8,20,2...|[1.51172134930845...|[0.95362202392527...|       0.0|
|  0.0|(99,[0,1,

#### Naiwny Bayes

In [100]:
bayes = classification.NaiveBayes()

In [101]:
bayesModel = bayes.fit(df_train)

In [102]:
# predykcje
bayesModel.transform(df_eval).show()

+-----+--------------------+--------------------+--------------------+----------+
|label|            features|       rawPrediction|         probability|prediction|
+-----+--------------------+--------------------+--------------------+----------+
|  0.0|(99,[0,1,4,8,20,2...|[-130.22225462507...|[0.99999999999999...|       0.0|
|  0.0|(99,[0,1,4,8,20,2...|[-117.69902941287...|[0.99999999999996...|       0.0|
|  0.0|(99,[0,1,4,8,20,2...|[-118.08460833743...|[0.99999999999996...|       0.0|
|  0.0|(99,[0,1,4,8,20,2...|[-118.66297672428...|[0.99999999999996...|       0.0|
|  0.0|(99,[0,1,4,8,20,2...|[-122.51876596994...|[0.99999999999996...|       0.0|
|  0.0|(99,[0,1,2,4,8,20...|[-154.62418932850...|[0.99999991987258...|       0.0|
|  0.0|(99,[0,1,4,8,20,2...|[-123.28386080512...|[0.99999999999994...|       0.0|
|  0.0|(99,[0,1,4,8,20,2...|[-127.52522897534...|[0.99999999999993...|       0.0|
|  0.0|(99,[0,1,4,8,20,2...|[-127.52522897534...|[0.99999999999993...|       0.0|
|  0.0|(99,[0,1,

#### MLP

In [103]:
mlp = classification.MultilayerPerceptronClassifier(maxIter=100, layers=[99,40,2])

In [104]:
mlpModel = mlp.fit(df_train)

In [105]:
# `layers` on a fitted model is the Param descriptor itself, not its value;
# getLayers() returns the configured layer sizes.
mlpModel.getLayers()

[99, 40, 2]

In [106]:
mlpModel.weights

DenseVector([-1.0671, 0.27, -0.3549, -0.2945, 1.1093, -0.0821, -0.2439, 0.302, 0.0752, 1.0083, 0.2286, 0.044, 0.3389, 0.1533, 1.2442, 0.2273, 0.2148, -0.0468, 0.6193, 1.1745, 0.1435, 0.1056, 0.3377, 1.0465, -2.5544, -1.6978, -0.4793, 0.0997, 0.1265, 0.334, 0.2006, -1.3692, 0.2371, 0.3368, 0.0551, -1.373, -0.1831, 0.0033, 0.1056, -1.0313, 0.4798, 0.4473, 0.1245, 0.211, -0.007, -0.0207, -0.2581, 0.1667, 0.0885, 0.2772, -0.1001, -0.2659, 0.321, 0.1402, -0.3222, -0.2475, 0.2717, 0.4239, -0.4958, -0.5229, 0.0872, 0.7181, 0.6511, -0.6091, 0.6329, 0.0465, -0.3931, 0.3194, -0.09, -0.0755, 0.1324, 0.0456, -0.1914, 0.2728, -0.16, 0.3196, -0.4063, 0.1305, -0.2487, 0.4083, -1.1389, 0.9064, -0.0827, -0.1164, 0.5197, 1.9687, -0.1996, 0.2663, 0.2783, 0.5277, 0.3139, -0.4684, -0.4544, 0.422, 1.4187, -0.3185, -0.134, -0.9431, -0.7317, -0.6616, -0.0355, 0.237, -1.0831, -0.4172, -0.5244, -0.7323, 0.0728, 0.0775, 0.071, 0.0695, -0.3629, -0.7388, 2.9464, -0.3528, 1.4286, -2.0774, -0.0708, -0.2866, -0.0131,

In [107]:
# predykcje
mlpModel.transform(df_eval).show()

+-----+--------------------+--------------------+--------------------+----------+
|label|            features|       rawPrediction|         probability|prediction|
+-----+--------------------+--------------------+--------------------+----------+
|  0.0|(99,[0,1,4,8,20,2...|[7.41536998438081...|[0.99999882436768...|       0.0|
|  0.0|(99,[0,1,4,8,20,2...|[7.63521443786232...|[0.99999945914416...|       0.0|
|  0.0|(99,[0,1,4,8,20,2...|[7.58377098660055...|[0.99999940897323...|       0.0|
|  0.0|(99,[0,1,4,8,20,2...|[7.51550052258958...|[0.99999933642126...|       0.0|
|  0.0|(99,[0,1,4,8,20,2...|[6.91496794072174...|[0.99999819394502...|       0.0|
|  0.0|(99,[0,1,2,4,8,20...|[0.90912949435139...|[0.80181764652514...|       0.0|
|  0.0|(99,[0,1,4,8,20,2...|[7.28986123910508...|[0.99999872221638...|       0.0|
|  0.0|(99,[0,1,4,8,20,2...|[7.04133082816057...|[0.99999814212977...|       0.0|
|  0.0|(99,[0,1,4,8,20,2...|[7.04133082816057...|[0.99999814212977...|       0.0|
|  0.0|(99,[0,1,

#### Ewaluacja

In [108]:
from pyspark.ml import evaluation

In [109]:
evaluator = evaluation.BinaryClassificationEvaluator()

In [110]:
# AUC - regresja
evaluator.evaluate(lrModel.transform(df_eval))

0.9067511852811231

In [111]:
# AUC - SVM
evaluator.evaluate(svmModel.transform(df_eval))

0.9027953886180846

In [112]:
# AUC - drzewo decyzyjne
evaluator.evaluate(treeModel.transform(df_eval))

0.7603029166772103

In [113]:
# AUC - las losowy
evaluator.evaluate(forestModel.transform(df_eval))

0.8908054160618556

In [114]:
# AUC - gbt
evaluator.evaluate(gbtModel.transform(df_eval))

0.9091128909817424

In [115]:
# AUC - NB
evaluator.evaluate(bayesModel.transform(df_eval))

0.5523512963904423

In [116]:
# AUC - MLP
evaluator.evaluate(mlpModel.transform(df_eval))

0.8945868424018953

> **ZADANIE:**
- napisz funkcję do obliczania `accuracy`
- oblicz `accuracy` powyższych modeli

In [117]:
def calculate_acc(df, label="label", prediction="prediction"):
    """Return the share of rows whose prediction matches the label.

    Args:
        df: DataFrame holding both the label and the prediction column.
        label: Name of the column with the true values.
        prediction: Name of the column with the predicted values.

    Returns:
        The mean of a 0/1 match indicator computed over all rows of ``df``.
    """
    # 1 where both columns agree, 0 in every other case.
    is_match = f.when(df[label] == df[prediction], 1).otherwise(0)
    matches = df.select(is_match.alias("same"))
    # A global aggregate yields exactly one row with one column.
    return matches.agg(f.avg("same")).first()[0]

#def calculate_acc(df, label="label", prediction="prediction"):
#    out = df.select((f.col(prediction) == f.col(label)).cast("int").alias("x")).agg(f.avg("x"))
#    return out.collect()[0][0]

In [118]:
calculate_acc(lrModel.transform(df_eval))

0.8526739624199876

In [119]:
calculate_acc(svmModel.transform(df_eval))

0.8526739624199876

In [120]:
calculate_acc(treeModel.transform(df_eval))

0.840491430931241

In [121]:
calculate_acc(forestModel.transform(df_eval))

0.8317158785876523

In [122]:
calculate_acc(gbtModel.transform(df_eval))

0.8539128639273178

In [123]:
calculate_acc(bayesModel.transform(df_eval))

0.7901094362998141

In [124]:
calculate_acc(mlpModel.transform(df_eval))

0.8489572578979971

> **ZADANIE:**
- popraw `accuracy` dwóch modeli

In [125]:
tree2 = classification.DecisionTreeClassifier(maxDepth=8)
calculate_acc(tree2.fit(df_train).transform(df_eval))

0.8430724757381788

In [126]:
forest2 = classification.RandomForestClassifier(numTrees=30, maxDepth=8)
calculate_acc(forest2.fit(df_train).transform(df_eval))

0.8430724757381788

### Regresja

In [127]:
from pyspark.ml import regression

https://archive.ics.uci.edu/ml/datasets/wine+quality

In [128]:
# Load both ';'-separated files and tag each row with its origin:
# type 0 for red wines, type 1 for white wines.
wine_red = (
    spark.read
    .csv("./winequality-red.csv", sep=";", inferSchema=True, header=True)
    .withColumn("type", f.lit(0))
)
wine_white = (
    spark.read
    .csv("./winequality-white.csv", sep=";", inferSchema=True, header=True)
    .withColumn("type", f.lit(1))
)

In [129]:
wine = wine_red.union(wine_white)

In [130]:
cols = [col.replace(" ", "_") for col in wine.columns]
cols

['fixed_acidity',
 'volatile_acidity',
 'citric_acid',
 'residual_sugar',
 'chlorides',
 'free_sulfur_dioxide',
 'total_sulfur_dioxide',
 'density',
 'pH',
 'sulphates',
 'alcohol',
 'quality',
 'type']

In [131]:
wine = wine.toDF(*cols)

In [132]:
wine.printSchema()

root
 |-- fixed_acidity: double (nullable = true)
 |-- volatile_acidity: double (nullable = true)
 |-- citric_acid: double (nullable = true)
 |-- residual_sugar: double (nullable = true)
 |-- chlorides: double (nullable = true)
 |-- free_sulfur_dioxide: double (nullable = true)
 |-- total_sulfur_dioxide: double (nullable = true)
 |-- density: double (nullable = true)
 |-- pH: double (nullable = true)
 |-- sulphates: double (nullable = true)
 |-- alcohol: double (nullable = true)
 |-- quality: integer (nullable = true)
 |-- type: integer (nullable = false)



> **ZADANIE:**
- przygotuj dane
- podziel `wine` na zbiór treningowy i ewaluacyjny
- usuń wiersze zawierające braki danych
- zmień nazwę kolumny `quality` na `label`
- z pozostałych zmiennych stwórz (przeskalowaną) kolumnę `features` zawierającą wektory
- wynikowym DFom nadaj nazwy `wine_train` i `wine_eval`

In [133]:
wine_t, wine_e = wine.dropna("any").randomSplit([0.7, 0.3], 42)

In [134]:
rf = feature.RFormula(formula="quality ~ .", featuresCol='featuresRaw')
rfModel = rf.fit(wine_t)

In [135]:
wine_train = rfModel.transform(wine_t)
wine_eval = rfModel.transform(wine_e)

In [136]:
scaler = feature.StandardScaler(inputCol="featuresRaw", outputCol="features")
scal_mod = scaler.fit(wine_train)

In [137]:
wine_train = scal_mod.transform(wine_train)
wine_eval = scal_mod.transform(wine_eval)

**Ostatnie przygotowania**

In [138]:
wine_train = wine_train.select("label", "features")
wine_eval = wine_eval.select("label", "features")

In [139]:
wine_train.cache()
wine_eval.cache()

DataFrame[label: double, features: vector]

In [140]:
print("Train:")
wine_train.describe("label").show()
print("Eval:")
wine_eval.describe("label").show()

Train:
+-------+------------------+
|summary|             label|
+-------+------------------+
|  count|              4631|
|   mean|5.8168862016843015|
| stddev|0.8791709597478697|
|    min|               3.0|
|    max|               9.0|
+-------+------------------+

Eval:
+-------+------------------+
|summary|             label|
+-------+------------------+
|  count|              1866|
|   mean| 5.822079314040729|
| stddev|0.8586198577914707|
|    min|               3.0|
|    max|               9.0|
+-------+------------------+



#### Regresja liniowa

In [141]:
reg = regression.LinearRegression(maxIter=500)

In [142]:
regModel = reg.fit(wine_train)

In [143]:
regModel.coefficients

DenseVector([0.1008, -0.2373, -0.0113, 0.2762, -0.0232, 0.0946, -0.0873, -0.2824, 0.0662, 0.1057, 0.2789, -0.1337])

In [144]:
regModel.intercept

94.42480103621459

In [145]:
trainSummary = regModel.summary
type(trainSummary)

pyspark.ml.regression.LinearRegressionTrainingSummary

In [146]:
trainSummary.meanAbsoluteError

0.5735071995098387

In [147]:
trainSummary.meanSquaredError

0.5486947582555972

In [148]:
trainSummary.r2

0.2899679826341164

In [149]:
# predykcje
regModel.transform(wine_eval).show()

+-----+--------------------+------------------+
|label|            features|        prediction|
+-----+--------------------+------------------+
|  7.0|[3.79978170064518...| 7.088596128739368|
|  6.0|[3.87732826596447...| 6.156884433719114|
|  5.0|[3.87732826596447...| 5.333178584402347|
|  7.0|[3.95487483128376...| 6.914388932225606|
|  5.0|[4.03242139660305...| 5.379341371914023|
|  6.0|[4.03242139660305...| 7.250840950567991|
|  6.0|[4.03242139660305...| 7.250840950567991|
|  7.0|[4.10996796192234...| 6.874580727317081|
|  7.0|[4.10996796192234...| 6.332254543366574|
|  7.0|[4.18751452724163...| 6.355781707454639|
|  6.0|[4.18751452724163...| 5.561944831913692|
|  8.0|[4.26506109256092...| 7.051003659608256|
|  5.0|[4.34260765788021...|5.3602766106535995|
|  6.0|[4.34260765788021...| 6.242332258205508|
|  5.0|[4.34260765788021...| 6.405827801431869|
|  5.0|[4.34260765788021...| 5.969597420460488|
|  5.0|[4.34260765788021...| 6.368553108955581|
|  7.0|[4.34260765788021...| 6.193197654

#### Drzewo regresyjne

In [150]:
tree_reg = regression.DecisionTreeRegressor()

In [151]:
tree_regModel = tree_reg.fit(wine_train)

In [152]:
print(tree_regModel.toDebugString)

DecisionTreeRegressionModel: uid=DecisionTreeRegressor_33f33bfc5d1b, depth=5, numNodes=63, numFeatures=12
  If (feature 10 <= 9.113586134636481)
   If (feature 1 <= 1.5244450149933235)
    If (feature 1 <= 1.2527617449945132)
     If (feature 7 <= 329.8139994767736)
      If (feature 0 <= 6.552684769479971)
       Predict: 5.988207547169812
      Else (feature 0 > 6.552684769479971)
       Predict: 5.321428571428571
     Else (feature 7 > 329.8139994767736)
      If (feature 2 <= 2.1067668075847323)
       Predict: 7.085106382978723
      Else (feature 2 > 2.1067668075847323)
       Predict: 6.0625
    Else (feature 1 > 1.2527617449945132)
     If (feature 9 <= 3.0223311418752203)
      If (feature 10 <= 7.3882660114381355)
       Predict: 5.2
      Else (feature 10 > 7.3882660114381355)
       Predict: 5.565217391304348
     Else (feature 9 > 3.0223311418752203)
      If (feature 10 <= 8.710083847759474)
       Predict: 5.7406143344709895
      Else (feature 10 > 8.710083847759474)
  

In [153]:
# predykcje
tree_regModel.transform(wine_eval).show()

+-----+--------------------+------------------+
|label|            features|        prediction|
+-----+--------------------+------------------+
|  7.0|[3.79978170064518...| 6.794520547945205|
|  6.0|[3.87732826596447...| 5.605263157894737|
|  5.0|[3.87732826596447...| 6.357142857142857|
|  7.0|[3.95487483128376...| 6.794520547945205|
|  5.0|[4.03242139660305...|5.3056768558951966|
|  6.0|[4.03242139660305...|6.8428571428571425|
|  6.0|[4.03242139660305...|6.8428571428571425|
|  7.0|[4.10996796192234...| 6.794520547945205|
|  7.0|[4.10996796192234...| 6.794520547945205|
|  7.0|[4.18751452724163...|6.8428571428571425|
|  6.0|[4.18751452724163...| 5.477611940298507|
|  8.0|[4.26506109256092...|6.8428571428571425|
|  5.0|[4.34260765788021...|5.3056768558951966|
|  6.0|[4.34260765788021...|  5.59070796460177|
|  5.0|[4.34260765788021...| 5.605263157894737|
|  5.0|[4.34260765788021...| 5.066666666666666|
|  5.0|[4.34260765788021...| 5.605263157894737|
|  7.0|[4.34260765788021...| 5.605263157

#### Las regresyjny

In [154]:
forest_reg = regression.RandomForestRegressor()

In [155]:
forest_regModel = forest_reg.fit(wine_train)

In [156]:
forest_regModel.featureImportances

SparseVector(12, {0: 0.0196, 1: 0.1066, 2: 0.0636, 3: 0.0355, 4: 0.095, 5: 0.0791, 6: 0.0257, 7: 0.1848, 8: 0.0181, 9: 0.0467, 10: 0.3219, 11: 0.0035})

In [157]:
print(forest_regModel.toDebugString)

RandomForestRegressionModel: uid=RandomForestRegressor_4c4bc072f96c, numTrees=20, numFeatures=12
  Tree 0 (weight 1.0):
    If (feature 7 <= 328.1118061748356)
     If (feature 10 <= 9.113586134636481)
      If (feature 9 <= 3.4873051637021772)
       If (feature 8 <= 19.158076918446746)
        If (feature 6 <= 0.9561765394860922)
         Predict: 4.333333333333333
        Else (feature 6 > 0.9561765394860922)
         Predict: 5.2682926829268295
       Else (feature 8 > 19.158076918446746)
        If (feature 6 <= 2.201837719367056)
         Predict: 5.987179487179487
        Else (feature 6 > 2.201837719367056)
         Predict: 5.458333333333333
      Else (feature 9 > 3.4873051637021772)
       If (feature 3 <= 0.3373443572826443)
        If (feature 10 <= 8.111787353424543)
         Predict: 5.0
        Else (feature 10 > 8.111787353424543)
         Predict: 5.96078431372549
       Else (feature 3 > 0.3373443572826443)
        If (feature 6 <= 1.8334027225008556)
         Predic

In [158]:
# predykcje
forest_regModel.transform(wine_eval).show()

+-----+--------------------+------------------+
|label|            features|        prediction|
+-----+--------------------+------------------+
|  7.0|[3.79978170064518...| 6.695790613496891|
|  6.0|[3.87732826596447...| 6.194609310836868|
|  5.0|[3.87732826596447...| 5.924372793300423|
|  7.0|[3.95487483128376...| 6.649688185470002|
|  5.0|[4.03242139660305...| 5.244696842729085|
|  6.0|[4.03242139660305...|  6.53209537835568|
|  6.0|[4.03242139660305...|  6.53209537835568|
|  7.0|[4.10996796192234...| 6.564496121177671|
|  7.0|[4.10996796192234...|6.0661385551636995|
|  7.0|[4.18751452724163...| 6.055862165224241|
|  6.0|[4.18751452724163...| 5.485619827145731|
|  8.0|[4.26506109256092...|6.6359642264343375|
|  5.0|[4.34260765788021...| 5.335679430321472|
|  6.0|[4.34260765788021...| 5.572808716830788|
|  5.0|[4.34260765788021...|5.8857095036228815|
|  5.0|[4.34260765788021...|5.5827383329769305|
|  5.0|[4.34260765788021...| 6.120772500329597|
|  7.0|[4.34260765788021...| 5.710344920

#### Gradient-Boosted Trees regression

In [159]:
gbt_reg = regression.GBTRegressor()

In [160]:
gbt_regModel = gbt_reg.fit(wine_train)

In [161]:
gbt_regModel.featureImportances

SparseVector(12, {0: 0.0813, 1: 0.1014, 2: 0.0625, 3: 0.0714, 4: 0.0766, 5: 0.0978, 6: 0.1021, 7: 0.0487, 8: 0.0867, 9: 0.1064, 10: 0.1651})

In [162]:
print(gbt_regModel.toDebugString)

GBTRegressionModel: uid=GBTRegressor_f6eacdaab6ab, numTrees=20, numFeatures=12
  Tree 0 (weight 1.0):
    If (feature 10 <= 9.113586134636481)
     If (feature 1 <= 1.5244450149933235)
      If (feature 1 <= 1.2527617449945132)
       If (feature 7 <= 329.8139994767736)
        If (feature 0 <= 6.552684769479971)
         Predict: 5.988207547169812
        Else (feature 0 > 6.552684769479971)
         Predict: 5.321428571428571
       Else (feature 7 > 329.8139994767736)
        If (feature 2 <= 2.1067668075847323)
         Predict: 7.085106382978723
        Else (feature 2 > 2.1067668075847323)
         Predict: 6.0625
      Else (feature 1 > 1.2527617449945132)
       If (feature 9 <= 3.0223311418752203)
        If (feature 10 <= 7.3882660114381355)
         Predict: 5.2
        Else (feature 10 > 7.3882660114381355)
         Predict: 5.565217391304348
       Else (feature 9 > 3.0223311418752203)
        If (feature 10 <= 8.710083847759474)
         Predict: 5.7406143344709895
      

In [163]:
# predykcje
gbt_regModel.transform(wine_eval).show()

+-----+--------------------+------------------+
|label|            features|        prediction|
+-----+--------------------+------------------+
|  7.0|[3.79978170064518...| 6.990817372018714|
|  6.0|[3.87732826596447...| 6.083849171585133|
|  5.0|[3.87732826596447...|5.9806339360185135|
|  7.0|[3.95487483128376...| 6.985277089162348|
|  5.0|[4.03242139660305...|  5.20164845512001|
|  6.0|[4.03242139660305...| 6.896515093351613|
|  6.0|[4.03242139660305...| 6.896515093351613|
|  7.0|[4.10996796192234...| 6.953975621909426|
|  7.0|[4.10996796192234...| 6.927208695053672|
|  7.0|[4.18751452724163...| 6.774395135531267|
|  6.0|[4.18751452724163...| 5.458805532864423|
|  8.0|[4.26506109256092...| 7.064131040385138|
|  5.0|[4.34260765788021...|5.0328793316093785|
|  6.0|[4.34260765788021...|   5.4731954147849|
|  5.0|[4.34260765788021...| 5.675287677020734|
|  5.0|[4.34260765788021...| 5.047261306040631|
|  5.0|[4.34260765788021...| 5.734514234760661|
|  7.0|[4.34260765788021...|5.8438971901

#### Ewaluacja

In [164]:
evaluator_reg = evaluation.RegressionEvaluator()

In [165]:
# rmse - regresja
evaluator_reg.evaluate(regModel.transform(wine_eval))

0.7116825029687969

In [166]:
# rmse - drzewo
evaluator_reg.evaluate(tree_regModel.transform(wine_eval))

0.7188322423963592

In [167]:
# rmse - las
evaluator_reg.evaluate(forest_regModel.transform(wine_eval))

0.7055068730928302

In [168]:
# rmse - gbt
evaluator_reg.evaluate(gbt_regModel.transform(wine_eval))

0.6874828494648341

> **ZADANIE:**
- oblicz `MSE` oraz `R^2` powyższych modeli

In [169]:
er = evaluation.RegressionEvaluator(metricName="mse")

In [170]:
er.evaluate(regModel.transform(wine_eval))

0.5064919850319316

In [171]:
er.evaluate(tree_regModel.transform(wine_eval))

0.5167197927085782

In [172]:
er.evaluate(forest_regModel.transform(wine_eval))

0.49773994798122284

In [173]:
er.evaluate(gbt_regModel.transform(wine_eval))

0.4726326683082877

In [174]:
er2 = evaluation.RegressionEvaluator(metricName="r2")

In [175]:
er2.evaluate(regModel.transform(wine_eval))

0.31260950326476844

In [176]:
er2.evaluate(tree_regModel.transform(wine_eval))

0.2987287351437896

In [177]:
er2.evaluate(forest_regModel.transform(wine_eval))

0.32448741500576583

In [178]:
er2.evaluate(gbt_regModel.transform(wine_eval))

0.3585620024742352

> **ZADANIE:**
- popraw `R^2` jednego modelu

In [179]:
forest_reg2 = regression.RandomForestRegressor(numTrees=30, maxDepth=8)
er2.evaluate(forest_reg2.fit(wine_train).transform(wine_eval))

0.39876337069328527

### Wybór najlepszych parametrów

In [180]:
from pyspark.ml import tuning

In [181]:
reg2 = regression.LinearRegression()

In [182]:
grid = tuning.ParamGridBuilder() \
.addGrid(reg2.maxIter, [100, 500, 1000]) \
.addGrid(reg2.regParam, [0.0, 0.1, 0.2]).build()

In [183]:
reg_eval = evaluation.RegressionEvaluator(metricName='r2')

In [184]:
cv = tuning.CrossValidator(estimator=reg2, estimatorParamMaps=grid, evaluator=reg_eval, parallelism=2)

In [185]:
cvModel = cv.fit(wine_train)

In [186]:
cvModel.avgMetrics

[0.2824598220135613,
 0.2785704303888989,
 0.26997962204860004,
 0.2824598220135613,
 0.2785704303888989,
 0.26997962204860004,
 0.2824598220135613,
 0.2785704303888989,
 0.26997962204860004]

Parametry najlepszego modelu

In [187]:
# Use the public getter of the fitted model instead of the private _java_obj handle.
cvModel.bestModel.getMaxIter()

100

In [188]:
# Use the public getter of the fitted model instead of the private _java_obj handle.
cvModel.bestModel.getRegParam()

0.0

In [189]:
reg_eval.evaluate(cvModel.transform(wine_eval))

0.31260950326476844

Zapisanie i wczytanie modelu

In [190]:
# Plain save() raises when the target path already exists (e.g. when the
# notebook is re-run); overwrite() replaces the previously stored model.
cvModel.write().overwrite().save("model")

In [191]:
readInModel = tuning.CrossValidatorModel.load("model")

In [192]:
reg_eval.evaluate(readInModel.transform(wine_eval))

0.31260950326476844

### Pipeline + wybór najlepszych parametrów

In [193]:
# Feature columns: every column of the training split except the target.
cols = [x for x in wine_t.columns if x != "quality"]
# The stages are created with default parameters; their column settings are
# provided through the parameter grid built in the next cell. The trailing
# comments show the equivalent direct configuration.
vectA = feature.VectorAssembler()  # inputCols=cols, outputCol="featuresRaw"
scal = feature.StandardScaler()  # inputCol="featuresRaw", outputCol="features"
forestReg = regression.RandomForestRegressor()
# Chain assembling, scaling and the random-forest regressor into one estimator.
# NOTE(review): requires `from pyspark.ml import Pipeline`; the import is not
# visible in this part of the notebook - confirm it happens earlier.
pipe = Pipeline(stages=[vectA, scal, forestReg])

In [194]:
# baseOn pins a parameter to one fixed value in every grid combination;
# addGrid lists the values that are actually searched over.
paramGrid = (
    tuning.ParamGridBuilder()
    .baseOn({
        vectA.inputCols: cols,
        vectA.outputCol: 'featuresRaw',
        scal.inputCol: 'featuresRaw',
        scal.outputCol: 'features',
    })
    .addGrid(scal.withMean, [False, True])
    .baseOn({forestReg.labelCol: 'quality'})
    .addGrid(forestReg.maxDepth, [5, 6, 7, 8])
    .build()
)

In [195]:
regr_eval = evaluation.RegressionEvaluator(labelCol= "quality", metricName='r2')

In [196]:
# utworzenie estymatora
crossval = tuning.CrossValidator(estimator=pipe,
                                 estimatorParamMaps=paramGrid,
                                 evaluator=regr_eval,
                                 numFolds=4, 
                                 parallelism=2)

In [197]:
# utworzenie transformera
crossvalModel = crossval.fit(wine_t)

In [198]:
crossvalModel.avgMetrics

[0.32924215487207475,
 0.35387672514404805,
 0.37599797269682766,
 0.3903316130707088,
 0.3292253822838642,
 0.35385760515212833,
 0.37597992338995123,
 0.39033840396115105]

In [199]:
# Stage 2 of the best pipeline is the fitted random forest; read the chosen
# depth through its public getter rather than the private _java_obj handle.
crossvalModel.bestModel.stages[2].getMaxDepth()

8

In [200]:
# Stage 1 of the best pipeline is the fitted StandardScaler; read the chosen
# setting through its public getter rather than the private _java_obj handle.
crossvalModel.bestModel.stages[1].getWithMean()

True

In [201]:
# transformacja (predykcja)
crossvalModel.transform(wine_e).drop(*cols).show()

+-------+--------------------+--------------------+------------------+
|quality|         featuresRaw|            features|        prediction|
+-------+--------------------+--------------------+------------------+
|      7|[4.9,0.42,0.0,2.1...|[-1.8077593015455...| 7.012907297528597|
|      6|[5.0,0.74,0.0,1.2...|[-1.7302127362262...| 6.144127212117214|
|      5|[5.0,1.04,0.24,1....|[-1.7302127362262...| 5.877639375577258|
|      7|[5.1,0.42,0.0,1.8...|[-1.6526661709069...| 7.002149721771021|
|      5|[5.2,0.32,0.25,1....|[-1.5751196055876...| 5.029851806884647|
|      6|[5.2,0.34,0.0,1.8...|[-1.5751196055876...|6.8944382246317115|
|      6|[5.2,0.34,0.0,1.8...|[-1.5751196055876...|6.8944382246317115|
|      7|[5.3,0.47,0.11,2....|[-1.4975730402683...| 6.940133007089528|
|      7|[5.3,0.57,0.01,1....|[-1.4975730402683...| 6.299282669589575|
|      7|[5.4,0.42,0.27,2....|[-1.4200264749490...|  6.17754857043367|
|      6|[5.4,0.58,0.08,1....|[-1.4200264749490...|5.4685950967730905|
|     

In [202]:
regr_eval.evaluate(crossvalModel.transform(wine_e))

0.38991906112478647

**Alternatywne ścieżki w pipeline**

In [203]:
# Feature columns: every column of the training split except the target.
cols = [x for x in wine_t.columns if x != "quality"]
vectA = feature.VectorAssembler()
scal = feature.StandardScaler() # first scaling alternative
scalM = feature.MinMaxScaler() # second scaling alternative
forestReg = regression.RandomForestRegressor()
# Both scalers are stages of the same pipeline; which scaled column the
# regressor consumes is selected by the parameter grid in the next cell.
pipe = Pipeline(stages=[vectA, scal, scalM, forestReg])

In [204]:
# Fixed settings wire the assembler output into both scalers; the searched
# parameters pick which scaled column the forest reads and how deep it grows.
paramGrid = (
    tuning.ParamGridBuilder()
    .baseOn({
        vectA.inputCols: cols,
        vectA.outputCol: 'featuresRaw',
        scal.inputCol: 'featuresRaw',
        scal.outputCol: 'features',
        scalM.inputCol: 'featuresRaw',
        scalM.outputCol: 'featuresM',
        forestReg.labelCol: 'quality',
    })
    .addGrid(forestReg.featuresCol, ["features", "featuresM"])
    .addGrid(forestReg.maxDepth, [5, 6, 7, 8])
    .build()
)

In [205]:
regr_eval = evaluation.RegressionEvaluator(labelCol= "quality", metricName='r2')

In [206]:
crossval = tuning.CrossValidator(estimator=pipe,
                                 estimatorParamMaps=paramGrid,
                                 evaluator=regr_eval,
                                 numFolds=4, 
                                 parallelism=2)

In [207]:
crossvalModel = crossval.fit(wine_t)

In [208]:
crossvalModel.avgMetrics

[0.32924215487207475,
 0.35387672514404805,
 0.37599797269682766,
 0.3903316130707088,
 0.3292253822838642,
 0.35385760515212833,
 0.37597992338995123,
 0.39033840396115105]

In [209]:
# Stage 3 of the best pipeline is the fitted random forest; read the chosen
# depth through its public getter rather than the private _java_obj handle.
crossvalModel.bestModel.stages[3].getMaxDepth()

8

In [210]:
# Public getter showing which scaled feature column the best forest was trained on.
crossvalModel.bestModel.stages[3].getFeaturesCol()

'featuresM'

> **ZADANIE:**
- stwórz model jak najlepiej przewidujący liczbę pierścieni (wiek) mięczaków
- do problemu można podejść jak do regresji lub jak do klasyfikacji
- wszystkie chwyty dozwolone

http://archive.ics.uci.edu/ml/datasets/Abalone

In [211]:
colNames = ["Sex", "Length", "Diameter", "Height", "Whole_weight", "Shucked_weight", 
            "Viscera_weight", "Shell_weight", "Rings"]

In [212]:
abalone = spark.read.csv("./abalone.data", header=False, inferSchema=True)

In [213]:
abalone = abalone.select(*[f.col(old).alias(new) for old, new in zip(abalone.columns, colNames)])

In [214]:
abalone.show()

+---+------+--------+------+------------+--------------+--------------+------------+-----+
|Sex|Length|Diameter|Height|Whole_weight|Shucked_weight|Viscera_weight|Shell_weight|Rings|
+---+------+--------+------+------------+--------------+--------------+------------+-----+
|  M| 0.455|   0.365| 0.095|       0.514|        0.2245|         0.101|        0.15|   15|
|  M|  0.35|   0.265|  0.09|      0.2255|        0.0995|        0.0485|        0.07|    7|
|  F|  0.53|    0.42| 0.135|       0.677|        0.2565|        0.1415|        0.21|    9|
|  M|  0.44|   0.365| 0.125|       0.516|        0.2155|         0.114|       0.155|   10|
|  I|  0.33|   0.255|  0.08|       0.205|        0.0895|        0.0395|       0.055|    7|
|  I| 0.425|     0.3| 0.095|      0.3515|         0.141|        0.0775|        0.12|    8|
|  F|  0.53|   0.415|  0.15|      0.7775|         0.237|        0.1415|        0.33|   20|
|  F| 0.545|   0.425| 0.125|       0.768|         0.294|        0.1495|        0.26|   16|

In [215]:
# Summary statistics for the five columns left after dropping Sex, Rings,
# Viscera_weight and Shell_weight.
(
    abalone
    .drop("Sex", "Rings", "Viscera_weight", "Shell_weight")
    .describe()
    .show()
)

+-------+-------------------+-------------------+-------------------+-------------------+-------------------+
|summary|             Length|           Diameter|             Height|       Whole_weight|     Shucked_weight|
+-------+-------------------+-------------------+-------------------+-------------------+-------------------+
|  count|               4177|               4177|               4177|               4177|               4177|
|   mean| 0.5239920995930099|  0.407881254488869| 0.1395163993296614|   0.82874215944458|0.35936748862820106|
| stddev|0.12009291256479936|0.09923986613365941|0.04182705660725731|0.49038901823099795|0.22196294903322014|
|    min|              0.075|              0.055|                0.0|              0.002|              0.001|
|    max|              0.815|               0.65|               1.13|             2.8255|              1.488|
+-------+-------------------+-------------------+-------------------+-------------------+-------------------+



In [216]:
# Summary statistics for the target (Rings) and the two remaining weight columns.
(
    abalone
    .describe("Rings", "Viscera_weight", "Shell_weight")
    .show()
)

+-------+------------------+-------------------+-------------------+
|summary|             Rings|     Viscera_weight|       Shell_weight|
+-------+------------------+-------------------+-------------------+
|  count|              4177|               4177|               4177|
|   mean| 9.933684462532918|0.18059360785252604|0.23883085946851795|
| stddev|3.2241690320681315|0.10961425025968445|0.13920266952238622|
|    min|                 1|             5.0E-4|             0.0015|
|    max|                29|               0.76|              1.005|
+-------+------------------+-------------------+-------------------+



In [217]:
# Random split with weights 0.7 / 0.3: the larger part is used for training,
# the smaller one is held out for the final evaluation. The seed is fixed at 42.
abalone_train, abalone_eval = abalone.randomSplit(weights=[0.7, 0.3], seed=42)

In [218]:
# Number of training rows per ring count, ordered by ring count.
# show(30) raises the default 20-row limit so every distinct value is printed.
(
    abalone_train
    .groupBy("Rings")
    .count()
    .orderBy("Rings")
    .show(30)
)

+-----+-----+
|Rings|count|
+-----+-----+
|    2|    1|
|    3|   11|
|    4|   39|
|    5|   79|
|    6|  185|
|    7|  273|
|    8|  410|
|    9|  486|
|   10|  464|
|   11|  343|
|   12|  194|
|   13|  149|
|   14|   95|
|   15|   74|
|   16|   54|
|   17|   40|
|   18|   29|
|   19|   24|
|   20|   17|
|   21|   11|
|   22|    5|
|   23|    5|
|   24|    1|
|   25|    1|
|   27|    1|
|   29|    1|
+-----+-----+



------

In [219]:
# Every name in colNames except the last one is treated as a model input.
input_cols = colNames[:-1]
# Comma-separated column list, ready to be spliced into a SQL SELECT.
input_cols_expr = ", ".join(input_cols)
# Bare expression: shown as the cell's output.
input_cols_expr

'Sex, Length, Diameter, Height, Whole_weight, Shucked_weight, Viscera_weight, Shell_weight'

Grupowanie rzadkich klas (nowa etykieta → zakres `Rings`): `{1: "<=5", 14: "14-17", 18: ">=18"}`; wartości 6–13 pozostają bez zmian.

In [220]:
# SQL statement for SQLTransformer: merge the sparsely populated tails of the
# Rings distribution into single classes, labelled 1 (Rings <= 5),
# 14 (Rings 14-17) and 18 (Rings >= 18); ring counts 6-13 keep their own value.
# The top bucket is open-ended so that it matches the documented ">=18" class;
# an upper bound would let larger values fall through to ELSE and become
# separate labels.
# __THIS__ is the placeholder SQLTransformer replaces with the input DataFrame.
q = (
    "SELECT CASE "
    "WHEN Rings <= 5 THEN 1 "
    "WHEN Rings BETWEEN 14 AND 17 THEN 14 "
    "WHEN Rings >= 18 THEN 18 "
    "ELSE Rings END AS Rings, "
    f"{input_cols_expr} FROM __THIS__"
)

In [221]:
# Pipeline stages, created unconfigured; their parameters are supplied later
# through the parameter grid passed to the cross-validator.
sqlT = feature.SQLTransformer()                 # SQL step (re-buckets Rings)
rf = feature.RFormula()                         # formula -> features/label columns
scaler = feature.StandardScaler()               # optional centring / scaling
reg_log = classification.LogisticRegression()   # classifier

# Stages run in list order.
pipe = Pipeline(
    stages=[
        sqlT,
        rf,
        scaler,
        reg_log,
    ]
)

In [222]:
# Search space for the pipeline. baseOn pins one value for a parameter in every
# candidate; addGrid contributes alternatives, so the grid holds
# 2 * 2 * 3 * 3 * 3 = 108 parameter maps.
paramGrid = (
    tuning.ParamGridBuilder()
    # Fixed wiring: the bucketing SQL, the formula, and the column names that
    # connect RFormula's output to the scaler and name the scaler's output.
    .baseOn(
        (sqlT.statement, q),
        (rf.formula, "Rings ~ ."),
        (rf.featuresCol, 'featuresRaw'),
        (scaler.inputCol, 'featuresRaw'),
        (scaler.outputCol, 'features'),
    )
    # Tuned: scaling variants and logistic-regression hyper-parameters.
    .addGrid(scaler.withMean, [False, True])
    .addGrid(scaler.withStd, [False, True])
    .addGrid(reg_log.maxIter, [100, 200, 300])
    .addGrid(reg_log.regParam, [0.0, 0.1, 0.2])
    .addGrid(reg_log.elasticNetParam, [0.0, 0.1, 0.2])
    .build()
)

In [223]:
# Scores predictions with the F1 measure; "f1" is the evaluator's default
# metric, spelled out here so the reported number is unambiguous.
evaluator = evaluation.MulticlassClassificationEvaluator(metricName="f1")

In [224]:
# Cross-validated grid search over paramGrid, scored with `evaluator`.
# numFolds=3 is CrossValidator's default, written out for clarity;
# parallelism=2 allows two candidate models to be fitted at a time.
crossval = tuning.CrossValidator(
    estimator=pipe,
    estimatorParamMaps=paramGrid,
    evaluator=evaluator,
    numFolds=3,
    parallelism=2,
)

In [225]:
# Run the grid search with cross-validation on the training split; the
# returned model wraps the best pipeline found.
crossval_model = crossval.fit(abalone_train)

In [226]:
# Score the selected model on the held-out split. transform() runs the whole
# fitted pipeline, including the SQL step that re-buckets Rings, so the
# evaluation labels are bucketed the same way as the training labels.
evaluator.evaluate(crossval_model.transform(abalone_eval))

0.28780876008625345