## Spark Machine Learning - Regressions

### 1. Linear Regression

In [1]:
from pyspark.sql import SparkSession
# Start (or reuse) the Spark session that every later cell reads from.
spark = (
    SparkSession.builder
    .appName("Jerry-LR")
    .getOrCreate()
)

In [2]:
# Load the cruise-ship CSV: the first row supplies the column names and the
# column types are inferred from the data.
lrdf = spark.read.csv(
    "datasets/cruise_ship_info.csv",
    header=True,
    inferSchema=True,
)

In [3]:
# Peek at the first two rows (returned as Row objects) to sanity-check the load.
lrdf.head(2)

[Row(Ship_name='Journey', Cruise_line='Azamara', Age=6, Tonnage=30.276999999999997, passengers=6.94, length=5.94, cabins=3.55, passenger_density=42.64, crew=3.55),
 Row(Ship_name='Quest', Cruise_line='Azamara', Age=6, Tonnage=30.276999999999997, passengers=6.94, length=5.94, cabins=3.55, passenger_density=42.64, crew=3.55)]

In [4]:
# Check the column types that were inferred when the CSV was read.
lrdf.printSchema()

root
 |-- Ship_name: string (nullable = true)
 |-- Cruise_line: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Tonnage: double (nullable = true)
 |-- passengers: double (nullable = true)
 |-- length: double (nullable = true)
 |-- cabins: double (nullable = true)
 |-- passenger_density: double (nullable = true)
 |-- crew: double (nullable = true)



In [5]:
from pyspark.ml.regression import LinearRegression

In [9]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

In [11]:
from pyspark.ml.feature import StringIndexer

In [10]:
# List the column names available for feature selection.
lrdf.columns

['Ship_name',
 'Cruise_line',
 'Age',
 'Tonnage',
 'passengers',
 'length',
 'cabins',
 'passenger_density',
 'crew']

In [44]:
# Count ships per cruise line to see how the categorical column is distributed.
lrdf.groupBy("Cruise_line").count().show()

+-----------------+-----+
|      Cruise_line|count|
+-----------------+-----+
|            Costa|   11|
|              P&O|    6|
|           Cunard|    3|
|Regent_Seven_Seas|    5|
|              MSC|    8|
|         Carnival|   22|
|          Crystal|    2|
|           Orient|    1|
|         Princess|   17|
|        Silversea|    4|
|         Seabourn|    3|
| Holland_American|   14|
|         Windstar|    3|
|           Disney|    2|
|        Norwegian|   13|
|          Oceania|    3|
|          Azamara|    2|
|        Celebrity|   10|
|             Star|    6|
|  Royal_Caribbean|   23|
+-----------------+-----+



In [13]:
# Cruise_line is a string field, so it is converted to a numeric index with
# StringIndexer before it can go into a feature vector.
# NOTE(review): by default the index is a frequency rank (most common line -> 0.0),
# not a meaningful quantity; a linear model will still treat it as a number.
# Consider one-hot encoding the index if the category should not be ordinal.
cruise_line_indexer = StringIndexer(
    inputCol="Cruise_line",
    outputCol="Cruise_line_Indexed",
)
cruise_line_indexer_model = cruise_line_indexer.fit(lrdf)
indexed_lrdf = cruise_line_indexer_model.transform(lrdf)
indexed_lrdf.show(3)

+-----------+-----------+---+------------------+----------+------+------+-----------------+----+-------------------+
|  Ship_name|Cruise_line|Age|           Tonnage|passengers|length|cabins|passenger_density|crew|Cruise_line_Indexed|
+-----------+-----------+---+------------------+----------+------+------+-----------------+----+-------------------+
|    Journey|    Azamara|  6|30.276999999999997|      6.94|  5.94|  3.55|            42.64|3.55|               16.0|
|      Quest|    Azamara|  6|30.276999999999997|      6.94|  5.94|  3.55|            42.64|3.55|               16.0|
|Celebration|   Carnival| 26|            47.262|     14.86|  7.22|  7.43|             31.8| 6.7|                1.0|
+-----------+-----------+---+------------------+----------+------+------+-----------------+----+-------------------+
only showing top 3 rows



In [14]:
# Columns that are assembled into the feature vector. 'crew' is left out because
# it is the regression label; 'Age' and 'Ship_name' are also not included.
input_cols = [
    'Cruise_line_Indexed',
    'Tonnage',
    'passengers',
    'length',
    'cabins',
    'passenger_density',
]

In [22]:
# Assembler that packs the selected input columns into one 'features' vector column.
assembler_col = VectorAssembler(outputCol='features', inputCols=input_cols)

In [23]:
# Apply the assembler to the indexed DataFrame, then print the schema to
# confirm the 'features' vector column has been appended.
output = assembler_col.transform(indexed_lrdf)
output.printSchema()

root
 |-- Ship_name: string (nullable = true)
 |-- Cruise_line: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Tonnage: double (nullable = true)
 |-- passengers: double (nullable = true)
 |-- length: double (nullable = true)
 |-- cabins: double (nullable = true)
 |-- passenger_density: double (nullable = true)
 |-- crew: double (nullable = true)
 |-- Cruise_line_Indexed: double (nullable = false)
 |-- features: vector (nullable = true)



In [24]:
# Confirm that the assembler's transform() result is a Spark DataFrame.
type(output)

pyspark.sql.dataframe.DataFrame

In [45]:
# Preview the assembled feature vector next to the 'crew' column.
output.select('features','crew').show(2)

+--------------------+----+
|            features|crew|
+--------------------+----+
|[16.0,30.27699999...|3.55|
|[16.0,30.27699999...|3.55|
+--------------------+----+
only showing top 2 rows



In [27]:
# Keep only the two columns the regression needs: the feature vector and the
# 'crew' label, then preview the result.
final_data = output.select('features','crew')
final_data.show(2)

+--------------------+----+
|            features|crew|
+--------------------+----+
|[16.0,30.27699999...|3.55|
|[16.0,30.27699999...|3.55|
+--------------------+----+
only showing top 2 rows



In [28]:
# 80/20 train/test split. A fixed seed is passed so the split -- and therefore
# the fitted model and every metric reported below -- is the same on each
# Restart & Run All instead of changing from run to run.
train_data, test_data = final_data.randomSplit([0.8, 0.2], seed=42)

In [29]:
# Summary statistics for the training split; describe() reports the numeric
# 'crew' column here, not the vector column.
train_data.describe().show()

+-------+------------------+
|summary|              crew|
+-------+------------------+
|  count|               126|
|   mean| 8.030000000000005|
| stddev|3.5419034430656064|
|    min|              0.59|
|    max|              21.0|
+-------+------------------+



In [30]:
# Summary statistics for the test split, for comparison with the training split.
test_data.describe().show()

+-------+------------------+
|summary|              crew|
+-------+------------------+
|  count|                32|
|   mean| 6.865624999999999|
| stddev|3.2346450545666148|
|    min|               0.6|
|    max|              13.6|
+-------+------------------+



In [31]:
# Fit a linear regression of 'crew' on the assembled feature vector, using the
# training split only and the estimator's default hyperparameters.
lr = LinearRegression(
    labelCol='crew',
    featuresCol='features',
)
lr_model = lr.fit(train_data)

In [33]:
# Evaluate the fitted model on the held-out test split and look at the first
# few residuals (label minus prediction, per Spark's regression summary).
test_results = lr_model.evaluate(test_data)
test_results.residuals.show(5)

+-------------------+
|          residuals|
+-------------------+
| -0.809184752752615|
|-0.5175993257867919|
|-0.5321715901080069|
|-1.2027706856461489|
| -0.311441072908444|
+-------------------+
only showing top 5 rows



In [34]:
# R^2 of the model on the test split.
test_results.r2

0.9379696431301476

In [35]:
# Root mean squared error on the test split (same units as 'crew').
test_results.rootMeanSquaredError

0.7929295976717178

In [46]:
from pyspark.sql.functions import corr

In [47]:
# Correlation between 'crew' and 'length', computed on the full unsplit dataset.
lrdf.select(corr('crew','length')).show()

+------------------+
|corr(crew, length)|
+------------------+
|0.8958566271016579|
+------------------+



### Predictions - Using other features to predict crew

In [36]:
# Drop the 'crew' label from the test split so only the feature vectors remain.
unlabel_data = test_data.select('features')

In [37]:
# Preview the unlabelled feature vectors.
unlabel_data.show(3)

+--------------------+
|            features|
+--------------------+
|[0.0,70.0,18.0,8....|
|[0.0,73.192,28.52...|
|[0.0,90.09,25.01,...|
+--------------------+
only showing top 3 rows



In [38]:
# Score the unlabelled rows with the fitted model; transform() appends a
# 'prediction' column next to 'features'.
predictions = lr_model.transform(unlabel_data)
predictions.show(10)

+--------------------+------------------+
|            features|        prediction|
+--------------------+------------------+
|[0.0,70.0,18.0,8....| 8.009184752752615|
|[0.0,73.192,28.52...| 8.597599325786792|
|[0.0,90.09,25.01,...| 9.222171590108006|
|[0.0,138.0,31.14,...|13.052770685646149|
|[0.0,158.0,43.7,1...|13.911441072908444|
|[1.0,70.367,20.52...| 8.672059775478354|
|[2.0,44.348,12.0,...| 5.676003927044409|
|[3.0,55.451,12.66...| 5.899515723486449|
|[3.0,63.0,14.4,7....| 6.694111069427109|
|[4.0,28.0,11.5,6....| 3.599145727819202|
+--------------------+------------------+
only showing top 10 rows



In [48]:
# Confirm that the model's transform() result is a Spark DataFrame.
type(predictions)

pyspark.sql.dataframe.DataFrame