Loading the pyspark session and the data

In [69]:
from pyspark.sql import SparkSession
# Reuse the active Spark session if there is one; otherwise start a new one
# registered under the application name 'cruise'.
spark = (
    SparkSession.builder
    .appName('cruise')
    .getOrCreate()
)

In [70]:
# Load the cruise-ship CSV: the first row supplies the column names and the
# column types are inferred from the data.
df = spark.read.csv(
    'cruise_ship_info.csv',
    header=True,
    inferSchema=True,
)

In [71]:
# Print each column's name, inferred type and nullability to check the load.
df.printSchema()

root
 |-- Ship_name: string (nullable = true)
 |-- Cruise_line: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Tonnage: double (nullable = true)
 |-- passengers: double (nullable = true)
 |-- length: double (nullable = true)
 |-- cabins: double (nullable = true)
 |-- passenger_density: double (nullable = true)
 |-- crew: double (nullable = true)



In [72]:
# Summary statistics (count, mean, stddev, min, max) for every column,
# converted to a pandas DataFrame so the notebook renders it as a table.
(
    df
    .describe()
    .toPandas()
)

Unnamed: 0,summary,Ship_name,Cruise_line,Age,Tonnage,passengers,length,cabins,passenger_density,crew
0,count,158,158,158.0,158.0,158.0,158.0,158.0,158.0,158.0
1,mean,Infinity,,15.689873417721518,71.28467088607599,18.45740506329114,8.130632911392404,8.830000000000005,39.90094936708861,7.794177215189873
2,stddev,,,7.615691058751413,37.229540025907866,9.677094775143416,1.793473548054825,4.4714172221480615,8.63921711391542,3.503486564627034
3,min,Adventure,Azamara,4.0,2.329,0.66,2.79,0.33,17.7,0.59
4,max,Zuiderdam,Windstar,48.0,220.0,54.0,11.82,27.0,71.43,21.0


Importing StringIndexer to encode Cruise_line as a numeric index

In [73]:
from pyspark.ml.feature import StringIndexer

In [74]:
# Indexer that maps each distinct Cruise_line string to a numeric index,
# written to the new column 'Cruise_line_ind'.
lineIndexed = StringIndexer(
    inputCol='Cruise_line',
    outputCol='Cruise_line_ind',
)

In [75]:
# Learn the label -> index mapping from df, then apply it to the same frame.
# NOTE(review): the resulting index is later used as a plain numeric feature in
# a linear model, which imposes an ordering on the cruise lines; one-hot
# encoding may be more appropriate -- TODO confirm.
newdf = (
    lineIndexed
    .fit(df)
    .transform(df)
)

Assembling the features into a single vector column

In [76]:
from pyspark.ml.feature import VectorAssembler

In [77]:
# Pack the predictor columns (including the indexed cruise line) into a single
# vector column named 'features'; this column is the input that the learning
# algorithm consumes later on.
assembler = VectorAssembler(
    inputCols=[
        'Age',
        'Tonnage',
        'passengers',
        'length',
        'cabins',
        'passenger_density',
        'Cruise_line_ind',
    ],
    outputCol='features',
)
output = assembler.transform(newdf)

In [78]:
# Keep only what the regression needs: the feature vector and the 'crew' label.
final_data = output.select('features', 'crew')

In [79]:
# 70/30 train/test split; the fixed seed keeps the split repeatable.
train_data, test_data = final_data.randomSplit(
    weights=[0.7, 0.3],
    seed=181120,
)

In [80]:
# Summary statistics of the 'crew' label over the full dataset, as a baseline
# for comparing the train and test splits.
final_data.describe().show() 

+-------+-----------------+
|summary|             crew|
+-------+-----------------+
|  count|              158|
|   mean|7.794177215189873|
| stddev|3.503486564627034|
|    min|             0.59|
|    max|             21.0|
+-------+-----------------+



In [81]:
# Summary statistics of the 'crew' label in the training split.
train_data.describe().show() 

+-------+------------------+
|summary|              crew|
+-------+------------------+
|  count|               102|
|   mean| 7.622450980392167|
| stddev|3.3542015727505996|
|    min|              0.59|
|    max|              19.1|
+-------+------------------+



In [82]:
# Summary statistics of the 'crew' label in the test split.
test_data.describe().show()

+-------+-----------------+
|summary|             crew|
+-------+-----------------+
|  count|               56|
|   mean|8.106964285714287|
| stddev|3.771373957322984|
|    min|             0.88|
|    max|             21.0|
+-------+-----------------+



In [83]:
from pyspark.ml.regression import LinearRegression

In [84]:
# Linear regression that predicts 'crew' from the assembled 'features' vector
# ('features' is the estimator's default features column, spelled out here).
lr = LinearRegression(
    featuresCol='features',
    labelCol='crew',
)

In [85]:
# Fit the linear regression on the training split.
lr_model = lr.fit(train_data)

In [86]:
# Evaluate the fitted model on the held-out test split; the returned summary
# exposes the residuals, RMSE and r2 inspected in the following cells.
test_results = lr_model.evaluate(test_data) 

In [87]:
# Show the per-row residuals on the test split (first 20 rows by default).
test_results.residuals.show()

+--------------------+
|           residuals|
+--------------------+
|-0.29318854988315834|
|  0.7509607337559192|
| 0.38491960984270435|
|  0.3280766604255767|
| -0.3577624591654356|
|  0.9635488882175807|
| -0.5912661235950694|
|  1.0550199702207852|
|-0.09292801197328693|
|-0.20954427350200078|
|-0.43934298181908993|
| -1.2003627702437765|
| 0.03470852985550543|
|  1.2061418558864343|
|  1.0037796738494258|
| -1.1874558886932753|
|-0.04821601078154192|
|  0.3044061680897121|
|  0.9412710276839746|
|  1.0413591632305383|
+--------------------+
only showing top 20 rows



Metrics for the model that includes the cruise_line

In [88]:
# Root mean squared error on the test split (same units as 'crew'; lower is better).
test_results.rootMeanSquaredError

0.9721088264714621

In [89]:
# Coefficient of determination (r2) on the test split; closer to 1 is better.
test_results.r2

0.9323518514799048

In [90]:
# Keep only the feature vectors of the test split (label removed) to mimic
# scoring data for which 'crew' is unknown.
unlabeled_data = test_data.select('features')

In [91]:
# Apply the fitted model; the result gains a 'prediction' column.
predictions = lr_model.transform(unlabeled_data)

In [92]:
# Display the feature vectors next to their predicted crew size.
predictions.show()

+--------------------+------------------+
|            features|        prediction|
+--------------------+------------------+
|[4.0,220.0,54.0,1...| 21.29318854988316|
|[6.0,93.0,23.94,9...| 10.33903926624408|
|[6.0,110.23899999...|11.115080390157296|
|[6.0,113.0,37.82,...|11.671923339574423|
|[7.0,158.0,43.7,1...|13.957762459165435|
|[9.0,88.5,21.24,9...|  9.33645111178242|
|[9.0,105.0,27.2,8...|11.271266123595069|
|[9.0,113.0,26.74,...|11.324980029779216|
|[10.0,81.76899999...| 8.512928011973287|
|[10.0,86.0,21.14,...|    9.409544273502|
|[10.0,110.0,29.74...| 12.03934298181909|
|[10.0,138.0,31.14...|13.050362770243776|
|[11.0,86.0,21.24,...| 9.265291470144495|
|[11.0,90.0,22.4,9...| 9.793858144113566|
|[11.0,91.0,20.32,...| 8.986220326150574|
|[11.0,138.0,31.14...|13.037455888693275|
|[12.0,25.0,3.88,5...| 2.918216010781542|
|[12.0,42.0,14.8,7...| 6.495593831910288|
|[12.0,77.104,20.0...| 8.648728972316025|
|[12.0,88.5,21.24,...|  9.24864083676946|
+--------------------+------------------+

In [93]:
# Second experiment: assemble the same predictors but WITHOUT the indexed
# cruise line, again into a single 'features' vector column that the learning
# algorithm consumes as its input. The original df is used, since the indexed
# column is no longer needed.
assembler = VectorAssembler(
    inputCols=[
        'Age',
        'Tonnage',
        'passengers',
        'length',
        'cabins',
        'passenger_density',
    ],
    outputCol='features',
)
output = assembler.transform(df)

In [94]:
# Keep only the feature vector and the 'crew' label (rebinds final_data for
# the experiment without the cruise line).
final_data = output.select('features', 'crew')

In [95]:
# Same 70/30 split and seed as in the first experiment.
train_data, test_data = final_data.randomSplit(
    weights=[0.7, 0.3],
    seed=181120,
)

In [96]:
# Fresh linear-regression estimator predicting 'crew' from 'features'
# ('features' is the estimator's default features column, spelled out here).
lr = LinearRegression(
    featuresCol='features',
    labelCol='crew',
)

In [97]:
# Fit the regression on the training split built without the cruise line.
lr_model = lr.fit(train_data)

In [98]:
# Evaluate on the held-out test split; rebinds test_results, so the metrics
# read from it after this cell refer to the model without the cruise line.
test_results = lr_model.evaluate(test_data) 

In [99]:
# Show the per-row test residuals of the model without the cruise line.
test_results.residuals.show()

+--------------------+
|           residuals|
+--------------------+
|-0.08880751600474923|
|  0.7896731135407489|
| 0.41550574054428857|
|  0.4220985498253693|
| -0.3442936953349438|
|  0.8271900759383293|
| -0.4726140167545321|
|  1.0151073976529261|
|-0.18849528012283123|
|-0.16393879032423264|
| -0.4381200884023446|
|  -1.288481406644177|
|-0.08742130050547914|
|  1.2298398706864013|
|  1.0427117260243506|
| -1.2729514957825288|
|-0.06725781991495117|
|  0.5046261789850996|
|  0.9390367956134806|
|  0.9108001689593692|
+--------------------+
only showing top 20 rows



Metrics for the model without the cruise line

In [100]:
# Root mean squared error on the test split (lower is better).
test_results.rootMeanSquaredError

0.9961861436340924

In [101]:
# Coefficient of determination (r2) on the test split; closer to 1 is better.
test_results.r2

0.9289593158444861

As we can see, the r2 of the model without Cruise_line (0.929) is slightly worse than that of the model that includes it (0.932). The same happens with the rootMeanSquaredError, which is slightly higher without Cruise_line (0.996 vs. 0.972). Both are still very good models.