# Linear Regression Project Exercise

In [2]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('Project').getOrCreate()

In [3]:
data = spark.read.csv('cruise_ship_info.csv', header=True, inferSchema=True)

In [4]:
from pyspark.ml.regression import LinearRegression

In [5]:
data.printSchema()

root
 |-- Ship_name: string (nullable = true)
 |-- Cruise_line: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Tonnage: double (nullable = true)
 |-- passengers: double (nullable = true)
 |-- length: double (nullable = true)
 |-- cabins: double (nullable = true)
 |-- passenger_density: double (nullable = true)
 |-- crew: double (nullable = true)



In [28]:
for row in data.head(1)[0]:
    print(row)

Journey
Azamara
6
30.276999999999997
6.94
5.94
3.55
42.64
3.55


In [29]:
from pyspark.ml.feature import StringIndexer
indexer = StringIndexer(inputCol="Cruise_line", outputCol="cruise_cat")
indexed = indexer.fit(data).transform(data)
indexed.head(5)

[Row(Ship_name='Journey', Cruise_line='Azamara', Age=6, Tonnage=30.276999999999997, passengers=6.94, length=5.94, cabins=3.55, passenger_density=42.64, crew=3.55, cruise_cat=16.0),
 Row(Ship_name='Quest', Cruise_line='Azamara', Age=6, Tonnage=30.276999999999997, passengers=6.94, length=5.94, cabins=3.55, passenger_density=42.64, crew=3.55, cruise_cat=16.0),
 Row(Ship_name='Celebration', Cruise_line='Carnival', Age=26, Tonnage=47.262, passengers=14.86, length=7.22, cabins=7.43, passenger_density=31.8, crew=6.7, cruise_cat=1.0),
 Row(Ship_name='Conquest', Cruise_line='Carnival', Age=11, Tonnage=110.0, passengers=29.74, length=9.53, cabins=14.88, passenger_density=36.99, crew=19.1, cruise_cat=1.0),
 Row(Ship_name='Destiny', Cruise_line='Carnival', Age=17, Tonnage=101.353, passengers=26.42, length=8.92, cabins=13.21, passenger_density=38.36, crew=10.0, cruise_cat=1.0)]

In [7]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

In [32]:
indexed.columns

['Ship_name',
 'Cruise_line',
 'Age',
 'Tonnage',
 'passengers',
 'length',
 'cabins',
 'passenger_density',
 'crew',
 'cruise_cat']

In [33]:
assembler = VectorAssembler(inputCols=['Age',
                                       'Tonnage',
                                       'passengers',
                                       'length',
                                       'cabins',
                                       'passenger_density',
                                       'cruise_cat'],
                            outputCol='features')


In [35]:
output = assembler.transform(indexed)

In [36]:
output.head(1)

[Row(Ship_name='Journey', Cruise_line='Azamara', Age=6, Tonnage=30.276999999999997, passengers=6.94, length=5.94, cabins=3.55, passenger_density=42.64, crew=3.55, cruise_cat=16.0, features=DenseVector([6.0, 30.277, 6.94, 5.94, 3.55, 42.64, 16.0]))]

In [59]:
final_data = output.select('features', 'crew').show()

+--------------------+----+
|            features|crew|
+--------------------+----+
|[6.0,30.276999999...|3.55|
|[6.0,30.276999999...|3.55|
|[26.0,47.262,14.8...| 6.7|
|[11.0,110.0,29.74...|19.1|
|[17.0,101.353,26....|10.0|
|[22.0,70.367,20.5...| 9.2|
|[15.0,70.367,20.5...| 9.2|
|[23.0,70.367,20.5...| 9.2|
|[19.0,70.367,20.5...| 9.2|
|[6.0,110.23899999...|11.5|
|[10.0,110.0,29.74...|11.6|
|[28.0,46.052,14.5...| 6.6|
|[18.0,70.367,20.5...| 9.2|
|[17.0,70.367,20.5...| 9.2|
|[11.0,86.0,21.24,...| 9.3|
|[8.0,110.0,29.74,...|11.6|
|[9.0,88.5,21.24,9...|10.3|
|[15.0,70.367,20.5...| 9.2|
|[12.0,88.5,21.24,...| 9.3|
|[20.0,70.367,20.5...| 9.2|
+--------------------+----+
only showing top 20 rows



In [39]:
train_data, test_data = final_data.randomSplit([0.75, 0.25])

In [40]:
lr = LinearRegression(labelCol='crew')

In [52]:
model = lr.fit(train_data)

In [53]:
# Print the coefficients and intercept for linear regression
print("Coefficients: {} Intercept: {}".format(model.coefficients,model.intercept))

Coefficients: [-0.009141581530166998,0.016674861303517044,-0.15258882685697772,0.33898440846323297,0.8490657862319327,0.0009612775275085133,0.04712646762525038] Intercept: -0.9647841394547536


In [42]:
test_results = model.evaluate(test_data)

In [55]:
print("RMSE: {}".format(test_results.rootMeanSquaredError))
print("MSE: {}".format(test_results.meanSquaredError))
print("R2: {}".format(test_results.r2))

RMSE: 0.9018707123626378
MSE: 0.8133707818174919
R2: 0.917380997090131


In [57]:
from pyspark.sql.functions import corr
data.select(corr('crew','passengers')).show()

+----------------------+
|corr(crew, passengers)|
+----------------------+
|    0.9152341306065384|
+----------------------+



In [58]:
data.select(corr('crew','cabins')).show()

+------------------+
|corr(crew, cabins)|
+------------------+
|0.9508226063578497|
+------------------+



In [46]:
final_data.describe().show()

+-------+-----------------+
|summary|             crew|
+-------+-----------------+
|  count|              158|
|   mean|7.794177215189873|
| stddev|3.503486564627034|
|    min|             0.59|
|    max|             21.0|
+-------+-----------------+



In [47]:
unlabled_data = test_data.select('features')

In [48]:
unlabled_data.show()

+--------------------+
|            features|
+--------------------+
|[5.0,122.0,28.5,1...|
|[5.0,160.0,36.34,...|
|[6.0,158.0,43.7,1...|
|[8.0,91.0,22.44,9...|
|[8.0,110.0,29.74,...|
|[9.0,81.0,21.44,9...|
|[9.0,90.09,25.01,...|
|[9.0,113.0,26.74,...|
|[9.0,116.0,26.0,9...|
|[10.0,81.76899999...|
|[10.0,90.09,25.01...|
|[10.0,91.62700000...|
|[11.0,86.0,21.24,...|
|[11.0,90.09,25.01...|
|[11.0,91.0,20.32,...|
|[12.0,50.0,7.0,7....|
|[12.0,108.865,27....|
|[13.0,76.0,18.74,...|
|[13.0,85.619,21.1...|
|[13.0,138.0,31.14...|
+--------------------+
only showing top 20 rows



In [49]:
predictions = model.transform(unlabled_data)

In [50]:
predictions.show()

+--------------------+------------------+
|            features|        prediction|
+--------------------+------------------+
|[5.0,122.0,28.5,1...| 6.325840527215672|
|[5.0,160.0,36.34,...|15.351764800956136|
|[6.0,158.0,43.7,1...|14.078381266450878|
|[8.0,91.0,22.44,9...|10.080605649687339|
|[8.0,110.0,29.74,...|12.190360330086243|
|[9.0,81.0,21.44,9...|  9.48095551038799|
|[9.0,90.09,25.01,...| 9.223368249249082|
|[9.0,113.0,26.74,...|11.467603533884882|
|[9.0,116.0,26.0,9...|11.318658123728701|
|[10.0,81.76899999...| 8.767589534267378|
|[10.0,90.09,25.01...| 8.840637721776865|
|[10.0,91.62700000...|  9.24652856551638|
|[11.0,86.0,21.24,...| 9.495256951371797|
|[11.0,90.09,25.01...|   8.8314961402467|
|[11.0,91.0,20.32,...|  9.22686165146455|
|[12.0,50.0,7.0,7....| 4.640159228668051|
|[12.0,108.865,27....|10.926217318036914|
|[13.0,76.0,18.74,...| 8.716271280865175|
|[13.0,85.619,21.1...| 9.603112020435088|
|[13.0,138.0,31.14...|13.186089170187488|
+--------------------+------------