# Hyundai Ships Crew Estimates Using Linear Regression & PySpark

## Linear Regression on a Hyundai Heavy Industries dataset of ships to estimate the crew members needed for a new line of ships created for different customers

In [0]:
# Obtain the Spark session for this notebook via getOrCreate() and load the
# cruise-ship CSV into a DataFrame. header=True takes the column names from
# the first row; inferSchema=True lets Spark infer each column's type.
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName('proj').getOrCreate()
data = spark.read.csv(
    'dbfs:/FileStore/shared_uploads/hrishagni95@gmail.com/cruise_ship_info.csv',
    header=True,
    inferSchema=True,
)

In [0]:
# Summarise the dataset: build the summary DataFrame with describe() and
# print it, as a first sanity check on the loaded values
data.describe().show()

In [0]:
# List the column names of the dataset, so we know which fields are
# available to use as features and as the label
data.columns

In [0]:
# Print a sample of rows to eyeball the raw values in each column
data.show()

In [0]:
# Print the schema of the DataFrame loaded from the CSV file, to confirm the
# column types that were inferred at read time (inferSchema=True)
data.printSchema()

In [0]:
# Import StringIndexer
from pyspark.ml.feature import StringIndexer

In [0]:
# Use StringIndexer to convert the categorical 'Cruise_line' string column
# into a numeric 'Cruise_Line_Index' column that can be fed to the model.
# The estimator must be bound to `indexer`: it is fitted on `data` to learn
# the label-to-index mapping and then applied to the same data to produce `df`.
# NOTE(review): the resulting index is an arbitrary ordinal encoding of the
# cruise lines; one-hot encoding it may suit a linear model better — confirm.
indexer = StringIndexer(inputCol="Cruise_line", outputCol="Cruise_Line_Index")
df = indexer.fit(data).transform(data)
df.show()

In [0]:
# Import function LinearRegression and VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

In [0]:
# Build the VectorAssembler that packs the numeric predictor columns (plus the
# indexed cruise line) into a single vector column named 'features', which is
# the input the linear regression in the later cells is trained on.
assembler = VectorAssembler(
    inputCols=[
        'Age',
        'Tonnage',
        'passengers',
        'length',
        'cabins',
        'passenger_density',
        'Cruise_Line_Index',
    ],
    outputCol='features',
)

In [0]:
# Use the vector assembler to transform the indexed dataframe, producing a
# dataframe that carries the assembled 'features' vector column
output_df=assembler.transform(df)

In [0]:
# Print the assembled dataframe to verify the new 'features' column
output_df.show()

In [0]:
# Select only the fields the regression model needs: the label ('crew') and
# the assembled 'features' vector
final_df=output_df.select('crew','features')

In [0]:
# Print the label/features pairs that will be split into train and test sets
final_df.show()

In [0]:
# Use the randomSplit() method to split the dataframe into a training data set
# and a testing data set with a 70/30 split ratio. A fixed seed keeps the
# split — and therefore the fitted model and its R2 score — repeatable from
# one run of the notebook to the next.
train_data, test_data = final_df.randomSplit([0.7, 0.3], seed=42)

In [0]:
# Create the LinearRegression estimator with 'crew' as the label column.
# No features column is named here, so the estimator's default is used
# (presumably 'features', matching the assembler's outputCol — confirm).
lr=LinearRegression(labelCol='crew')

In [0]:
# Fit the regression on the training data only; the test set stays unseen
# so it can be used for evaluation afterwards
lr_model=lr.fit(train_data)

In [0]:
# With the model fitted on the training data, evaluate it on the held-out
# testing data set; the result object exposes the evaluation metrics
res=lr_model.evaluate(test_data)

In [0]:
# Inspect the R2 score on the test set to understand how well the model
# predicts crew size for ships it was not trained on
res.r2

In [0]:
# An R2 score of 0.88 (the exact value depends on the train/test split) means that we have been quite successful in our modelling so far

In [0]:
# After model training and testing, produce the crew estimates Hyundai is
# looking for by running the model on an unlabeled data set, i.e. one where
# the 'crew' field does not exist. Here it is built by keeping only the
# 'features' column of output_df, so it contains the same ships as the
# original dataset with the label dropped.
unlabeled_data=output_df.select('features')
unlabeled_data.show()

In [0]:
# Apply the fitted model to the unlabeled features and print the result,
# which carries the model's crew estimate for each row
pred=lr_model.transform(unlabeled_data)
pred.show()