In [1]:
from pyspark.sql import SparkSession

# Reuse the active Spark session if there is one, otherwise create a new one
# under the application name 'linear_reg'.
spark = (
    SparkSession.builder
    .appName('linear_reg')
    .getOrCreate()
)

In [2]:
# Load the cruise-ship CSV: the first line is treated as the header and the
# column types are inferred from the data.
df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv('/FileStore/tables/cruise_ship_info.csv')
)

In [3]:
# Print a preview of the loaded DataFrame as a sanity check on the CSV parse.
df.show()

In [4]:
# Encode the categorical 'Cruise_line' column as a numeric index column
# ('Cruise_line_cat') so it can be used as a model input.
from pyspark.ml.feature import StringIndexer
indexer = StringIndexer(inputCol='Cruise_line', outputCol='Cruise_line_cat')
# Drop any previous copy of the output column first: transform() refuses to
# write to an existing column, so without this the cell fails when re-run.
# drop() is a no-op when the column is absent, so the first run is unaffected.
df = df.drop('Cruise_line_cat')
df = indexer.fit(df).transform(df)

In [5]:
# Preview again: the frame now carries the extra 'Cruise_line_cat' column.
df.show()

In [6]:
# List the column names in order; the feature selection below is positional.
df.columns

In [7]:
# Candidate feature columns: positions 2 through 7 plus the last column.
[*df.columns[2:8], *df.columns[-1:]]

In [8]:
# Assemble the feature columns into the single vector column that PySpark ML
# estimators expect, keeping 'crew' alongside it as the label.
from pyspark.ml.feature import VectorAssembler

# Features are picked by position (columns 2-7 plus the last column), so check
# the selection before using it: the label must not leak into the features and
# the indexed cruise line is expected to be the last column.
feature_cols = df.columns[2:8] + df.columns[-1:]
assert 'crew' not in feature_cols, "label column 'crew' leaked into the features"
assert 'Cruise_line_cat' in feature_cols, "indexed column 'Cruise_line_cat' missing from the features"

vectorAssembler = VectorAssembler(inputCols=feature_cols, outputCol='features')
df_final = vectorAssembler.transform(df).select('features', 'crew')

In [9]:
# Preview the model-ready frame: a 'features' vector and the 'crew' label per row.
df_final.show()

In [10]:
# Train/test split (roughly 70% / 30% of the rows).
# A fixed seed makes the split, and therefore every metric computed from it,
# reproducible across kernel restarts; without it each run samples differently.
train_data, test_data = df_final.randomSplit([0.7, 0.3], seed=42)

In [11]:
# Summary statistics for the training split.
train_data.describe().show()

In [12]:
# Summary statistics for the test split, to compare against the training split.
test_data.describe().show()

In [13]:
# Fit a linear regression of 'crew' on the assembled 'features' vector.
# NOTE: `regressor` ends up bound to the fitted model, not the estimator.
from pyspark.ml.regression import LinearRegression
regressor = LinearRegression(
    featuresCol='features',
    labelCol='crew',
    predictionCol='predictions',
).fit(train_data)

In [14]:
# Evaluate the fitted model on the held-out test split; the returned summary
# exposes the metrics inspected in the following cells.
test_results = regressor.evaluate(test_data)

In [15]:
# Root mean squared error on the test split (same units as 'crew').
test_results.rootMeanSquaredError

In [16]:
# Coefficient of determination (R^2) on the test split.
test_results.r2

In [17]:
# Summary statistics for the columns of the full (indexed) dataset.
df.describe().show()

In [18]:
# Pearson correlation between the label 'crew' and 'passengers'.
# The `corr` import here is reused by the correlation cells that follow.
from pyspark.sql.functions import corr
df.select(corr('crew', 'passengers')).show()

In [19]:
# Pearson correlation between 'crew' and 'cabins'.
df.select(corr('crew', 'cabins')).show()

In [20]:
# Pearson correlation between 'crew' and 'Tonnage'.
df.select(corr('crew', 'Tonnage')).show()

In [21]:
# Pearson correlation between 'crew' and 'Age'.
df.select(corr('crew', 'Age')).show()

In [22]:
# Predict on the test split with the label removed: only 'features' is kept,
# and transform() adds the model's 'predictions' column.
unlabeled_data = test_data.select('features')
predictions = regressor.transform(unlabeled_data)

In [23]:
# First 10 predicted crew values.
predictions.select('predictions').head(10)

In [24]:
# First 10 actual 'crew' values from the test split, for comparison with the
# predicted values shown by the previous cell.
# NOTE(review): assumes both head(10) calls return rows in the same order — confirm.
test_data.select('crew').head(10)