## 0. Importing Libraries

In [275]:
# For data manipulation
from pyspark.sql import SparkSession
from pyspark.sql.functions import when, count, col

# For data preprocessing
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml import Pipeline

# For model building
from pyspark.ml.regression import GBTRegressor
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.regression import LinearRegression

# For model evaluation
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

import warnings
warnings.filterwarnings('ignore')

## 1. Looking to the Dataset

In [276]:
spark = SparkSession.builder.appName('salarydata').getOrCreate()

In [277]:
df = spark.read.csv('data/salary.csv', header=True, inferSchema=True)

In [278]:
df.show(5)

+---+----+------+---------------+-----------------+-------------------+--------+-------+--------+
|_c0| Age|Gender|Education Level|        Job Title|Years of Experience|  Salary|Country|    Race|
+---+----+------+---------------+-----------------+-------------------+--------+-------+--------+
|  0|32.0|  Male|     Bachelor's|Software Engineer|                5.0| 90000.0|     UK|   White|
|  1|28.0|Female|       Master's|     Data Analyst|                3.0| 65000.0|    USA|Hispanic|
|  2|45.0|  Male|            PhD|   Senior Manager|               15.0|150000.0| Canada|   White|
|  3|36.0|Female|     Bachelor's|  Sales Associate|                7.0| 60000.0|    USA|Hispanic|
|  4|52.0|  Male|       Master's|         Director|               20.0|200000.0|    USA|   Asian|
+---+----+------+---------------+-----------------+-------------------+--------+-------+--------+
only showing top 5 rows



In [279]:
df.printSchema()

root
 |-- _c0: integer (nullable = true)
 |-- Age: double (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Education Level: string (nullable = true)
 |-- Job Title: string (nullable = true)
 |-- Years of Experience: double (nullable = true)
 |-- Salary: double (nullable = true)
 |-- Country: string (nullable = true)
 |-- Race: string (nullable = true)



In [280]:
num_rows = df.count()
num_cols = len(df.columns)

print("Number of rows: %d" % num_rows)
print("Number of columns: %d" % num_cols)

Number of rows: 6704
Number of columns: 9


In [281]:
df.toPandas().describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
_c0,6704.0,3351.5,1935.422435,0.0,1675.75,3351.5,5027.25,6703.0
Age,6702.0,33.620859,7.614633,21.0,28.0,32.0,38.0,62.0
Years of Experience,6701.0,8.094687,6.059003,0.0,3.0,7.0,12.0,34.0
Salary,6699.0,115326.964771,52786.183911,350.0,70000.0,115000.0,160000.0,250000.0


Looking at the 'count' column we can see different values in all rows, indicating that there are missing values

## 2. Checking missing values

In [282]:
df.toPandas().isnull().sum()

_c0                    0
Age                    2
Gender                 2
Education Level        3
Job Title              2
Years of Experience    3
Salary                 5
Country                0
Race                   0
dtype: int64

In [283]:
df = df.na.drop() #drop all rows with any null values

## 3. DataFrame Operations

<div style="padding-left: 20px;">
    <h3>3.1 Checking the Unique Values</h3>
</div>

<div style="padding-left: 20px;">
    <h4>3.1.1 Education Level</h4>
</div>

In [284]:
df.select('Education Level').distinct().show()

+-----------------+
|  Education Level|
+-----------------+
|      High School|
|  Master's Degree|
|Bachelor's Degree|
|              PhD|
|              phD|
|       Bachelor's|
|         Master's|
+-----------------+



Above we see that we have several values that have the same meaning, this can get in the way when analyzing the data, so let's convert them to a single value.

+ Master's Degree --> Master's 
+ Bachelor's Degree --> Bachelor's e 
+ phD --> PhD


In [285]:
df = df.withColumn("Education Level", when(col("Education Level") == "Master's Degree", "Master's").otherwise(col("Education Level"))) \
       .withColumn("Education Level", when(col("Education Level") == "phD", "PhD").otherwise(col("Education Level"))) \
       .withColumn("Education Level", when(col("Education Level") == "Bachelor's Degree", "Bachelor's").otherwise(col("Education Level")))

<div style="padding-left: 20px;">
    <h4>3.1.2 Race</h4>
</div>

In [286]:
df.select('Race').distinct().show()

+----------------+
|            Race|
+----------------+
|         Chinese|
|African American|
|      Australian|
|           Mixed|
|          Korean|
|           White|
|        Hispanic|
|           Black|
|           Asian|
|           Welsh|
+----------------+



<div style="padding-left: 20px;">
    <h4>3.1.3 Country</h4>
</div>

In [287]:
df.select('Country').distinct().show()

+---------+
|  Country|
+---------+
|    China|
|      USA|
|       UK|
|   Canada|
|Australia|
+---------+



<div style="padding-left: 20px;">
    <h4>3.1.4 Gender</h4>
</div>

In [288]:
df.select('Gender').distinct().show()

+------+
|Gender|
+------+
|Female|
| Other|
|  Male|
+------+



## 4. SQL Spark

Renaming columns to avoid spaces, since SQL does not recognize spaces in column names.

In [289]:
df = df.withColumnRenamed("Education Level", "EducationLevel") \
       .withColumnRenamed("Job Title", "JobTitle") \
       .withColumnRenamed("Years of Experience", "Experience") \
       .withColumnRenamed("_c0", "ID")

Using the ```'createOrReplaceTempView'``` command to allow queries on the DataFrame using SQL commands.

In [290]:
df.createOrReplaceTempView("salarydb")

In [291]:
spark.sql('select * from salarydb').show(5,False)

+---+----+------+--------------+-----------------+----------+--------+-------+--------+
|ID |Age |Gender|EducationLevel|JobTitle         |Experience|Salary  |Country|Race    |
+---+----+------+--------------+-----------------+----------+--------+-------+--------+
|0  |32.0|Male  |Bachelor's    |Software Engineer|5.0       |90000.0 |UK     |White   |
|1  |28.0|Female|Master's      |Data Analyst     |3.0       |65000.0 |USA    |Hispanic|
|2  |45.0|Male  |PhD           |Senior Manager   |15.0      |150000.0|Canada |White   |
|3  |36.0|Female|Bachelor's    |Sales Associate  |7.0       |60000.0 |USA    |Hispanic|
|4  |52.0|Male  |Master's      |Director         |20.0      |200000.0|USA    |Asian   |
+---+----+------+--------------+-----------------+----------+--------+-------+--------+
only showing top 5 rows



## 4.1 Count

### 4.1.1 Count by Country

In [292]:
spark.sql("SELECT Country,\
        COUNT(Country) AS count\
        FROM salarydb\
        GROUP BY Country\
        ORDER BY count DESC").show()

+---------+-----+
|  Country|count|
+---------+-----+
|      USA| 1359|
|    China| 1343|
|Australia| 1336|
|       UK| 1335|
|   Canada| 1325|
+---------+-----+



+   The amount of data from each country is similar

### 4.1.2 Count by Race

In [293]:
spark.sql("SELECT Race,\
        COUNT(Race) AS count\
        FROM salarydb\
        GROUP BY Race\
        ORDER BY count DESC").show()

+----------------+-----+
|            Race|count|
+----------------+-----+
|           White| 1962|
|           Asian| 1603|
|          Korean|  457|
|      Australian|  452|
|         Chinese|  444|
|           Black|  437|
|African American|  354|
|           Mixed|  334|
|           Welsh|  333|
|        Hispanic|  322|
+----------------+-----+



+ Most of people are **'White'** and **'Asian'**

### 4.1.3 Count by Gender

In [294]:
spark.sql("SELECT Gender,\
        COUNT(Gender) AS count\
        FROM salarydb\
        GROUP BY Gender\
        ORDER BY count DESC").show()

+------+-----+
|Gender|count|
+------+-----+
|  Male| 3671|
|Female| 3013|
| Other|   14|
+------+-----+



+ We have little data available on people who identify as **'Other'**

### 4.1.4 Education level vs Salary

In [295]:
spark.sql("SELECT \
    EducationLevel, \
    ROUND(AVG(Salary), 2) AS avg_salary \
FROM \
    salarydb \
GROUP BY \
    EducationLevel \
ORDER BY 2 DESC" ).show()

+--------------+----------+
|EducationLevel|avg_salary|
+--------------+----------+
|           PhD| 165651.46|
|      Master's| 130112.06|
|    Bachelor's|  95082.91|
|   High School|  36706.69|
+--------------+----------+



+ The salary of a graduate with **PhD** is much higher than any other type of degree

+ People who only have a **High School** have the lowest average salary

## 4.2 Gender

### 4.2.1 Gender vs Average Salary

In [296]:
spark.sql("SELECT \
    Gender, \
    ROUND(AVG(Salary), 2) AS avg_salary \
FROM \
    salarydb \
GROUP BY \
    Gender").show()

+------+----------+
|Gender|avg_salary|
+------+----------+
|Female|  107889.0|
| Other| 125869.86|
|  Male|  121395.7|
+------+----------+



+ Men get on average 14k more than women

### 4.2.2 Gender vs Average Salary by Education

In [297]:
spark.sql("SELECT \
    Gender, \
    EducationLevel, \
    ROUND(AVG(Salary), 2) AS avg_salary \
FROM \
    salarydb \
GROUP BY \
    Gender, \
    EducationLevel \
ORDER BY EducationLevel DESC").show()

+------+--------------+----------+
|Gender|EducationLevel|avg_salary|
+------+--------------+----------+
|  Male|           PhD| 168711.07|
|Female|           PhD| 160266.29|
| Other|      Master's|  161393.0|
|Female|      Master's| 122694.54|
|  Male|      Master's| 140060.59|
| Other|   High School| 119949.33|
|  Male|   High School|  39380.75|
|Female|   High School|  30756.05|
|Female|    Bachelor's|  89165.22|
|  Male|    Bachelor's|  98971.77|
+------+--------------+----------+



+ Men and women with the same level of education, men end up receiving on average 10k more than women


## 4.3 Country

### 4.3.1 Average salary by Country

In [298]:
spark.sql("SELECT \
    Country, \
    ROUND(AVG(Salary), 2) AS avg_salary \
FROM \
    salarydb \
GROUP BY \
    Country").show()

+---------+----------+
|  Country|avg_salary|
+---------+----------+
|    China| 116271.37|
|      USA| 112995.49|
|       UK|  115951.1|
|   Canada| 116556.73|
|Australia| 114917.38|
+---------+----------+



+ The average salary between country is similar

### 4.3.2 Average salary by Gender and Country 

In [299]:
spark.sql("SELECT \
    Gender, \
    Country, \
    ROUND(AVG(Salary), 2) AS avg_salary \
FROM \
    salarydb \
GROUP BY \
    Gender, \
    Country \
ORDER BY Country DESC").show()

+------+---------+----------+
|Gender|  Country|avg_salary|
+------+---------+----------+
|  Male|      USA| 119683.12|
|Female|      USA| 104992.69|
| Other|      USA| 111517.33|
|  Male|       UK| 122206.37|
| Other|       UK|  129797.0|
|Female|       UK| 108456.69|
| Other|    China| 112516.75|
|  Male|    China| 120226.31|
|Female|    China| 111291.21|
|  Male|   Canada| 123982.06|
| Other|   Canada|  161347.0|
|Female|   Canada| 106884.71|
|  Male|Australia| 120896.76|
| Other|Australia|  104127.0|
|Female|Australia| 107913.57|
+------+---------+----------+



+ **USA**: Men get on average 15k more than women
+ **UK**: Men get on average 14k more than women
+ **China**: Men get on average 9k more than women
+ **Canada**: Men get on average 17k more than women
+ **Australia**: Men get on average 13k more than women


## 4.4 Job Title

### 4.4.1 Top 10 jobs with the highest salary

In [300]:
spark.sql("SELECT \
    JobTitle, \
    ROUND(AVG(Salary), 2) AS avg_salary \
FROM \
    salarydb \
GROUP BY \
    JobTitle \
ORDER BY 2 DESC").show(10,False)  

+---------------------------+----------+
|JobTitle                   |avg_salary|
+---------------------------+----------+
|CEO                        |250000.0  |
|Chief Technology Officer   |250000.0  |
|Chief Data Officer         |220000.0  |
|Director of Data Science   |204561.4  |
|Director                   |200000.0  |
|VP of Finance              |200000.0  |
|VP of Operations           |190000.0  |
|Operations Director        |190000.0  |
|Director of Human Resources|187500.0  |
|Marketing Director         |183984.38 |
+---------------------------+----------+
only showing top 10 rows



### 4.4.2 Top 10 jobs with the most sum of salaries

In [301]:
spark.sql("SELECT \
    JobTitle, \
    COUNT(JobTitle) AS job_count, \
    ROUND(SUM(Salary), 2) AS avg_salary \
FROM \
    salarydb \
GROUP BY \
    JobTitle \
ORDER BY 3 DESC").show(10,False)  

+-------------------------+---------+-----------+
|JobTitle                 |job_count|avg_salary |
+-------------------------+---------+-----------+
|Data Scientist           |453      |7.5246E7   |
|Software Engineer Manager|376      |6.4860815E7|
|Software Engineer        |518      |5.866E7    |
|Senior Project Engineer  |318      |5.2859472E7|
|Data Analyst             |363      |4.5408E7   |
|Product Manager          |313      |4.4595E7   |
|Full Stack Engineer      |308      |3.9029711E7|
|Senior Software Engineer |244      |3.6923712E7|
|Marketing Manager        |255      |2.9877E7   |
|Back end Developer       |244      |2.4974025E7|
+-------------------------+---------+-----------+
only showing top 10 rows



## 5. Data Preprocessing

In [302]:
df.show(2)

+---+----+------+--------------+-----------------+----------+-------+-------+--------+
| ID| Age|Gender|EducationLevel|         JobTitle|Experience| Salary|Country|    Race|
+---+----+------+--------------+-----------------+----------+-------+-------+--------+
|  0|32.0|  Male|    Bachelor's|Software Engineer|       5.0|90000.0|     UK|   White|
|  1|28.0|Female|      Master's|     Data Analyst|       3.0|65000.0|    USA|Hispanic|
+---+----+------+--------------+-----------------+----------+-------+-------+--------+
only showing top 2 rows



To handle categorical data here, we will use the ```'StringIndexer,'``` which is a specific technique in Spark that works very similarly to the ```'label encoder'``` by transforming categorical attributes into integer numbers.

In [303]:
gender_indexer = StringIndexer(inputCol='Gender', outputCol='gender_index')
education_indexer = StringIndexer(inputCol='EducationLevel', outputCol='education_index')
job_indexer = StringIndexer(inputCol='JobTitle', outputCol='job_index')
country_indexer = StringIndexer(inputCol='Country', outputCol='country_index')
race_indexer = StringIndexer(inputCol='Race', outputCol='race_index')

Now, we place all independent values into a single column using ```'VectorAssembler'```

In [304]:
assembler = VectorAssembler(inputCols=['Age','gender_index', 'education_index','job_index','Experience','country_index','race_index'], outputCol='features')

# Create pipeline and fit it to the data
pipeline = Pipeline(stages=[gender_indexer, education_indexer,job_indexer,country_indexer,race_indexer, assembler])
df_indexed = pipeline.fit(df).transform(df)

In [305]:
df_indexed.select('features','Salary').show(2)

+--------------------+-------+
|            features| Salary|
+--------------------+-------+
|(7,[0,4,5],[32.0,...|90000.0|
|[28.0,1.0,1.0,3.0...|65000.0|
+--------------------+-------+
only showing top 2 rows



## 6. Splitting the dataset 

In [306]:
final_data = df_indexed.select('features','Salary')

In [307]:
train, test = final_data.randomSplit([0.7,0.3],seed=72)

## 7. Creating Model

### 7.1 Linear Regression

In [308]:
lr = LinearRegression(labelCol="Salary", featuresCol="features")

lrmodel = lr.fit(train)
predictions = lrmodel.transform(test)

predictions.select("prediction", "Salary", "features").show(5)

trainingSummary = lrmodel.summary

print("RMSE: %f" % trainingSummary.rootMeanSquaredError)
print("R2: %f" % trainingSummary.r2)

+------------------+--------+--------------------+
|        prediction|  Salary|            features|
+------------------+--------+--------------------+
| 76631.72591443139| 50000.0|(7,[0,1,4],[23.0,...|
|104168.36435671756|160000.0|(7,[0,1,4],[28.0,...|
| 94187.98779814204|140000.0|(7,[0,1,4],[29.0,...|
|101416.17151404737| 90000.0|(7,[0,1,4],[30.0,...|
|  79251.5455068082| 90000.0|(7,[0,3,4],[23.0,...|
+------------------+--------+--------------------+
only showing top 5 rows

RMSE: 28372.594047
R2: 0.709830


### 7.2 Random forest regression

In [329]:
rf = RandomForestRegressor(labelCol="Salary", featuresCol="features",maxBins=200)

rfmodel = rf.fit(train)

predictions = rfmodel.transform(test)

predictions.select("prediction", "Salary", "features").show(5)

rmse_evaluator = RegressionEvaluator(labelCol="Salary", predictionCol="prediction", metricName="rmse")
r2_evaluator = RegressionEvaluator(labelCol="Salary", predictionCol="prediction", metricName="r2")

rmse = rmse_evaluator.evaluate(predictions)
r2_score = r2_evaluator.evaluate(predictions)

print("R2 Score:", r2_score)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

+------------------+--------+--------------------+
|        prediction|  Salary|            features|
+------------------+--------+--------------------+
| 67384.73926096143| 50000.0|(7,[0,1,4],[23.0,...|
| 123644.5311780095|160000.0|(7,[0,1,4],[28.0,...|
|104361.57855784424|140000.0|(7,[0,1,4],[29.0,...|
|120697.01064712256| 90000.0|(7,[0,1,4],[30.0,...|
| 85791.79776284129| 90000.0|(7,[0,3,4],[23.0,...|
+------------------+--------+--------------------+
only showing top 5 rows

R2 Score: 0.8779664048939058
Root Mean Squared Error (RMSE) on test data = 18528.1


### 7.3 Gradient-boosted tree regression

In [363]:
gbt = GBTRegressor(labelCol="Salary", featuresCol="features", maxBins=200, maxIter=10)

gbtmodel = gbt.fit(train)

predictions_train = gbtmodel.transform(train)
predictions_test = gbtmodel.transform(test)

predictions_train.select("prediction", "Salary", "features").show(5)
predictions_test.select("prediction", "Salary", "features").show(5)

rmse_evaluator = RegressionEvaluator(labelCol="Salary", predictionCol="prediction", metricName="rmse")

rmse_train = rmse_evaluator.evaluate(predictions_train)
rmse_test = rmse_evaluator.evaluate(predictions_test)

r2_evaluator = RegressionEvaluator(labelCol="Salary", predictionCol="prediction", metricName="r2")

r2_score_train = r2_evaluator.evaluate(predictions_train)
r2_score_test = r2_evaluator.evaluate(predictions_test)

print("Root Mean Squared Error (RMSE) on train data =", rmse_train)
print("Root Mean Squared Error (RMSE) on test data =", rmse_test)

print("R2 Score on train data:", r2_score_train)
print("R2 Score on test data:", r2_score_test)

+------------------+--------+--------------------+
|        prediction|  Salary|            features|
+------------------+--------+--------------------+
| 73924.70057708598| 70000.0|(7,[0,1,4],[27.0,...|
| 139346.9476671564|160000.0|(7,[0,1,4],[28.0,...|
|103551.30855690011| 90000.0|(7,[0,1,4],[30.0,...|
|103551.30855690011| 90000.0|(7,[0,1,4],[30.0,...|
|103551.30855690011| 90000.0|(7,[0,1,4],[30.0,...|
+------------------+--------+--------------------+
only showing top 5 rows

+------------------+--------+--------------------+
|        prediction|  Salary|            features|
+------------------+--------+--------------------+
| 63639.62407528602| 50000.0|(7,[0,1,4],[23.0,...|
| 139346.9476671564|160000.0|(7,[0,1,4],[28.0,...|
|100239.88299555617|140000.0|(7,[0,1,4],[29.0,...|
|103551.30855690011| 90000.0|(7,[0,1,4],[30.0,...|
| 100771.1397294436| 90000.0|(7,[0,3,4],[23.0,...|
+------------------+--------+--------------------+
only showing top 5 rows

Root Mean Squared Error (RMSE) o

Analyzing the metrics of the three algorithms we tested, the Gradient-Boosted Tree Regression performed the best. Therefore, we will proceed with fine-tuning this model to further optimize its performance.

## 8. Tunning the model

In [362]:
gbt = GBTRegressor(featuresCol="features", labelCol="Salary")

param_grid = ParamGridBuilder() \
    .addGrid(gbt.maxBins, [200]) \
    .addGrid(gbt.maxIter, [10,20,50]) \
    .addGrid(gbt.subsamplingRate, [0.8, 1.0])  \
    .addGrid(gbt.minInstancesPerNode, [1, 5, 10]) \
    .addGrid(gbt.minInfoGain, [0.0, 0.1, 0.2]) \
    .build()

evaluator = RegressionEvaluator(labelCol="Salary", predictionCol="prediction", metricName="r2")

cv = CrossValidator(estimator=gbt, estimatorParamMaps=param_grid, evaluator=evaluator, numFolds=5)

cv_model = cv.fit(train)

best_model = cv_model.bestModel

In [364]:
predictions_train = best_model.transform(train)
predictions_test = best_model.transform(test)

predictions_train.select("prediction", "Salary", "features").show(5)
predictions_test.select("prediction", "Salary", "features").show(5)

rmse_evaluator = RegressionEvaluator(labelCol="Salary", predictionCol="prediction", metricName="rmse")

rmse_train = rmse_evaluator.evaluate(predictions_train)
rmse_test = rmse_evaluator.evaluate(predictions_test)

print("Root Mean Squared Error (RMSE) on train data =", rmse_train)
print("Root Mean Squared Error (RMSE) on test data =", rmse_test)

r2_evaluator = RegressionEvaluator(labelCol="Salary", predictionCol="prediction", metricName="r2")

r2_score_train = r2_evaluator.evaluate(predictions_train)
r2_score_test = r2_evaluator.evaluate(predictions_test)

print("R2 Score on train data:", r2_score_train)
print("R2 Score on test data:", r2_score_test)

+-----------------+--------+--------------------+
|       prediction|  Salary|            features|
+-----------------+--------+--------------------+
|74904.08510848839| 70000.0|(7,[0,1,4],[27.0,...|
|146028.1051142685|160000.0|(7,[0,1,4],[28.0,...|
|97965.75922119537| 90000.0|(7,[0,1,4],[30.0,...|
|97965.75922119537| 90000.0|(7,[0,1,4],[30.0,...|
|97965.75922119537| 90000.0|(7,[0,1,4],[30.0,...|
+-----------------+--------+--------------------+
only showing top 5 rows

+-----------------+--------+--------------------+
|       prediction|  Salary|            features|
+-----------------+--------+--------------------+
|54994.82676905389| 50000.0|(7,[0,1,4],[23.0,...|
|146028.1051142685|160000.0|(7,[0,1,4],[28.0,...|
| 95501.9932511042|140000.0|(7,[0,1,4],[29.0,...|
|97965.75922119537| 90000.0|(7,[0,1,4],[30.0,...|
|89405.65935632093| 90000.0|(7,[0,3,4],[23.0,...|
+-----------------+--------+--------------------+
only showing top 5 rows

Root Mean Squared Error (RMSE) on train data = 873

## 9. Testing Tuned model in unseen data

In [None]:
unseen_df = spark.read.csv("data/unseendata.csv", header=True, inferSchema=True)
unseen_df.show()

+---+-------+--------------+-------------------+----------+----------+-----------------+
|Age| Gender|EducationLevel|           JobTitle|Experience|   Country|             Race|
+---+-------+--------------+-------------------+----------+----------+-----------------+
| 25| Female|      Master's|     Data Scientist|       3.0|    Canada|            Asian|
| 40|   Male|           PhD|   Senior Scientist|      10.0|        UK|            Black|
| 35| Female|    Bachelor's|  Software Engineer|       7.0|     China|          Chinese|
| 33|  Other|   High School| Software Developer|       8.0|       USA| African American|
| 45|   Male|    Bachelor's|      Sales Manager|      12.0|        UK|            Mixed|
| 29|  Other|      Master's|      Data Engineer|       6.0| Australia|           Korean|
+---+-------+--------------+-------------------+----------+----------+-----------------+



In [343]:
unseen_indexed = pipeline.fit(unseen).transform(unseen)

In [344]:
unseen_final = unseen_indexed.select('features')

In [365]:
predictions = best_model.transform(unseen_final)

predictions.select("prediction","features").show(6,False)

+------------------+-------------------------------+
|prediction        |features                       |
+------------------+-------------------------------+
|116548.54312909255|[25.0,0.0,1.0,1.0,3.0,2.0,1.0] |
|114261.16186101131|[40.0,1.0,3.0,3.0,10.0,0.0,2.0]|
|122497.75515801723|[35.0,0.0,0.0,5.0,7.0,3.0,3.0] |
|127011.24425217317|[33.0,2.0,2.0,4.0,8.0,4.0,0.0] |
|167825.64581715153|[45.0,1.0,0.0,2.0,12.0,0.0,5.0]|
|170877.09535562355|[29.0,2.0,1.0,0.0,6.0,1.0,4.0] |
+------------------+-------------------------------+



### 10. Stop Spark Session

In [366]:
spark.stop()