In [1]:
# Install pyspark
!pip install pyspark

# Import SparkSession
from pyspark.sql import SparkSession

# Create a Spark Session
spark = SparkSession.builder.master("local[*]").getOrCreate()

# Check Spark Session Information
spark

# Import a Spark function from library
from pyspark.sql.functions import col

!pip install findspark


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyspark
  Downloading pyspark-3.4.0.tar.gz (310.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m310.8/310.8 MB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.4.0-py2.py3-none-any.whl size=311317145 sha256=2b06983c2b1d65ee61a0f22a7edfe9f5e66e704bf57a07f91bb7f6903f7c4aea
  Stored in directory: /root/.cache/pip/wheels/7b/1b/4b/3363a1d04368e7ff0d408e57ff57966fcdf00583774e761327
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.4.0
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting findspark
  Downloading findspark-2.0.1-py2.py3-none-any.whl (4.4 k

You are given housing prices for a given market (WestRoxbury.csv Download WestRoxbury.csv)
Create a Jupyter notebook that analyzes this data using PySpark. Load the data into a pyspark data frame and conduct any necessary datatyping/casting. In the notebook, you must answer the following questions using code. 
1) Identify the top 10 most expensive homes.
2) Does remodeling a home indicate higher prices (simply compare the average between recently remodeled homes versus those that were not).
3) Create a linear regression model that predicts home prices using LOT SQFT, YR Built, Gross Area, Living Area, and REMODEL variables. 
4) Using your predictive model, determine the price of a home with the following:
LOT SQRT: 7500
YR Built: 1990
Gross Area: 2100
Living Area: 1900
Remodel: Recent


In [2]:
import findspark
findspark.init()

from pyspark.sql import SparkSession;

spark = SparkSession.builder.config("spark.driver.host","localhost").master("local[4]").appName("ISM6562 Spark Assignment App").getOrCreate();

# Let's get the SparkContext object. It's the entry point to the Spark API. It's created when you create a sparksession
sc = spark.sparkContext  

# note: If you have multiple spark sessions running (like from a previous notebook you've run), 
# this spark session webUI will be on a different port than the default (4040). One way to 
# identify this part is with the following line. If there was only one spark session running, 
# this will be 4040. If it's higher, it means there are still other spark sesssions still running.
spark_session_port = spark.sparkContext.uiWebUrl.split(":")[-1]
print("Spark Session WebUI Port: " + spark_session_port)

Spark Session WebUI Port: 4040


In [3]:
# this will set the log level to ERROR. This will hide the INFO or WARNING messages that are printed out by default. If you want to see them, set this to INFO or WARN.
sc.setLogLevel("ERROR") 

In [4]:
spark

## Loading our data into spark dataframe. 

In [5]:
# Load CSV file
df_spark = spark.read.csv("sample_data/WestRoxbury.csv", header=True, inferSchema=True)
df_spark.show()

+------------+----+---------+--------+-----------+-----------+-------+-----+---------+---------+---------+-------+---------+-------+
|TOTAL VALUE | TAX|LOT SQFT |YR BUILT|GROSS AREA |LIVING AREA|FLOORS |ROOMS|BEDROOMS |FULL BATH|HALF BATH|KITCHEN|FIREPLACE|REMODEL|
+------------+----+---------+--------+-----------+-----------+-------+-----+---------+---------+---------+-------+---------+-------+
|       344.2|4330|     9965|    1880|       2436|       1352|    2.0|    6|        3|        1|        1|      1|        0|   None|
|       412.6|5190|     6590|    1945|       3108|       1976|    2.0|   10|        4|        2|        1|      1|        0| Recent|
|       330.1|4152|     7500|    1890|       2294|       1371|    2.0|    8|        4|        1|        1|      1|        0|   None|
|       498.6|6272|    13773|    1957|       5032|       2608|    1.0|    9|        5|        1|        1|      1|        1|   None|
|       331.5|4170|     5000|    1910|       2370|       1438|    2.0

## Rename Column 

In [6]:
df_renamed = df_spark.withColumnRenamed("TOTAL VALUE ","total_value").withColumnRenamed("TAX","tax").withColumnRenamed("LOT SQFT ","lot_sqft").withColumnRenamed("YR BUILT","yr_built").withColumnRenamed("GROSS AREA ","gross_area").withColumnRenamed("LIVING AREA","living_area").withColumnRenamed("FLOORS ","floors").withColumnRenamed("ROOMS","rooms").withColumnRenamed("BEDROOMS ","bedrooms").withColumnRenamed("FULL BATH","full_bath").withColumnRenamed("HALF BATH","half_bath").withColumnRenamed("KITCHEN","kitchen").withColumnRenamed("FIREPLACE","fireplace").withColumnRenamed("REMODEL","remodel")

df_renamed.show()

+-----------+----+--------+--------+----------+-----------+------+-----+--------+---------+---------+-------+---------+-------+
|total_value| tax|lot_sqft|yr_built|gross_area|living_area|floors|rooms|bedrooms|full_bath|half_bath|kitchen|fireplace|remodel|
+-----------+----+--------+--------+----------+-----------+------+-----+--------+---------+---------+-------+---------+-------+
|      344.2|4330|    9965|    1880|      2436|       1352|   2.0|    6|       3|        1|        1|      1|        0|   None|
|      412.6|5190|    6590|    1945|      3108|       1976|   2.0|   10|       4|        2|        1|      1|        0| Recent|
|      330.1|4152|    7500|    1890|      2294|       1371|   2.0|    8|       4|        1|        1|      1|        0|   None|
|      498.6|6272|   13773|    1957|      5032|       2608|   1.0|    9|       5|        1|        1|      1|        1|   None|
|      331.5|4170|    5000|    1910|      2370|       1438|   2.0|    7|       3|        2|        0|   

# Data Exploration and Transformations 

In [7]:
df_renamed.printSchema()

root
 |-- total_value: double (nullable = true)
 |-- tax: integer (nullable = true)
 |-- lot_sqft: integer (nullable = true)
 |-- yr_built: integer (nullable = true)
 |-- gross_area: integer (nullable = true)
 |-- living_area: integer (nullable = true)
 |-- floors: double (nullable = true)
 |-- rooms: integer (nullable = true)
 |-- bedrooms: integer (nullable = true)
 |-- full_bath: integer (nullable = true)
 |-- half_bath: integer (nullable = true)
 |-- kitchen: integer (nullable = true)
 |-- fireplace: integer (nullable = true)
 |-- remodel: string (nullable = true)



# Finding missing values

In [8]:
from pyspark.sql.functions import col

# Find columns with missing values
columns_with_missing_values = [column for column in df_renamed.columns if df_renamed.filter(col(column).isNull()).count() > 0]

# Print columns with missing values
print("Columns with missing values:")
for column in columns_with_missing_values:
    print(column)


Columns with missing values:


# Imputing missing values

In [None]:
#from pyspark.sql.functions import col

# Group by the column and apply the count() function
#count_df = df_renamed.groupBy("TypeofContact").count()

# Show the resulting counts
#count_df.show()

In [None]:
#df_spark = df_renamed.fillna("Self Enquiry", subset=["TypeofContact"])

Imputing with missing value with median value for numeric values

In [None]:
# #from pyspark.sql.functions import col
# from pyspark.sql.functions import percentile_approx
# from pyspark.sql.functions import when

# # Iterate over columns with missing values
# for column in columns_with_missing_values:
#     # Calculate median of the column
#     median_value = df_spark.select(column).agg(percentile_approx(column, 0.5)).collect()[0][0]
#     if median_value is not None:
#     # Round median_value to nearest integer
#         median_value_rounded = int(round(median_value))
#     else:
#         median_value_rounded = 0
    
#     # Impute missing values with median value
#     df_spark = df_spark.withColumn(column, when(col(column).isNull(), median_value_rounded).otherwise(col(column)))

In [None]:
# # verifying if all missing values were imputed
# columns_with_missing_values = [column for column in df_spark.columns if df_spark.filter(col(column).isNull()).count() > 0]

# # Print columns with missing values
# print("Columns with missing values:")
# for column in columns_with_missing_values:
#     print(column)

In [None]:
# # Save imputed data with original header

# df_spark.write.option("header", True).csv("/content/Big Data Output/processed_travel_withheader.csv")

In [None]:
# df_spark.show(20)

# Storing the data into a persistent table and creating a temp view of the data


In [12]:
# Create a database
spark.sql("CREATE DATABASE IF NOT EXISTS HOUSING")

# Use the database
spark.sql("USE HOUSING")

DataFrame[]

In [11]:
#spark.sql("DROP DATABASE HOUSING")

DataFrame[]

In [None]:
# To drop the Table
#spark.sql("DROP TABLE house_sale")

In [13]:
# Store the data into a persistent table in the Travel db
df_renamed.write.saveAsTable("house_sales")

In [14]:
# Create a temporary view of the data
df_renamed.createOrReplaceTempView("housing_view")

In [15]:
# Verify if the table exists in the created database
check = spark.sql("SHOW TABLES")
if check.filter(check.tableName == "housing_view").count() > 0:
    print("Table exists in the created database.")

Table exists in the created database.


# Now let us find some insights by using the aggregation.

<p style="color:brown;">1) Identify the top 10 most expensive homes. </p>


In [16]:
top_exp_homes = spark.sql("SELECT * FROM housing_view order by total_value desc limit 10").show()

+-----------+-----+--------+--------+----------+-----------+------+-----+--------+---------+---------+-------+---------+-------+
|total_value|  tax|lot_sqft|yr_built|gross_area|living_area|floors|rooms|bedrooms|full_bath|half_bath|kitchen|fireplace|remodel|
+-----------+-----+--------+--------+----------+-----------+------+-----+--------+---------+---------+-------+---------+-------+
|     1217.8|15319|   46411|    1940|      7144|       4037|   2.0|    8|       5|        3|        1|      1|        3|   None|
|     1009.9|12704|   13300|    2010|      7624|       4133|   2.0|    9|       4|        3|        1|      1|        2|   None|
|      996.9|12541|   10050|    2006|      5392|       4375|   2.0|   11|       4|        4|        0|      1|        1|   None|
|      936.0|11774|   20560|    1901|      8154|       4106|   2.0|    9|       4|        3|        1|      1|        2|   None|
|      935.1|11763|   25200|    1954|      6840|       5289|   1.0|   13|       9|        2|     


<p style="color:brown;">2) Does remodeling a home indicate higher prices (simply compare the average between recently remodeled homes versus those that were not).</p>


In [17]:
total_result = spark.sql("""
    select remodel,avg(total_value) from housing_view group by remodel
""").show()

+-------+------------------+
|remodel|  avg(total_value)|
+-------+------------------+
|   None| 380.1891396686604|
|    Old|405.08852839931154|
| Recent| 446.5189508571427|
+-------+------------------+



3) Create a linear regression model that predicts home prices using LOT SQFT, YR Built, Gross Area, Living Area, and REMODEL variables. 

Let's select a the subset of columns we are interested in for our analysis.

In [18]:
df_data =df_renamed.select([
    'lot_sqft',
    'yr_built',
    'gross_area',
    'living_area',
    'remodel',
    'total_value'
    ]
)

In [19]:
df_data.count()

5802

In [20]:
#Remove any rows from the DataFrame that contain missing values
df_data = df_data.dropna()


In [21]:
df_data.count()

5802

In [22]:
df_data.show()

+--------+--------+----------+-----------+-------+-----------+
|lot_sqft|yr_built|gross_area|living_area|remodel|total_value|
+--------+--------+----------+-----------+-------+-----------+
|    9965|    1880|      2436|       1352|   None|      344.2|
|    6590|    1945|      3108|       1976| Recent|      412.6|
|    7500|    1890|      2294|       1371|   None|      330.1|
|   13773|    1957|      5032|       2608|   None|      498.6|
|    5000|    1910|      2370|       1438|   None|      331.5|
|    5142|    1950|      2124|       1060|    Old|      337.4|
|    5000|    1954|      3220|       1916|   None|      359.4|
|   10000|    1950|      2208|       1200|   None|      320.4|
|    6835|    1958|      2582|       1092| Recent|      333.5|
|    5093|    1900|      4818|       2992|   None|      409.4|
|    5000|    1960|      2624|       1485|   None|      313.0|
|    6768|    1958|      2844|       1460|   None|      344.5|
|    5000|    1889|      2196|       1290|   None|     

#Model Training

We earlier created the incident table with 'inferSchema' set to true. This means that the schema of the table is inferred from the data. The knowledge column was properly inferred as a boolean, however, in the pipeline below with use StringIndexer on this column - and therefore, need to convert this column data type to a string.

In [23]:
df_data.printSchema()

root
 |-- lot_sqft: integer (nullable = true)
 |-- yr_built: integer (nullable = true)
 |-- gross_area: integer (nullable = true)
 |-- living_area: integer (nullable = true)
 |-- remodel: string (nullable = true)
 |-- total_value: double (nullable = true)



CONVERTING BOOLEAN DATA TO NUMERICAL


In [None]:
#from pyspark.sql.types import IntegerType,BooleanType,DateType, StringType

#df_closed_incidents = df_data.withColumn("knowledge",df_data.knowledge.cast(StringType()))

In [None]:
#df_closed_incidents.printSchema()

Now that we have our data ready, let's do a train test split (70/30).

In [24]:
train_data,test_data=df_data.randomSplit([0.7,0.3])

CONVERTING STRING TO NUMERICAL

In [25]:
from pyspark.ml.feature import StringIndexer
# Use StringIndexer to convert the categorical columns to hold numerical data
 
remodel_indexer = StringIndexer(inputCol='remodel',outputCol='remodel_indexer',handleInvalid='keep')


In [26]:
from pyspark.ml.feature import VectorAssembler
# Vector assembler is used to create a vector of input features
 
assembler = VectorAssembler(
    inputCols=[
        "remodel_indexer",
        "lot_sqft",
        "yr_built",
        "gross_area",
        "living_area"   
    ],
    outputCol="features"
)

In [27]:
from pyspark.ml import Pipeline

# Pipeline is used to pass the data through indexer and assembler simultaneously. Also, it helps to pre-rocess the test data
# in the same way as that of the train data
# https://spark.apache.org/docs/latest/ml-pipeline.html
 
pipe = Pipeline(stages=[
    remodel_indexer,
    assembler
    ]
)

In [28]:
fitted_pipe=pipe.fit(train_data)

In [29]:
train_data=fitted_pipe.transform(train_data)
train_data.show()

+--------+--------+----------+-----------+-------+-----------+---------------+--------------------+
|lot_sqft|yr_built|gross_area|living_area|remodel|total_value|remodel_indexer|            features|
+--------+--------+----------+-----------+-------+-----------+---------------+--------------------+
|    1017|    1920|      1293|        797|   None|      144.6|            0.0|[0.0,1017.0,1920....|
|    1037|    1920|      2520|       1680|   None|      176.9|            0.0|[0.0,1037.0,1920....|
|    1218|    1920|      1965|        971|   None|      204.7|            0.0|[0.0,1218.0,1920....|
|    1358|    1904|      2576|       1598|   None|      244.2|            0.0|[0.0,1358.0,1904....|
|    1616|    1920|      1439|        856|   None|      182.7|            0.0|[0.0,1616.0,1920....|
|    1696|    1904|      2496|       1518|   None|      213.4|            0.0|[0.0,1696.0,1904....|
|    1800|    1942|      3663|       1904|   None|      301.2|            0.0|[0.0,1800.0,1942....|


In [30]:
test_data=fitted_pipe.transform(test_data)
test_data.show()

+--------+--------+----------+-----------+-------+-----------+---------------+--------------------+
|lot_sqft|yr_built|gross_area|living_area|remodel|total_value|remodel_indexer|            features|
+--------+--------+----------+-----------+-------+-----------+---------------+--------------------+
|     997|    1910|      1319|        504|   None|      105.0|            0.0|[0.0,997.0,1910.0...|
|    1237|    1990|      3039|       1908|   None|      295.6|            0.0|[0.0,1237.0,1990....|
|    1980|    1900|      1502|        690|   None|      167.6|            0.0|[0.0,1980.0,1900....|
|    2100|    1900|      1800|        972| Recent|      262.1|            1.0|[1.0,2100.0,1900....|
|    2100|    1900|      2414|       1544| Recent|      323.0|            1.0|[1.0,2100.0,1900....|
|    2126|    1845|      1719|        958|   None|      276.6|            0.0|[0.0,2126.0,1845....|
|    2174|    1920|      2952|       1382|   None|      203.7|            0.0|[0.0,2174.0,1920....|


In [31]:
# # For those interested in utilizing the ML/AI power of Tensorflow with Spark....
# # https://github.com/tensorflow/ecosystem/tree/master/spark/spark-tensorflow-distributor

# # In this course, we'll use the SparkML (admitedely, it's not as powerful as Tensorflow, but 
# # it's easy to use and demonstrate ML on a Spark Cluster)

from pyspark.ml.regression import LinearRegression

lr_model = LinearRegression(labelCol='total_value')
fit_model = lr_model.fit(train_data.select(['features','total_value']))


In [32]:
results = fit_model.transform(test_data)
results.show()

+--------+--------+----------+-----------+-------+-----------+---------------+--------------------+------------------+
|lot_sqft|yr_built|gross_area|living_area|remodel|total_value|remodel_indexer|            features|        prediction|
+--------+--------+----------+-----------+-------+-----------+---------------+--------------------+------------------+
|     997|    1910|      1319|        504|   None|      105.0|            0.0|[0.0,997.0,1910.0...| 187.1347393488042|
|    1237|    1990|      3039|       1908|   None|      295.6|            0.0|[0.0,1237.0,1990....| 382.0812333687375|
|    1980|    1900|      1502|        690|   None|      167.6|            0.0|[0.0,1980.0,1900....|218.65390159994482|
|    2100|    1900|      1800|        972| Recent|      262.1|            1.0|[1.0,2100.0,1900....|265.54817366238495|
|    2100|    1900|      2414|       1544| Recent|      323.0|            1.0|[1.0,2100.0,1900....| 340.5696575246353|
|    2126|    1845|      1719|        958|   Non

In [33]:
results.select(['total_value','prediction']).show()

+-----------+------------------+
|total_value|        prediction|
+-----------+------------------+
|      105.0| 187.1347393488042|
|      295.6| 382.0812333687375|
|      167.6|218.65390159994482|
|      262.1|265.54817366238495|
|      323.0| 340.5696575246353|
|      276.6| 250.3802202491721|
|      203.7|325.82902243401986|
|      275.8| 306.3674458486202|
|      256.4| 349.0979734565778|
|      422.1| 397.3975293667312|
|      395.2|381.76870358446655|
|      391.4| 371.4812674177571|
|      364.9| 419.1469344647098|
|      215.5| 283.2430152704797|
|      266.3|357.76836166431997|
|      272.9| 330.6721043279091|
|      230.6|260.73334460121504|
|      291.7|325.00484421791236|
|      284.3|327.55021353842784|
|      261.6|  284.428689073596|
+-----------+------------------+
only showing top 20 rows



Evaluate the peformance of the Linear Regression Model

In [34]:
test_results = fit_model.evaluate(test_data)

In [35]:
test_results.residuals.show()

+-------------------+
|          residuals|
+-------------------+
|  -82.1347393488042|
| -86.48123336873749|
| -51.05390159994482|
|-3.4481736623849315|
|-17.569657524635318|
|  26.21977975082791|
|-122.12902243401987|
|-30.567445848620196|
| -92.69797345657781|
|  24.70247063326883|
| 13.431296415533438|
| 19.918732582242853|
|-54.246934464709796|
|  -67.7430152704797|
| -91.46836166431996|
| -57.77210432790912|
|-30.133344601215043|
|-33.304844217912375|
| -43.25021353842783|
|-22.828689073595967|
+-------------------+
only showing top 20 rows



In [36]:
print(f"{'RMSE:':7s} {test_results.rootMeanSquaredError:>7.3f}")
print(f"{'Ex Var:':7s} {test_results.explainedVariance:>7.3f}")
print(f"{'MAE:':7s} {test_results.meanAbsoluteError:>7.3f}")
print(f"{'MSE:':7s} {test_results.meanSquaredError:>7.3f}")
print(f"{'RMSE:':7s} {test_results.rootMeanSquaredError:>7.3f}")
print(f"{'R2:':7s} {test_results.r2:>7.3f}")

RMSE:    48.705
Ex Var: 7569.807
MAE:     37.179
MSE:    2372.141
RMSE:    48.705
R2:       0.734


4) Using your predictive model, determine the price of a home with the following:
LOT SQRT: 7500
YR Built: 1990
Gross Area: 2100
Living Area: 1900
Remodel: Recent


In [43]:
from pyspark.ml.feature import VectorAssembler, StringIndexer, OneHotEncoder

from pyspark.ml.linalg import Vectors

# Create a test dataframe with the given features

prediction_data = [(7500, 1990, 2100, 1900, 1)]

test_df = spark.createDataFrame(prediction_data, ['lot_sqft', 'yr_built', 'gross_area', 'living_area', 'remodel_indexer'])

In [44]:
house_df = assembler.transform(test_df)

In [45]:
new_house_features =Vectors.dense(house_df.select('features').collect()[0][0])
predicted_price = fit_model.predict(new_house_features)


In [46]:
print(f'price is  {predicted_price:.2f} dollars ' )

price is  423.30 dollars 
