In [1]:
# Install pyspark
!pip install pyspark

# Import SparkSession
from pyspark.sql import SparkSession

# Create a Spark Session
spark = SparkSession.builder.master("local[*]").getOrCreate()

# Check Spark Session Information
spark

# Import a Spark function from library
from pyspark.sql.functions import col

!pip install findspark


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyspark
  Downloading pyspark-3.4.0.tar.gz (310.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m310.8/310.8 MB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.4.0-py2.py3-none-any.whl size=311317145 sha256=064a2110d18731038fe8608daa1a10a9dea857ca5d84c747387d6877b710466b
  Stored in directory: /root/.cache/pip/wheels/7b/1b/4b/3363a1d04368e7ff0d408e57ff57966fcdf00583774e761327
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.4.0
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting findspark
  Downloading findspark-2.0.1-py2.py3-none-any.whl (4.4 k

You are given housing prices for a given market (WestRoxbury.csv Download WestRoxbury.csv)
Create a Jupyter notebook that analyzes this data using PySpark. Load the data into a pyspark data frame and conduct any necessary datatyping/casting. In the notebook, you must answer the following questions using code. 
1) Identify the top 10 most expensive homes.
2) Does remodeling a home indicate higher prices (simply compare the average between recently remodeled homes versus those that were not).
3) Create a linear regression model that predicts home prices using LOT SQFT, YR Built, Gross Area, Living Area, and REMODEL variables. 
4) Using your predictive model, determine the price of a home with the following:
LOT SQRT: 7500
YR Built: 1990
Gross Area: 2100
Living Area: 1900
Remodel: Recent


In [2]:
import findspark
findspark.init()

from pyspark.sql import SparkSession;

spark = SparkSession.builder.config("spark.driver.host","localhost").master("local[4]").appName("ISM6562 Spark Assignment App").getOrCreate();

# Let's get the SparkContext object. It's the entry point to the Spark API. It's created when you create a sparksession
sc = spark.sparkContext  

# note: If you have multiple spark sessions running (like from a previous notebook you've run), 
# this spark session webUI will be on a different port than the default (4040). One way to 
# identify this part is with the following line. If there was only one spark session running, 
# this will be 4040. If it's higher, it means there are still other spark sesssions still running.
spark_session_port = spark.sparkContext.uiWebUrl.split(":")[-1]
print("Spark Session WebUI Port: " + spark_session_port)

Spark Session WebUI Port: 4040


In [3]:
# this will set the log level to ERROR. This will hide the INFO or WARNING messages that are printed out by default. If you want to see them, set this to INFO or WARN.
sc.setLogLevel("ERROR") 

In [4]:
spark

## Loading our data into spark dataframe. 

In [6]:
# Load CSV file
df_spark = spark.read.csv("sample_data/WestRoxbury.csv", header=True, inferSchema=True)
df_spark.show()

+------------+----+---------+--------+-----------+-----------+-------+-----+---------+---------+---------+-------+---------+-------+
|TOTAL VALUE | TAX|LOT SQFT |YR BUILT|GROSS AREA |LIVING AREA|FLOORS |ROOMS|BEDROOMS |FULL BATH|HALF BATH|KITCHEN|FIREPLACE|REMODEL|
+------------+----+---------+--------+-----------+-----------+-------+-----+---------+---------+---------+-------+---------+-------+
|       344.2|4330|     9965|    1880|       2436|       1352|    2.0|    6|        3|        1|        1|      1|        0|   None|
|       412.6|5190|     6590|    1945|       3108|       1976|    2.0|   10|        4|        2|        1|      1|        0| Recent|
|       330.1|4152|     7500|    1890|       2294|       1371|    2.0|    8|        4|        1|        1|      1|        0|   None|
|       498.6|6272|    13773|    1957|       5032|       2608|    1.0|    9|        5|        1|        1|      1|        1|   None|
|       331.5|4170|     5000|    1910|       2370|       1438|    2.0

## Rename Column 

In [7]:
df_renamed = df_spark.withColumnRenamed("TOTAL VALUE ","total_value").withColumnRenamed("TAX","tax").withColumnRenamed("LOT SQFT ","lot_sqft").withColumnRenamed("YR BUILT","yr_built").withColumnRenamed("GROSS AREA ","gross_area").withColumnRenamed("LIVING AREA","living_area").withColumnRenamed("FLOORS ","floors").withColumnRenamed("ROOMS","rooms").withColumnRenamed("BEDROOMS ","bedrooms").withColumnRenamed("FULL BATH","full_bath").withColumnRenamed("HALF BATH","half_bath").withColumnRenamed("KITCHEN","kitchen").withColumnRenamed("FIREPLACE","fireplace").withColumnRenamed("REMODEL","remodel")

df_renamed.show()

+-----------+----+--------+--------+----------+-----------+------+-----+--------+---------+---------+-------+---------+-------+
|total_value| tax|lot_sqft|yr_built|gross_area|living_area|floors|rooms|bedrooms|full_bath|half_bath|kitchen|fireplace|remodel|
+-----------+----+--------+--------+----------+-----------+------+-----+--------+---------+---------+-------+---------+-------+
|      344.2|4330|    9965|    1880|      2436|       1352|   2.0|    6|       3|        1|        1|      1|        0|   None|
|      412.6|5190|    6590|    1945|      3108|       1976|   2.0|   10|       4|        2|        1|      1|        0| Recent|
|      330.1|4152|    7500|    1890|      2294|       1371|   2.0|    8|       4|        1|        1|      1|        0|   None|
|      498.6|6272|   13773|    1957|      5032|       2608|   1.0|    9|       5|        1|        1|      1|        1|   None|
|      331.5|4170|    5000|    1910|      2370|       1438|   2.0|    7|       3|        2|        0|   

# Data Exploration and Transformations 

In [8]:
df_renamed.printSchema()

root
 |-- total_value: double (nullable = true)
 |-- tax: integer (nullable = true)
 |-- lot_sqft: integer (nullable = true)
 |-- yr_built: integer (nullable = true)
 |-- gross_area: integer (nullable = true)
 |-- living_area: integer (nullable = true)
 |-- floors: double (nullable = true)
 |-- rooms: integer (nullable = true)
 |-- bedrooms: integer (nullable = true)
 |-- full_bath: integer (nullable = true)
 |-- half_bath: integer (nullable = true)
 |-- kitchen: integer (nullable = true)
 |-- fireplace: integer (nullable = true)
 |-- remodel: string (nullable = true)



# Finding missing values

In [9]:
from pyspark.sql.functions import col

# Find columns with missing values
columns_with_missing_values = [column for column in df_renamed.columns if df_renamed.filter(col(column).isNull()).count() > 0]

# Print columns with missing values
print("Columns with missing values:")
for column in columns_with_missing_values:
    print(column)


Columns with missing values:


# Imputing missing values

In [None]:
#from pyspark.sql.functions import col

# Group by the column and apply the count() function
#count_df = df_renamed.groupBy("TypeofContact").count()

# Show the resulting counts
#count_df.show()

In [None]:
#df_spark = df_renamed.fillna("Self Enquiry", subset=["TypeofContact"])

Imputing with missing value with median value for numeric values

In [None]:
# #from pyspark.sql.functions import col
# from pyspark.sql.functions import percentile_approx
# from pyspark.sql.functions import when

# # Iterate over columns with missing values
# for column in columns_with_missing_values:
#     # Calculate median of the column
#     median_value = df_spark.select(column).agg(percentile_approx(column, 0.5)).collect()[0][0]
#     if median_value is not None:
#     # Round median_value to nearest integer
#         median_value_rounded = int(round(median_value))
#     else:
#         median_value_rounded = 0
    
#     # Impute missing values with median value
#     df_spark = df_spark.withColumn(column, when(col(column).isNull(), median_value_rounded).otherwise(col(column)))

In [None]:
# # verifying if all missing values were imputed
# columns_with_missing_values = [column for column in df_spark.columns if df_spark.filter(col(column).isNull()).count() > 0]

# # Print columns with missing values
# print("Columns with missing values:")
# for column in columns_with_missing_values:
#     print(column)

In [None]:
# # Save imputed data with original header

# df_spark.write.option("header", True).csv("/content/Big Data Output/processed_travel_withheader.csv")

In [None]:
# df_spark.show(20)

# Storing the data into a persistent table and creating a temp view of the data


In [12]:
# Create a database
spark.sql("CREATE DATABASE IF NOT EXISTS HOUSING")

# Use the database
spark.sql("USE HOUSING")

DataFrame[]

In [11]:
#spark.sql("DROP DATABASE HOUSING")

DataFrame[]

In [13]:
# To drop the Table
#spark.sql("DROP TABLE house_sale")

In [14]:
# Store the data into a persistent table in the Travel db
df_renamed.write.saveAsTable("house_sales")

In [15]:
# Create a temporary view of the data
df_renamed.createOrReplaceTempView("housing_view")

In [16]:
# Verify if the table exists in the created database
check = spark.sql("SHOW TABLES")
if check.filter(check.tableName == "housing_view").count() > 0:
    print("Table exists in the created database.")

Table exists in the created database.


# Now let us find some insights by using the aggregation.

<p style="color:brown;">1) Identify the top 10 most expensive homes. </p>


In [17]:
top_exp_homes = spark.sql("SELECT * FROM housing_view order by total_value desc limit 10").show()

+-----------+-----+--------+--------+----------+-----------+------+-----+--------+---------+---------+-------+---------+-------+
|total_value|  tax|lot_sqft|yr_built|gross_area|living_area|floors|rooms|bedrooms|full_bath|half_bath|kitchen|fireplace|remodel|
+-----------+-----+--------+--------+----------+-----------+------+-----+--------+---------+---------+-------+---------+-------+
|     1217.8|15319|   46411|    1940|      7144|       4037|   2.0|    8|       5|        3|        1|      1|        3|   None|
|     1009.9|12704|   13300|    2010|      7624|       4133|   2.0|    9|       4|        3|        1|      1|        2|   None|
|      996.9|12541|   10050|    2006|      5392|       4375|   2.0|   11|       4|        4|        0|      1|        1|   None|
|      936.0|11774|   20560|    1901|      8154|       4106|   2.0|    9|       4|        3|        1|      1|        2|   None|
|      935.1|11763|   25200|    1954|      6840|       5289|   1.0|   13|       9|        2|     


<p style="color:brown;">2) Does remodeling a home indicate higher prices (simply compare the average between recently remodeled homes versus those that were not).</p>


In [18]:
total_result = spark.sql("""
    select remodel,avg(total_value) from housing_view group by remodel
""").show()

+-------+------------------+
|remodel|  avg(total_value)|
+-------+------------------+
|   None| 380.1891396686604|
|    Old|405.08852839931154|
| Recent| 446.5189508571427|
+-------+------------------+



3) Create a linear regression model that predicts home prices using LOT SQFT, YR Built, Gross Area, Living Area, and REMODEL variables. 

Let's select a the subset of columns we are interested in for our analysis.

In [19]:
df_data =df_renamed.select([
    'lot_sqft',
    'yr_built',
    'gross_area',
    'living_area',
    'remodel',
    'total_value'
    ]
)

In [20]:
df_data.count()

5802

In [21]:
#Remove any rows from the DataFrame that contain missing values
df_data = df_data.dropna()


In [22]:
df_data.count()

5802

In [23]:
df_data.show()

+--------+--------+----------+-----------+-------+-----------+
|lot_sqft|yr_built|gross_area|living_area|remodel|total_value|
+--------+--------+----------+-----------+-------+-----------+
|    9965|    1880|      2436|       1352|   None|      344.2|
|    6590|    1945|      3108|       1976| Recent|      412.6|
|    7500|    1890|      2294|       1371|   None|      330.1|
|   13773|    1957|      5032|       2608|   None|      498.6|
|    5000|    1910|      2370|       1438|   None|      331.5|
|    5142|    1950|      2124|       1060|    Old|      337.4|
|    5000|    1954|      3220|       1916|   None|      359.4|
|   10000|    1950|      2208|       1200|   None|      320.4|
|    6835|    1958|      2582|       1092| Recent|      333.5|
|    5093|    1900|      4818|       2992|   None|      409.4|
|    5000|    1960|      2624|       1485|   None|      313.0|
|    6768|    1958|      2844|       1460|   None|      344.5|
|    5000|    1889|      2196|       1290|   None|     

## Building Logistic Model

#Model Training

We earlier created the incident table with 'inferSchema' set to true. This means that the schema of the table is inferred from the data. The knowledge column was properly inferred as a boolean, however, in the pipeline below with use StringIndexer on this column - and therefore, need to convert this column data type to a string.

In [24]:
df_data.printSchema()

root
 |-- lot_sqft: integer (nullable = true)
 |-- yr_built: integer (nullable = true)
 |-- gross_area: integer (nullable = true)
 |-- living_area: integer (nullable = true)
 |-- remodel: string (nullable = true)
 |-- total_value: double (nullable = true)



CONVERTING BOOLEAN DATA TO NUMERICAL


In [None]:
#from pyspark.sql.types import IntegerType,BooleanType,DateType, StringType

#df_closed_incidents = df_data.withColumn("knowledge",df_data.knowledge.cast(StringType()))

In [None]:
#df_closed_incidents.printSchema()

CONVERTING STRINGS TO INTEGER (IF TARGET VARIABLE OF LOGISTIC REGRESSION
                               )

In [25]:
from pyspark.sql.types import IntegerType,BooleanType,DateType, StringType

df_data=df_data.withColumn('total_value',df_data.total_value.cast('integer'))

In [26]:
df_data.printSchema()

root
 |-- lot_sqft: integer (nullable = true)
 |-- yr_built: integer (nullable = true)
 |-- gross_area: integer (nullable = true)
 |-- living_area: integer (nullable = true)
 |-- remodel: string (nullable = true)
 |-- total_value: integer (nullable = true)



Now that we have our data ready, let's do a train test split (70/30).

In [27]:
train_data,test_data=df_data.randomSplit([0.7,0.3])

CONVERTING STRING TO NUMERICAL

In [28]:
from pyspark.ml.feature import StringIndexer
# Use StringIndexer to convert the categorical columns to hold numerical data
 
remodel_indexer = StringIndexer(inputCol='remodel',outputCol='remodel_index',handleInvalid='keep')


In [29]:
from pyspark.ml.feature import OneHotEncoder
data_encoder = OneHotEncoder(
    inputCols=[
       'remodel_index'
    ], 
    outputCols= [
        'remodel_vec'],
    handleInvalid='keep'
)

In [30]:
from pyspark.ml.feature import VectorAssembler
# Vector assembler is used to create a vector of input features
 
assembler = VectorAssembler(
    inputCols=[
        "remodel_vec",
        "lot_sqft",
        "yr_built",
        "gross_area",
        "living_area"   
    ],
    outputCol="features"
)

In [31]:
from pyspark.ml.classification import LogisticRegression
lr_model = LogisticRegression(labelCol='total_value')

In [32]:
from pyspark.ml import Pipeline

# Pipeline is used to pass the data through indexer and assembler simultaneously. Also, it helps to pre-rocess the test data
# in the same way as that of the train data
# https://spark.apache.org/docs/latest/ml-pipeline.html
 
pipe = Pipeline(stages=[
    remodel_indexer,
    data_encoder,
    assembler,
    lr_model
    ]
)

In [33]:
# run the pipeline
fit_model=pipe.fit(train_data)

# Store the results in a dataframe
results = fit_model.transform(test_data)

In [34]:
results.select(['total_value','prediction']).show()

+-----------+----------+
|total_value|prediction|
+-----------+----------+
|        144|     220.0|
|        182|     220.0|
|        301|     182.0|
|        242|     236.0|
|        262|     226.0|
|        286|     266.0|
|        203|     182.0|
|        268|     182.0|
|        368|     266.0|
|        364|     445.0|
|        315|     310.0|
|        301|     261.0|
|        233|     256.0|
|        287|     204.0|
|        291|     229.0|
|        358|     310.0|
|        284|     261.0|
|        271|     310.0|
|        343|     409.0|
|        284|     310.0|
+-----------+----------+
only showing top 20 rows



## Model Evaluation

### Area under the ROC

In [36]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

AUC_evaluator = BinaryClassificationEvaluator(rawPredictionCol='prediction',labelCol='total_value',metricName='areaUnderROC')

AUC = AUC_evaluator.evaluate(results)

In [37]:
print("The area under the curve is {}".format(AUC))

The area under the curve is 1.0


A roughly 73% area under ROC denotes the model has performed reasonably well in predicting whether an incident has met the sla.

### Area under the PR

In [39]:
PR_evaluator = BinaryClassificationEvaluator(rawPredictionCol='prediction',labelCol='total_value',metricName='areaUnderPR')
PR = PR_evaluator.evaluate(results)

In [None]:
print("The area under the PR curve is {}".format(PR))

Accuracy

In [40]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

ACC_evaluator = MulticlassClassificationEvaluator(  #  Multiclass or Binary, the accuracy is calculated in the same way.
    labelCol="total_value", predictionCol="prediction", metricName="accuracy")

accuracy = ACC_evaluator.evaluate(results)

In [41]:
print("The accuracy of the model is {}".format(accuracy))

The accuracy of the model is 0.00909090909090909


### Confusion Matrix

In [42]:
from sklearn.metrics import confusion_matrix

In [43]:
y_true = results.select("total_value")
y_true = y_true.toPandas()
 
y_pred = results.select("prediction")
y_pred = y_pred.toPandas()
 
cnf_matrix = confusion_matrix(y_true, y_pred)


In [44]:
print("Below is the confusion matrix \n {}".format(cnf_matrix))

Below is the confusion matrix 
 [[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [45]:
tn = cnf_matrix[0][0]
fp = cnf_matrix[0][1]
fn = cnf_matrix[1][0]
tp = cnf_matrix[1][1]

accuracy = (tp+tn)/(tp+tn+fp+fn)
precision = tp/(tp+fp)
recall = tp/(tp+fn)
f1_score = 2*(precision*recall)/(precision+recall)

  accuracy = (tp+tn)/(tp+tn+fp+fn)
  precision = tp/(tp+fp)
  recall = tp/(tp+fn)


In [46]:
print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1_score:.2f}")


Accuracy: nan
Precision: nan
Recall: nan
F1 Score: nan


4) Using your predictive model, determine the price of a home with the following:
LOT SQRT: 7500
YR Built: 1990
Gross Area: 2100
Living Area: 1900
Remodel: Recent


In [47]:
from pyspark.ml.feature import VectorAssembler, StringIndexer, OneHotEncoder

# Create a test dataframe with the given features
test_df = spark.createDataFrame([(7500, 1990, 2100, 1900, 'Recent')], ['lot_sqft', 'yr_built', 'gross_area', 'living_area', 'remodel'])


# Apply the model to the test data 
predictions = fit_model.transform(test_df)
predictions.show()

+--------+--------+----------+-----------+-------+-------------+-------------+--------------------+--------------------+--------------------+----------+
|lot_sqft|yr_built|gross_area|living_area|remodel|remodel_index|  remodel_vec|            features|       rawPrediction|         probability|prediction|
+--------+--------+----------+-----------+-------+-------------+-------------+--------------------+--------------------+--------------------+----------+
|    7500|    1990|      2100|       1900| Recent|          1.0|(4,[1],[1.0])|[0.0,1.0,0.0,0.0,...|[-1.7744629985916...|[2.21737381137757...|     478.0|
+--------+--------+----------+-----------+-------+-------------+-------------+--------------------+--------------------+--------------------+----------+

