In [1]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import StringIndexer, VectorAssembler, MinMaxScaler
from pyspark.ml.classification import LogisticRegression, RandomForestClassifier
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.sql.functions import col, when

# Initialize Spark Session
spark = SparkSession.builder.appName("HousingPriceClassification").getOrCreate()

# Load data
data = spark.read.csv('housing_data.csv', inferSchema=True, header=True)

# Preprocess data
# Convert Neighborhood to numeric
indexer = StringIndexer(inputCol="Neighborhood", outputCol="NeighborhoodIndex")
data = indexer.fit(data).transform(data)

# Create binary target variable (1 if price above median, else 0)
median_price = data.approxQuantile("Price", [0.5], 0)[0]
data = data.withColumn("label", when(col("Price") > median_price, 1).otherwise(0))

# Assemble features
assembler = VectorAssembler(inputCols=["SquareFeet", "Bedrooms", "Bathrooms", "NeighborhoodIndex", "YearBuilt"], outputCol="features")
scaler = MinMaxScaler(inputCol="features", outputCol="scaledFeatures")

# Split data
train_data, test_data = data.randomSplit([0.7, 0.3])

# Model 1: Logistic Regression
lr = LogisticRegression(featuresCol="scaledFeatures", labelCol="label")

# Model 2: Random Forest Classifier
rf = RandomForestClassifier(featuresCol="scaledFeatures", labelCol="label")

# Pipeline for Logistic Regression
pipeline_lr = Pipeline(stages=[assembler, scaler, lr])
model_lr = pipeline_lr.fit(train_data)
predictions_lr = model_lr.transform(test_data)

# Pipeline for Random Forest
pipeline_rf = Pipeline(stages=[assembler, scaler, rf])
model_rf = pipeline_rf.fit(train_data)
predictions_rf = model_rf.transform(test_data)

# Evaluate models
evaluator = BinaryClassificationEvaluator(labelCol="label")
accuracy_lr = evaluator.evaluate(predictions_lr)
accuracy_rf = evaluator.evaluate(predictions_rf)

print(f"Logistic Regression Accuracy: {accuracy_lr}")
print(f"Random Forest Classifier Accuracy: {accuracy_rf}")


23/12/12 00:03:41 WARN Utils: Your hostname, Hadis-MacBook-Air.local resolves to a loopback address: 127.0.0.1; using 10.83.92.176 instead (on interface en0)
23/12/12 00:03:41 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/12/12 00:03:41 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
23/12/12 00:03:42 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
23/12/12 00:03:49 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS


Logistic Regression Accuracy: 0.8829173025712131
Random Forest Classifier Accuracy: 0.8704215489887038
