In [None]:
import os
# Find the latest version of spark 3.x  from http://www.apache.org/dist/spark/ and enter as the spark version
# For example:
# spark_version = 'spark-3.4.3'
spark_version = 'spark-3.4.3'
os.environ['SPARK_VERSION']=spark_version

# Install Spark and Java
!apt-get update
!apt-get install openjdk-11-jdk-headless -qq > /dev/null
!wget -q http://www.apache.org/dist/spark/$SPARK_VERSION/$SPARK_VERSION-bin-hadoop3.tgz
!tar xf $SPARK_VERSION-bin-hadoop3.tgz
!pip install -q findspark

# Set Environment Variables
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"
os.environ["SPARK_HOME"] = f"/content/{spark_version}-bin-hadoop3"

# Start a SparkSession
import findspark
findspark.init()

# Import Required Libraries

In [None]:
# General purpose libraries
import pandas as pd
import numpy as np
from pathlib import Path
import matplotlib.pyplot as plt

# Machine learning and preprocessing libraries
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.datasets import make_classification
from sklearn.svm import SVC
from sklearn import svm

# Spark libraries
from pyspark.sql import SparkSession
from pyspark.sql.types import IntegerType, FloatType, StringType
from pyspark.sql.functions import monotonically_increasing_id
from pyspark.sql.functions import col, when
from pyspark.ml.feature import StringIndexer
import pyspark.pandas as pspd

## Read in the data to process

In [None]:
# Obtain (or reuse) the Spark session for this notebook
spark = (
    SparkSession.builder
    .appName("SparkSQL")
    .getOrCreate()
)

# Load the home data CSV; the first row holds the column names
file_path = "original_extracted_df.csv"
home_df = (
    spark.read
    .option("sep", ",")
    .option("header", True)
    .csv(file_path)
)

# Preview the rows, then report how many were loaded
home_df.show()
print(home_df.count())

## Process the Data

The target (y) is the Price, and the features (X) are all the other columns.

In [None]:
# Columns judged non-beneficial for the analysis; all are removed in one call
non_beneficial_columns = [
    "City", "Street", "Latitude", "Longitude", "MarketEstimate",
    "Zipcode", "PPSq", "LotArea", "LotUnit", "RentEstimate",
]
home_narrow_df = home_df.drop(*non_beneficial_columns)
home_narrow_df.show()

In [None]:
# Turn placeholder strings into real nulls in every column, then drop
# every row that contains at least one null
null_replacements = ["", "null", "None", "NULL", "nan"]
home_narrow_df = home_narrow_df.select(
    [
        when(col(name).isin(null_replacements), None).otherwise(col(name)).alias(name)
        for name in home_narrow_df.columns
    ]
)
home_cleaned_df = home_narrow_df.dropna(how="any")

# Report the remaining row count and preview the cleaned DataFrame
print(home_cleaned_df.count())
home_cleaned_df.show()

In [None]:
# Count how many rows share each distinct Area value
home_cleaned_df.groupby("Area").count().show()

In [None]:
# List the distinct values found in the price column (used to spot outliers)
distinct_prices = home_cleaned_df.select('price').distinct()
distinct_prices.show()

In [None]:
# List the distinct values found in the bedroom column (used to spot outliers)
distinct_bedrooms = home_cleaned_df.select('bedroom').distinct()
distinct_bedrooms.show()

In [None]:
# Remove outliers: keep only homes priced at 1000 or more with at most 10 bedrooms.
# NOTE(review): the CSV was read without a schema, so these columns are still strings
# and the comparisons rely on Spark coercing them to numbers — confirm this is intended.
home_cleaned_df = home_cleaned_df.filter(
    (home_cleaned_df['price'] >= 1000) & (home_cleaned_df['bedroom'] <= 10)
)

home_cleaned_df.show()


In [None]:
# Print the schema to check the column types before they are cast
home_cleaned_df.printSchema()

In [None]:
# Cast each modelling column to its target type: State stays a string,
# Bedroom/Area/Price become integers, Bathroom/ConvertedLot become floats.
# Only the columns listed in this mapping are kept.
col_type_map = {
    "State": StringType(),
    "Bedroom": IntegerType(),
    "Bathroom": FloatType(),
    "Area": IntegerType(),
    "ConvertedLot": FloatType(),
    "Price": IntegerType(),
}

home_converted_df = home_cleaned_df.select(
    [home_cleaned_df[name].cast(dtype) for name, dtype in col_type_map.items()]
)
home_converted_df.printSchema()

In [None]:
# Convert the PySpark DataFrame to a pandas DataFrame using toPandas(),
# then show its (rows, columns) shape and the first rows
home_pd_df = home_converted_df.toPandas()
print(home_pd_df.shape)
home_pd_df.head()

## Encode the categorical features using get_dummies

In [None]:
# One-hot encode the string column (State) into dummy columns;
# the numeric columns pass through unchanged
home_encoded_df = pd.get_dummies(home_pd_df)
home_encoded_df.head()

In [None]:
# Separate the target (y) from the features (X):
# y is the Price column, X is every other column
y = home_encoded_df["Price"]

X = home_encoded_df.drop(columns="Price")

In [None]:
# Display the first five rows of the features
X[:5]

In [None]:
# Display the first five values of the target
y[:5]


# Separate the data into Training and Testing subsets

In [None]:
# Split the dataset into training and testing sets
# (20% held out for testing; fixed random_state for a reproducible split)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Create a StandardScaler instance
X_scaler = StandardScaler()

In [None]:
# Fit the scaler on the training features only
X_scaler.fit(X_train)

In [None]:
# Scale the training and testing features with the scaler fitted on the training data
# NOTE(review): the Random Forest, Decision Tree and Linear Regression cells below
# fit on the unscaled X_train, so these scaled arrays are not used by them.
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# **Random Forest Regression Model**
Random Forest Regression is an ensemble learning technique based on Decision Tree Regression. It combines the predictions of multiple individual decision trees to improve the overall predictive accuracy and reduce overfitting. Random Forest Regression is a powerful and widely used regression technique suitable for a wide range of regression problems, including prediction, forecasting, and modeling complex systems.





In [None]:
# Create the Random Forest regression model
# (100 trees; fixed random_state for reproducible results)
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)

In [None]:
# Fit the model on the training data. The target is passed as a 1-D NumPy array;
# Series.ravel() is deprecated in recent pandas, so to_numpy() is used instead.
rf_model.fit(X_train, y_train.to_numpy())

In [None]:
# Make predictions on the testing data and show them next to the actual prices
# (the resulting DataFrame keeps the row index of y_test)
predictions = rf_model.predict(X_test)
results = pd.DataFrame({"Prediction": predictions, "Actual": y_test})
results.head(10)

In [None]:
# Evaluate the Random Forest regression model on the test set
y_pred = rf_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mse)
# Standard deviation of the residuals (actual minus predicted)
std_dev = np.std(y_test - y_pred)

In [None]:
# Report the Random Forest metrics computed in the previous cell;
# "Standard Deviation" is the standard deviation of the residuals
print(f"Mean Squared Error: {mse}")
print(f"Root Mean Squared Error: {rmse}")
print(f"R^2 Score: {r2}")
print(f"Standard Deviation: {std_dev}")

This model shows moderate performance: the R^2 score of approximately 0.65 means it explains about 65% of the variance in the target variable, which suggests moderate predictive power.

In [None]:
# Plot predicted vs. actual prices for the Random Forest model
plt.figure(figsize=(10, 6))
# y_test (pandas Series) and y_pred (1-D array) are already one-dimensional, so they
# are passed directly; Series.ravel() is deprecated in recent pandas.
plt.scatter(y_test, y_pred, color='blue')

# Dashed y = x reference line spanning the range of the actual values
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'k--', lw=2)
plt.xlabel('Actual')
plt.ylabel('Predicted')
plt.title('Actual vs Predicted Home Prices Random Forest')
plt.savefig("random_forest.png") # Save the image
plt.show()

In [None]:
# Display the top feature importances (at most 10) of the fitted Random Forest
importances = rf_model.feature_importances_
features = X.columns  # feature names, in the same order as the importances

# Never request more bars than there are features
top_n = min(10, len(importances))
indices = np.argsort(importances)[::-1][:top_n]  # most important first

plt.figure(figsize=(10, 6))
plt.title(f"Top {top_n} Feature Importances")
plt.bar(range(top_n), importances[indices], align="center")
plt.xticks(range(top_n), [features[i] for i in indices], rotation=90)
plt.xlim([-1, top_n])  # Limit x-axis to the plotted bars
plt.tight_layout()  # Adjust layout for better fit
plt.savefig("top_10_feature_importances.png")
plt.show()


## **Decision Tree Regression Model**
Decision Tree Regression is a supervised machine learning algorithm used for regression tasks. It works by recursively partitioning the feature space into regions and fitting a simple model (usually a constant value) to each region.

In [None]:
# Create the Decision Tree regression model (fixed random_state for reproducible results);
# it is fitted in the next cell
decision_model = DecisionTreeRegressor(random_state=42)

In [None]:
# Fit the Decision Tree model on the training data
decision_model.fit(X_train, y_train)

In [None]:
# Make predictions on the testing data and show them next to the actual prices
# (the index is reset so the rows are numbered from 0)
predictions = decision_model.predict(X_test)
results = pd.DataFrame({"Prediction": predictions, "Actual": y_test}).reset_index(drop=True)
results.head(10)

In [None]:
# Evaluate the Decision Tree regression model on the test set
y_pred = decision_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mse)
# Standard deviation of the residuals (actual minus predicted)
std_dev = np.std(y_test - y_pred)

In [None]:
# Report the Decision Tree metrics computed in the previous cell;
# "Standard Deviation" is the standard deviation of the residuals
print(f"Mean Squared Error: {mse}")
print(f"Root Mean Squared Error: {rmse}")
print(f"R^2 Score: {r2}")
print(f"Standard Deviation: {std_dev}")

A negative R² score (approximately -0.50) means the model performs worse than a horizontal line at the mean of the target. This is not uncommon in some scenarios, but it generally indicates a poor fit.

In [None]:
# Plot predicted vs. actual prices for the Decision Tree model
plt.figure(figsize=(10, 6))

plt.scatter(y_test, y_pred, color='blue')
# Dashed y = x reference line spanning the range of the actual values
# (the same range convention as the Random Forest plot)
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'k--', lw=2)

plt.xlabel('Actual')
plt.ylabel('Predicted')
plt.title('Actual vs Predicted Home Prices Decision Tree')
plt.tight_layout()  # Adjust layout for better fit
plt.savefig("actual_vs_predicted_prices.png") # Save the image
plt.show()

# **Linear Regression Model**

Linear Regression is a fundamental supervised machine learning algorithm used for predictive analysis. It models the relationship between a dependent variable (target) and one or more independent variables (features) by fitting a linear equation to observed data

In [None]:
# Linear Regression train/test split
# NOTE(review): this re-splits the data with random_state=1 and the default test size,
# unlike the earlier split (test_size=0.2, random_state=42), so the linear model is
# evaluated on a different test set than the tree-based models.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [None]:
# Create an instance of the LinearRegression model (default settings)
linear_model = LinearRegression()

In [None]:
# Fit the linear model on the training data
linear_model.fit(X_train, y_train)

In [None]:
# Make predictions on the testing data and show them next to the actual prices
predictions = linear_model.predict(X_test)
# Keep the predictions under y_pred as well; the plotting cell below uses that name
y_pred = predictions
results = pd.DataFrame({"Prediction": predictions, "Actual": y_test})
results.head(10)

In [None]:
# Calculate metrics
# LinearRegression.score returns R^2, so these two values match r2_train / r2_test below
train_score = linear_model.score(X_train, y_train)
test_score = linear_model.score(X_test, y_test)

# Generate predictions for training data
training_predictions = linear_model.predict(X_train)  # Predict on the training data

r2_train = r2_score(y_train, training_predictions)

# Generate predictions for testing data
testing_predictions = linear_model.predict(X_test)   # Predict on the testing data

r2_test = r2_score(y_test, testing_predictions)
mse = mean_squared_error(y_test, testing_predictions)
# RMSE as sqrt(MSE), as in the other model sections; the `squared=False` argument of
# mean_squared_error is deprecated in newer scikit-learn releases.
rmse = np.sqrt(mse)
# NOTE: this is the standard deviation of the actual test prices, not of the residuals
std = y_test.std()

In [None]:
# Print the Linear Regression metrics computed in the previous cell
# (train_score/test_score come from linear_model.score; the standard deviation is y_test.std())
print(f"The train_score is {train_score}.")
print(f"The test_score is {test_score}.")
print(f"The r2_train is {r2_train}.")
print(f"The r2_test is {r2_test}.")
print(f"The mean squared error is {mse}.")
print(f"The root mean squared error is {rmse}.")
print(f"The standard deviation is {std}.")

A high MSE value indicates that the model's predictions are, on average, far from the actual values. The R^2 is 0.41 on the training set and 0.30 on the test set, so the model performed poorly.

In [None]:
# Plot predicted vs. actual prices for the Linear Regression model
plt.figure(figsize=(10, 6))
plt.scatter(y_test, y_pred, color='blue')  # y_pred holds the linear model's test-set predictions
# Dashed y = x reference line spanning the full range of the target y
plt.plot([y.min(), y.max()], [y.min(), y.max()], 'k--', lw=2)
plt.xlabel('Actual')
plt.ylabel('Predicted')
plt.title('Actual vs Predicted Home Prices Linear Regression')
plt.savefig("linear_regression.png") # Save the image
plt.show()

# Support Vector Machine (SVM)
A Support Vector Machine's primary objective is to find the optimal hyperplane that separates data points belonging to different classes in a high-dimensional space. It can be used for both classification and regression tasks.

In [None]:
# Generate a synthetic two-class dataset (1000 samples, 20 features)
# NOTE(review): this overwrites X and y, so from here on they no longer hold
# the home price data used by the regression models above.
X, y = make_classification(n_samples=1000, n_features=20, n_classes=2, random_state=42)

In [None]:
# Split the synthetic data into training and testing sets
# (20% held out for testing; fixed random_state for a reproducible split)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Scale the data: fit the scaler on the training set only, then apply the same
# transformation to the test set so both use the training statistics. This also
# replaces the X_test_scaled left over from the home price data.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Initialize and fit the SVM classifier on the scaled training data
svm_model = SVC(kernel='linear', random_state=42)
svm_model.fit(X_train_scaled, y_train)


In [None]:
# Make predictions on the testing set. The model was fitted on scaled features,
# so the test features go through the same fitted scaler before predicting.
y_pred = svm_model.predict(scaler.transform(X_test))
results = pd.DataFrame({"Prediction": y_pred, "Actual": y_test})
results.head(10)

In [None]:
# Evaluate the model on the test set, scaled with the same fitted scaler used for training
y_pred = svm_model.predict(scaler.transform(X_test))
# SVC is a classifier, so the fraction of correctly predicted labels is reported as well
accuracy = np.mean(y_test == y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mse)
# Standard deviation of the prediction errors (actual minus predicted labels)
std_dev = np.std(y_test - y_pred)

print(f"Accuracy: {accuracy}")
print(f"Mean Squared Error: {mse}")
print(f"Root Mean Squared Error: {rmse}")
print(f"R^2 Score: {r2}")
print(f"Standard Deviation: {std_dev}")

A lower MSE indicates better model performance, and a lower RMSE likewise indicates better model performance. An R^2 score of 0.46 suggests that the model explains approximately 46% of the variance in the target variable. The standard deviation reported here measures the dispersion of the prediction errors (actual minus predicted values); it provides context for interpreting the MSE and RMSE values.

In [None]:
# Plot predicted vs. actual class labels for the SVM classifier
plt.figure(figsize=(10, 6))

plt.scatter(y_test, y_pred, color='blue')
# Dashed y = x reference line spanning the range of the predicted labels
plt.plot([y_pred.min(), y_pred.max()], [y_pred.min(), y_pred.max()], 'k--', lw=2)

plt.xlabel('Actual')
plt.ylabel('Predicted')
plt.title('Actual vs Predicted SVM')
plt.tight_layout()  # Adjust layout for better fit
plt.savefig("SVM.png") # Save the image
plt.show()
