In [None]:
import os
import sys
import numpy as np
import findspark
# Locate the Spark installation and add its Python bindings to sys.path so that
# the `pyspark` imports in the following cells resolve. `findspark.find()` only
# returns the Spark home path and configures nothing.
findspark.init()

In [None]:
from pyspark import SparkContext

# Start a SparkContext in local mode with four worker threads.
sc = SparkContext("local[4]")

In [None]:
# Import SparkSession
from pyspark.sql import SparkSession

# Build (or fetch) the SparkSession. getOrCreate() attaches to the SparkContext
# that is already running, so the master URL is kept identical to the one that
# context was started with ("local[4]") instead of the conflicting "local".
spark = (
    SparkSession.builder
    .master("local[4]")
    .appName("Linear Regression Model")
    .config("spark.executor.memory", "1gb")
    .getOrCreate()
)

In [None]:
# Smoke test: distribute TOTAL random 2-element vectors, each component drawn
# from [-1, 1), then report the count and summary statistics.
TOTAL = 1000
random_points = [2.0 * np.random.random(2) - 1.0 for _ in range(TOTAL)]
dots = sc.parallelize(random_points).cache()
print("Number of random points:", dots.count())

stats = dots.stats()
for caption, value in (('Mean: ', stats.mean()), ('stdev: ', stats.stdev())):
    print(caption, value)

In [None]:
# Further smoke tests: a key/value RDD, a key/list RDD and a plain range RDD.
rdd1 = sc.parallelize(
    [('a', 7), ('a', 2), ('b', 2)]
)
rdd2 = sc.parallelize(
    [
        ("a", ["x", "y", "z"]),
        ("b", ["p", "q", "r"]),
    ]
)
rdd3 = sc.parallelize(range(10))

# Expand every (key, list) pair into one (key, item) pair per list element.
rdd2.flatMapValues(lambda values: values).collect()

In [None]:
# Read the housing CSV - method 1: spark.read.csv with a header row, silently
# dropping malformed records.
# NOTE(review): the columns used later (longitude, latitude, median_house_value, ...)
# do not look like the Boston data set - confirm which data set this file is.
housing_csv_path = "D:/HP Pavilion D drive/Mit/Udemy/Aurélien Géron_Hands-on ML Scikit-Learn-DL TensorFlow - dwnld from GitHub/datasets/housing/housing.csv"
housing = spark.read.csv(
    housing_csv_path,
    header = True,
    mode = "DROPMALFORMED",
)

In [None]:
# Read the housing CSV - method 2: generic reader with an explicit format and
# header option. This replaces the DataFrame produced by method 1.
# NOTE(review): the columns used later (longitude, latitude, median_house_value, ...)
# do not look like the Boston data set - confirm which data set this file is.
housing_csv_path = "D:/HP Pavilion D drive/Mit/Udemy/Aurélien Géron_Hands-on ML Scikit-Learn-DL TensorFlow - dwnld from GitHub/datasets/housing/housing.csv"
csv_reader = spark.read.format("CSV").option("header", "true")
housing = csv_reader.load(housing_csv_path)
type(housing)

In [None]:
# Display the first three rows of the DataFrame
housing.show(n = 3)

In [None]:
# List the column names of the DataFrame
housing.schema.names

In [None]:
# Print the column names and data types as a tree. The call parentheses are
# required: without them the cell only shows the bound-method object.
housing.printSchema()

In [None]:
# Change numeric string variables to float
from pyspark.sql.types import *

# Cast each of the listed columns to FloatType, one withColumn call per column.
for numeric_col in (
    "longitude",
    "latitude",
    "housing_median_age",
    "total_rooms",
    "total_bedrooms",
    "population",
    "households",
    "median_income",
    "median_house_value",
):
    housing = housing.withColumn(numeric_col, housing[numeric_col].cast(FloatType()))

In [None]:
# Alternatively,
from pyspark.sql.types import *

# Helper that casts a set of DataFrame columns to a single data type
def convertColumn(df, names, newType):
    """Return `df` with every column listed in `names` cast to `newType`.

    Columns are replaced one at a time through `withColumn`, in the order
    given; columns not listed are left as they are. With an empty `names`
    the input object itself is returned.
    """
    converted = df
    for column_name in names:
        casted = converted[column_name].cast(newType)
        converted = converted.withColumn(column_name, casted)
    return converted

# Names of the numeric columns to convert. Only these are cast: passing
# housing.columns would cast every column, and casting a non-numeric string
# column to FloatType replaces its values with nulls.
columns = ['households', 'housing_median_age', 'latitude', 'longitude', 'median_house_value', 'median_income', 'population', 'total_bedrooms', 'total_rooms']

# Convert the listed df columns to FloatType()
housing = convertColumn(housing, columns, FloatType())

In [None]:
# Preview the first ten rows of two of the columns
preview = housing.select('population', 'total_bedrooms')
preview.show(10)

In [None]:
# Count rows per housing_median_age and show the ten highest ages
age_counts = housing.groupBy("housing_median_age").count()
age_counts.orderBy("housing_median_age", ascending = False).show(10)

In [None]:
# Summary statistics for the DataFrame's columns
summary_stats = housing.describe()
summary_stats.show()

In [None]:
# Rescale median_house_value so it is expressed in units of 100,000
rescaled_value = housing["median_house_value"] / 100000
housing = housing.withColumn("median_house_value", rescaled_value)

In [None]:
# Derive two per-household ratio columns
households_col = housing["households"]
rooms_ratio = housing["total_rooms"] / households_col
population_ratio = housing["population"] / households_col
housing = (
    housing
    .withColumn("roomsPerHousehold", rooms_ratio)
    .withColumn("populationPerHousehold", population_ratio)
)

# Look at the first row to check the new columns
housing.first()

In [None]:
# Re-order and select columns. The target, median_house_value, goes first because
# the following step treats column 0 as the label and the remaining columns as
# features; without it, total_bedrooms would silently become the label.
# Rows with a null in any selected column are dropped so that missing values do
# not end up in the feature vectors or in the scaler statistics.
housing = housing.select(
    "median_house_value",
    "total_bedrooms",
    "population",
    "households",
    "median_income",
    "roomsPerHousehold",
    "populationPerHousehold",
).dropna()

In [None]:
# Prepare the data for scaling: reshape each row into a (label, features) pair
# Import DenseVector
from pyspark.ml.linalg import DenseVector

# First column becomes the label; the remaining columns form a dense feature vector
input_data = housing.rdd.map(
    lambda row: (row[0], DenseVector(row[1:]))
)

# Rebuild housing as a two-column DataFrame from those pairs
labeled_schema = ["label", "features"]
housing = spark.createDataFrame(input_data, labeled_schema)

In [None]:
# Import StandardScaler
from pyspark.ml.feature import StandardScaler

# Configure a scaler that reads "features" and writes "features_scaled"
standardScaler = StandardScaler(
    inputCol = "features",
    outputCol = "features_scaled",
)

# Fit the scaler on housing to obtain the fitted model
scaler = standardScaler.fit(housing)

# Apply the fitted model, adding the scaled feature column
scaled_housing = scaler.transform(housing)

# Look at the first two rows of the result
scaled_housing.head(2)

In [None]:
# Randomly split the scaled data into train and test sets using weights
# 0.8 / 0.2 and a fixed seed
split_weights = [.8, .2]
train_data, test_data = scaled_housing.randomSplit(split_weights, seed = 1234)

# Print the first row of each subset
for subset in (train_data, test_data):
    print(subset.head())

In [None]:
# Shut Spark down through the session: this stops the underlying SparkContext
# and also clears the registered session, so re-running the notebook does not
# pick up a SparkSession whose context is already stopped.
spark.stop()