In [None]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import col, sum
import pyspark.sql.functions as F
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler, StandardScaler
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.regression import LinearRegression, GBTRegressor

In [None]:
# Get (or create) the Spark session for this notebook, using the "local" master.
spark = (
    SparkSession.builder
    .master("local")
    .appName("Sprint 18")
    .getOrCreate()
)

In [None]:
# Load the housing CSV: comma-separated, first row treated as the header,
# column types inferred from the file contents.
data = (
    spark.read
    .format("csv")
    .options(sep=",", inferSchema=True, header="true")
    .load('/datasets/housing.csv')
)

In [None]:
# Print the schema (column names and types) of the loaded DataFrame.
data.printSchema()

In [None]:
# Count the null values in every column and show the totals as one row.
# Original author's observation: only total_bedrooms contains missing values.
data.select(
    [F.sum(F.col(name).isNull().cast("int")).alias(name) for name in data.columns]
).show()

In [None]:
# Median (0.5 quantile) of total_bedrooms; relativeError=0 asks for the exact
# quantile rather than an approximation.
# NOTE(review): this is computed on the full dataset, i.e. before the
# train/test split further down — confirm that is acceptable for evaluation.
total_bedrooms_median = data.approxQuantile(
    col='total_bedrooms',
    probabilities=[0.5],
    relativeError=0,
)[0]

In [None]:
# Impute missing total_bedrooms values with the previously computed median.
data = data.na.fill(value=total_bedrooms_median, subset=['total_bedrooms'])

In [None]:
# Column roles used by the feature pipeline: categorical inputs,
# numerical inputs, and the regression target.
categorical_cols = [
    "ocean_proximity",
]
numerical_cols = [
    "longitude",
    "latitude",
    "housing_median_age",
    "total_rooms",
    "total_bedrooms",
    "population",
    "households",
    "median_income",
]
target = "median_house_value"

In [None]:
# Split the rows into train and test parts with 0.8 / 0.2 weights and a fixed seed.
data_train, data_test = data.randomSplit(weights=[0.8, 0.2], seed=12345)

In [None]:
# Index every categorical column into a new "<column>_idx" column.
# handleInvalid="skip" makes the indexer drop rows it cannot map.
indexer = StringIndexer(
    inputCols=categorical_cols,
    outputCols=[f"{name}_idx" for name in categorical_cols],
    handleInvalid="skip",
)

# Fit on the training split only, then apply the fitted model to both splits.
indexer = indexer.fit(data_train)
data_train, data_test = indexer.transform(data_train), indexer.transform(data_test)

# Names of all columns that start with a categorical column name
# (the original column plus anything derived from it).
# NOTE(review): `cols` does not appear to be read anywhere later in the notebook.
cols = [
    name
    for name in data_train.columns
    for prefix in categorical_cols
    if name.startswith(prefix)
]

In [None]:
# One-hot encode each "<column>_idx" column into a "<column>_ohe" column.
encoder = OneHotEncoder(
    inputCols=[f"{name}_idx" for name in categorical_cols],
    outputCols=[f"{name}_ohe" for name in categorical_cols],
)

# Fit on the training split only, then encode both splits with the fitted model.
encoder = encoder.fit(data_train)
data_train, data_test = encoder.transform(data_train), encoder.transform(data_test)

# Refresh the list of columns whose names start with a categorical column name.
# NOTE(review): `cols` does not appear to be read anywhere later in the notebook.
cols = [
    name
    for name in data_train.columns
    for prefix in categorical_cols
    if name.startswith(prefix)
]

In [None]:
# Combine the one-hot encoded columns into a single "categorical_features" vector
# on both splits.
categorical_assembler = VectorAssembler(
    inputCols=[f"{name}_ohe" for name in categorical_cols],
    outputCol="categorical_features",
)
data_train, data_test = [
    categorical_assembler.transform(split) for split in (data_train, data_test)
]

In [None]:
# Combine the numerical columns into a single "numerical_features" vector
# on both splits.
numerical_assembler = VectorAssembler(
    inputCols=numerical_cols,
    outputCol="numerical_features",
)
data_train, data_test = [
    numerical_assembler.transform(split) for split in (data_train, data_test)
]

In [None]:
# Scale the numerical feature vector into "numerical_features_scaled".
standardScaler = StandardScaler(
    inputCol='numerical_features',
    outputCol="numerical_features_scaled",
)

# The scaler is fitted on the training split only and then applied to both splits.
standardScaler = standardScaler.fit(data_train)
data_train, data_test = [
    standardScaler.transform(split) for split in (data_train, data_test)
]

In [None]:
# Two alternative feature sets:
#   1 - categorical vector plus scaled numerical vector,
#   2 - scaled numerical vector only.
all_features_1 = ['categorical_features', 'numerical_features_scaled']
all_features_2 = ['numerical_features_scaled']

# Both assemblers write to the same output column name, "features".
final_assembler_1 = VectorAssembler(outputCol="features", inputCols=all_features_1)
final_assembler_2 = VectorAssembler(outputCol="features", inputCols=all_features_2)

# Train/test frames for the variant that includes the categorical features.
train_data_with_categories, test_data_with_categories = [
    final_assembler_1.transform(split) for split in (data_train, data_test)
]

# Train/test frames for the variant with numerical features only.
train_data_without_categories, test_data_without_categories = [
    final_assembler_2.transform(split) for split in (data_train, data_test)
]

In [None]:
# Linear regression estimator for the variant with categorical features.
lrc = LinearRegression(
    featuresCol='features',
    labelCol=target,
    maxIter=20,
    regParam=0.3,
    elasticNetParam=0.8,
)

In [None]:
# Fit the regression on the training data that includes categorical features.
lrcModel = lrc.fit(dataset=train_data_with_categories)

In [None]:
# Evaluate the fitted model on the matching test data; the result's
# `predictions` attribute is used for the metrics below.
predc = lrcModel.evaluate(dataset=test_data_with_categories)

In [None]:
# Evaluator that compares the "prediction" column with the target column;
# its configured metric is RMSE.
# NOTE(review): the name `eval` shadows Python's built-in eval().
eval = RegressionEvaluator(metricName="rmse", predictionCol="prediction", labelCol=target)

In [None]:
# Compute RMSE (the evaluator's configured metric), then MAE and R2 by
# overriding metricName for the individual call.
rmse, mae, r2 = (
    eval.evaluate(predc.predictions),
    eval.evaluate(predc.predictions, {eval.metricName: "mae"}),
    eval.evaluate(predc.predictions, {eval.metricName: "r2"}),
)

In [None]:
# Report the three metrics, one per line, rounded to two decimals.
print(
    f"RMSE = {rmse:.2f}",
    f"MAE = {mae:.2f}",
    f"R2 = {r2:.2f}",
    sep="\n",
)

In [None]:
# Linear regression estimator for the variant without categorical features.
lr = LinearRegression(
    featuresCol='features',
    labelCol=target,
    maxIter=20,
    regParam=0.3,
    elasticNetParam=0.8,
)

In [None]:
# Fit the regression on the training data with numerical features only.
lrModel = lr.fit(dataset=train_data_without_categories)

In [None]:
# Evaluate the fitted model on the matching test data; the result's
# `predictions` attribute is used for the metrics below.
pred = lrModel.evaluate(dataset=test_data_without_categories)

In [None]:
# Evaluator that compares the "prediction" column with the target column;
# its configured metric is RMSE.
# NOTE(review): the name `eval` shadows Python's built-in eval().
eval = RegressionEvaluator(metricName="rmse", predictionCol="prediction", labelCol=target)

In [None]:
# Compute RMSE (the evaluator's configured metric), then MAE and R2 by
# overriding metricName for the individual call.
rmse, mae, r2 = (
    eval.evaluate(pred.predictions),
    eval.evaluate(pred.predictions, {eval.metricName: "mae"}),
    eval.evaluate(pred.predictions, {eval.metricName: "r2"}),
)

In [None]:
# Report the three metrics, one per line, rounded to two decimals.
print(
    f"RMSE = {rmse:.2f}",
    f"MAE = {mae:.2f}",
    f"R2 = {r2:.2f}",
    sep="\n",
)

In [None]:
# Stop the Spark session created at the top of the notebook.
spark.stop()