In [1]:
# specail thanks to Prof. I-Cheng Yeh Department of Information Management Chung-Hua Universit
# dataset link : https://archive.ics.uci.edu/ml/datasets/Concrete+Compressive+Strength

In [2]:
import findspark
findspark.init()

In [3]:
from pyspark.sql import SparkSession
spark_session = SparkSession.builder.appName("concrete_strength").getOrCreate()

In [4]:
# importing the data
data = spark_session.read.csv("concrete_Data.csv" , header=True , inferSchema=True)

In [5]:
#assembling the features
from pyspark.ml.feature import VectorAssembler
va = VectorAssembler(inputCols=data.columns[:8] , outputCol = "features")

In [6]:
data = va.transform(data)
data.select(data.columns[5:]).show(truncate=False)

+------------------+---------------+-----+------------------------------+---------------------------------------------+
|Coarse Aggregate  |Fine Aggregate |Age  |Concrete compressive strength |features                                     |
+------------------+---------------+-----+------------------------------+---------------------------------------------+
|1040.0            |676.0          |28.0 |79.99                         |[540.0,0.0,0.0,162.0,2.5,1040.0,676.0,28.0]  |
|1055.0            |676.0          |28.0 |61.89                         |[540.0,0.0,0.0,162.0,2.5,1055.0,676.0,28.0]  |
|932.0             |594.0          |270.0|40.27                         |[332.5,142.5,0.0,228.0,0.0,932.0,594.0,270.0]|
|932.0             |594.0          |365.0|41.05                         |[332.5,142.5,0.0,228.0,0.0,932.0,594.0,365.0]|
|978.4             |825.5          |360.0|44.3                          |[198.6,132.4,0.0,192.0,0.0,978.4,825.5,360.0]|
|932.0             |670.0          |90.0

In [7]:
test_data , train_data = data.randomSplit([0.3,0.7])
print(test_data.count())
print(train_data.count())

316
714


In [8]:
# building the lineare regression model
from pyspark.ml.regression import LinearRegression
lr = LinearRegression(featuresCol="features" , labelCol=train_data.columns[8])
lr= lr.fit(train_data)

In [9]:
eva = lr.evaluate(test_data)

In [10]:
res = eva.predictions.select(eva.predictions.columns[8:])
res = res.withColumnRenamed("Concrete compressive strength " , "label")
res.show(truncate=False)

+-----+-----------------------------------------------+------------------+
|label|features                                       |prediction        |
+-----+-----------------------------------------------+------------------+
|7.68 |[102.0,153.0,0.0,192.0,0.0,887.0,942.0,7.0]    |11.691430295637815|
|2.33 |[108.3,162.4,0.0,203.5,0.0,938.2,849.0,3.0]    |10.45709581188882 |
|7.72 |[108.3,162.4,0.0,203.5,0.0,938.2,849.0,7.0]    |10.924844710159896|
|22.35|[116.0,173.0,0.0,192.0,0.0,909.8,891.9,28.0]   |17.3689239250253  |
|31.02|[116.0,173.0,0.0,192.0,0.0,909.8,891.9,90.0]   |24.619031848226996|
|13.66|[133.0,200.0,0.0,192.0,0.0,927.4,839.2,7.0]    |19.069361872114214|
|36.59|[133.0,200.0,0.0,192.0,0.0,927.4,839.2,90.0]   |28.775151511239073|
|14.59|[139.6,209.4,0.0,192.0,0.0,1047.0,806.9,7.0]   |23.67640423568075 |
|35.23|[140.0,164.0,128.0,237.0,6.0,869.0,656.0,28.0] |19.488239600790394|
|4.83 |[141.3,212.0,0.0,203.5,0.0,971.8,748.5,3.0]    |18.24278769470787 |
|44.61|[142.0,167.0,130.0

In [11]:
from pyspark.ml.evaluation import RegressionEvaluator
evaluator = RegressionEvaluator()
print(evaluator.evaluate(res, {evaluator.metricName: "r2"})) 
print(evaluator.evaluate(res, {evaluator.metricName: "mae"})) 
print(evaluator.evaluate(res, {evaluator.metricName: "mae"}))
print(evaluator.evaluate(res, {evaluator.metricName: "rmse"}))

0.5557299440297251
8.939286961500184
8.939286961500184
11.25741015631604
