In [1]:
# specail thanks to Prof. I-Cheng Yeh Department of Information Management Chung-Hua University
# dataset link : https://archive.ics.uci.edu/ml/datasets/Concrete+Compressive+Strength

In [2]:
import findspark
findspark.init()

In [3]:
from pyspark.sql import SparkSession
spark_session = SparkSession.builder.appName("concrete_strength").getOrCreate()

In [4]:
# importing the data
data = spark_session.read.csv("concrete_Data.csv" , header=True , inferSchema=True)

In [5]:
#assembling the features
from pyspark.ml.feature import VectorAssembler
va = VectorAssembler(inputCols=data.columns[:8] , outputCol = "features")

In [6]:
data = va.transform(data)
data.select(data.columns[5:]).show(truncate=False)

+------------------+---------------+-----+------------------------------+---------------------------------------------+
|Coarse Aggregate  |Fine Aggregate |Age  |Concrete compressive strength |features                                     |
+------------------+---------------+-----+------------------------------+---------------------------------------------+
|1040.0            |676.0          |28.0 |79.99                         |[540.0,0.0,0.0,162.0,2.5,1040.0,676.0,28.0]  |
|1055.0            |676.0          |28.0 |61.89                         |[540.0,0.0,0.0,162.0,2.5,1055.0,676.0,28.0]  |
|932.0             |594.0          |270.0|40.27                         |[332.5,142.5,0.0,228.0,0.0,932.0,594.0,270.0]|
|932.0             |594.0          |365.0|41.05                         |[332.5,142.5,0.0,228.0,0.0,932.0,594.0,365.0]|
|978.4             |825.5          |360.0|44.3                          |[198.6,132.4,0.0,192.0,0.0,978.4,825.5,360.0]|
|932.0             |670.0          |90.0

In [7]:
test_data , train_data = data.randomSplit([0.3,0.7])
print(test_data.count())
print(train_data.count())

305
725


In [8]:
# building the lineare regression model
from pyspark.ml.regression import LinearRegression
lr = LinearRegression(featuresCol="features" , labelCol=train_data.columns[8])
lr= lr.fit(train_data)

In [9]:
eva = lr.evaluate(test_data)

In [10]:
res = eva.predictions.select(eva.predictions.columns[8:])
res = res.withColumnRenamed("Concrete compressive strength " , "label")
res.show(truncate=False)

+-----+-----------------------------------------------+------------------+
|label|features                                       |prediction        |
+-----+-----------------------------------------------+------------------+
|20.59|[108.3,162.4,0.0,203.5,0.0,938.2,849.0,28.0]   |14.450748097290868|
|10.09|[116.0,173.0,0.0,192.0,0.0,909.8,891.9,7.0]    |16.10998077865885 |
|22.35|[116.0,173.0,0.0,192.0,0.0,909.8,891.9,28.0]   |18.521377104117192|
|31.03|[133.0,210.0,0.0,196.0,3.0,949.0,795.0,28.0]   |23.377705726421077|
|29.07|[136.0,162.0,126.0,172.0,10.0,923.0,764.0,28.0]|34.19226530159101 |
|29.07|[136.4,161.6,125.8,171.6,10.4,922.6,764.4,28.0]|34.350068635376566|
|36.44|[139.9,132.6,103.3,200.3,7.4,916.0,753.4,28.0] |24.552648750092438|
|44.61|[141.9,166.6,129.7,173.5,10.9,882.6,785.3,28.0]|35.61294266210955 |
|44.61|[142.0,167.0,130.0,174.0,11.0,883.0,785.0,28.0]|35.645924763564096|
|15.42|[143.6,0.0,174.9,158.4,17.9,942.7,844.5,28.0]  |28.169742370186576|
|23.74|[146.0,173.0,0.0,1

In [11]:
from pyspark.ml.evaluation import RegressionEvaluator
evaluator = RegressionEvaluator()
print(evaluator.evaluate(res, {evaluator.metricName: "r2"})) 
print(evaluator.evaluate(res, {evaluator.metricName: "mae"})) 
print(evaluator.evaluate(res, {evaluator.metricName: "mae"}))
print(evaluator.evaluate(res, {evaluator.metricName: "rmse"}))

0.6242151666782827
8.496247217239556
8.496247217239556
10.594480547836248
