In [1]:
import findspark
findspark.init()

In [2]:
from pyspark.sql import SparkSession
spark_session = SparkSession.builder.appName("concrete_strength").getOrCreate()

In [3]:
# importing the data
data = spark_session.read.csv("Concrete_Data2.csv" , header=True , inferSchema=True)

In [4]:
#assembling the features
from pyspark.ml.feature import VectorAssembler
va = VectorAssembler(inputCols=data.columns[:8] , outputCol = "features")

In [5]:
data = va.transform(data)
data.select(data.columns[5:]).show(truncate=False)

+------------------+---------------+-----+------------------------------+---------------------------------------------+
|Coarse Aggregate  |Fine Aggregate |Age  |Concrete compressive strength |features                                     |
+------------------+---------------+-----+------------------------------+---------------------------------------------+
|1040.0            |676.0          |28.0 |79.99                         |[540.0,0.0,0.0,162.0,2.5,1040.0,676.0,28.0]  |
|1055.0            |676.0          |28.0 |61.89                         |[540.0,0.0,0.0,162.0,2.5,1055.0,676.0,28.0]  |
|932.0             |594.0          |270.0|40.27                         |[332.5,142.5,0.0,228.0,0.0,932.0,594.0,270.0]|
|932.0             |594.0          |365.0|41.05                         |[332.5,142.5,0.0,228.0,0.0,932.0,594.0,365.0]|
|978.4             |825.5          |360.0|44.3                          |[198.6,132.4,0.0,192.0,0.0,978.4,825.5,360.0]|
|932.0             |670.0          |90.0

In [6]:
test_data , train_data = data.randomSplit([0.7,0.3])

In [7]:
train_data.show()

+------+-------------------+--------+-----+-----------------+------------------+---------------+----+------------------------------+--------------------+
|Cement|Blast Furnace Slag |Fly Ash |Water|Superplasticizer |Coarse Aggregate  |Fine Aggregate |Age |Concrete compressive strength |            features|
+------+-------------------+--------+-----+-----------------+------------------+---------------+----+------------------------------+--------------------+
| 102.0|              153.0|     0.0|192.0|              0.0|             887.0|          942.0|90.0|                         25.46|[102.0,153.0,0.0,...|
| 116.0|              173.0|     0.0|192.0|              0.0|             909.8|          891.9|90.0|                         31.02|[116.0,173.0,0.0,...|
| 132.0|              206.5|   160.9|178.9|              5.5|             866.9|          735.6|28.0|                         33.31|[132.0,206.5,160....|
| 133.0|              200.0|     0.0|192.0|              0.0|             92

In [8]:
# building the lineare regression model
from pyspark.ml.regression import LinearRegression
lr = LinearRegression(featuresCol="features" , labelCol=train_data.columns[8])
lr= lr.fit(train_data)

In [9]:
eva = lr.evaluate(test_data)

In [12]:
res = eva.predictions.select(eva.predictions.columns[8:])
res = res.withColumnRenamed("Concrete compressive strength " , "label")
res.show(truncate=False)

+-----+----------------------------------------------+------------------+
|label|features                                      |prediction        |
+-----+----------------------------------------------+------------------+
|4.57 |[102.0,153.0,0.0,192.0,0.0,887.0,942.0,3.0]   |12.259692975263633|
|7.68 |[102.0,153.0,0.0,192.0,0.0,887.0,942.0,7.0]   |12.771290193025415|
|17.28|[102.0,153.0,0.0,192.0,0.0,887.0,942.0,28.0]  |15.45717558627478 |
|2.33 |[108.3,162.4,0.0,203.5,0.0,938.2,849.0,3.0]   |11.345001523983264|
|7.72 |[108.3,162.4,0.0,203.5,0.0,938.2,849.0,7.0]   |11.856598741745046|
|20.59|[108.3,162.4,0.0,203.5,0.0,938.2,849.0,28.0]  |14.542484134994403|
|29.23|[108.3,162.4,0.0,203.5,0.0,938.2,849.0,90.0]  |22.472241010302035|
|6.28 |[116.0,173.0,0.0,192.0,0.0,909.8,891.9,3.0]   |15.488717991665922|
|10.09|[116.0,173.0,0.0,192.0,0.0,909.8,891.9,7.0]   |16.000315209427704|
|22.35|[116.0,173.0,0.0,192.0,0.0,909.8,891.9,28.0]  |18.68620060267707 |
|3.32 |[122.6,183.9,0.0,203.5,0.0,958.

In [13]:
from pyspark.ml.evaluation import RegressionEvaluator
evaluator = RegressionEvaluator()
print(evaluator.evaluate(res, {evaluator.metricName: "r2"})) 
print(evaluator.evaluate(res, {evaluator.metricName: "mae"})) 
print(evaluator.evaluate(res, {evaluator.metricName: "mae"}))
print(evaluator.evaluate(res, {evaluator.metricName: "rmse"}))

0.5911779551992935
8.305452315462146
8.305452315462146
10.605790545594454
