In [1]:
# Special thanks to the Marine Resources Division.
# Link to the dataset: https://archive.ics.uci.edu/ml/datasets/Abalone

In [2]:
# Set up PySpark: findspark must be initialised before `pyspark` is imported
import findspark
findspark.init()
from pyspark.sql import SparkSession
# Reuse an existing session if one is available, otherwise create it
spark_session = (
    SparkSession.builder
    .appName("age_abalone")
    .getOrCreate()
)
# Leave the session as the last expression so the notebook displays it
spark_session

In [3]:
# Load the raw CSV and give the auto-generated columns (_c0, _c1, ...) readable names
column_names = [
    "sex",
    "Length",
    "Diameter",
    "Height",
    "Whole_weight",
    "Shucked_weight",
    # NOTE(review): this name ends with a trailing space; the feature-assembly
    # cell refers to the very same literal, so both must be changed together.
    "Viscera_weight ",
    "Shell_weight",
    "Rings",
]
data = spark_session.read.csv("abalone.csv", inferSchema=True)
# Rename positionally: the column at index i arrives from the reader as "_c<i>"
for position, column_name in enumerate(column_names):
    data = data.withColumnRenamed("_c" + str(position), column_name)
# Row count followed by a preview of the first rows
print(data.count())
data.show()

4177
+---+------+--------+------+------------+--------------+---------------+------------+-----+
|sex|Length|Diameter|Height|Whole_weight|Shucked_weight|Viscera_weight |Shell_weight|Rings|
+---+------+--------+------+------------+--------------+---------------+------------+-----+
|  M| 0.455|   0.365| 0.095|       0.514|        0.2245|          0.101|        0.15|   15|
|  M|  0.35|   0.265|  0.09|      0.2255|        0.0995|         0.0485|        0.07|    7|
|  F|  0.53|    0.42| 0.135|       0.677|        0.2565|         0.1415|        0.21|    9|
|  M|  0.44|   0.365| 0.125|       0.516|        0.2155|          0.114|       0.155|   10|
|  I|  0.33|   0.255|  0.08|       0.205|        0.0895|         0.0395|       0.055|    7|
|  I| 0.425|     0.3| 0.095|      0.3515|         0.141|         0.0775|        0.12|    8|
|  F|  0.53|   0.415|  0.15|      0.7775|         0.237|         0.1415|        0.33|   20|
|  F| 0.545|   0.425| 0.125|       0.768|         0.294|         0.1495|   

In [4]:
# Encode the categorical "sex" column as numeric label indices in "sex_indexed"
# (the indices are emitted as doubles: 0.0, 1.0, 2.0)
# NOTE(review): these indices are later fed to a linear regression as a single
# numeric feature, which treats the categories as ordered — consider one-hot encoding.
from pyspark.ml.feature import StringIndexer
str_indexer = StringIndexer(inputCol="sex", outputCol="sex_indexed")
# Learn the label -> index mapping first, then apply it to the same frame
sex_index_model = str_indexer.fit(data)
indexed_data = sex_index_model.transform(data)
indexed_data.show()

+---+------+--------+------+------------+--------------+---------------+------------+-----+-----------+
|sex|Length|Diameter|Height|Whole_weight|Shucked_weight|Viscera_weight |Shell_weight|Rings|sex_indexed|
+---+------+--------+------+------------+--------------+---------------+------------+-----+-----------+
|  M| 0.455|   0.365| 0.095|       0.514|        0.2245|          0.101|        0.15|   15|        0.0|
|  M|  0.35|   0.265|  0.09|      0.2255|        0.0995|         0.0485|        0.07|    7|        0.0|
|  F|  0.53|    0.42| 0.135|       0.677|        0.2565|         0.1415|        0.21|    9|        2.0|
|  M|  0.44|   0.365| 0.125|       0.516|        0.2155|          0.114|       0.155|   10|        0.0|
|  I|  0.33|   0.255|  0.08|       0.205|        0.0895|         0.0395|       0.055|    7|        1.0|
|  I| 0.425|     0.3| 0.095|      0.3515|         0.141|         0.0775|        0.12|    8|        1.0|
|  F|  0.53|   0.415|  0.15|      0.7775|         0.237|        

In [5]:
# Pack the predictor columns into one vector column ("features") and keep only
# that vector together with the "Rings" label
from pyspark.ml.feature import VectorAssembler
feature_columns = [
    "Length",
    "Diameter",
    "Height",
    "Whole_weight",
    "Shucked_weight",
    "Viscera_weight ",  # trailing space is intentional here: it matches the name given when the data was loaded
    "Shell_weight",
    "sex_indexed",
]
va = VectorAssembler(inputCols=feature_columns, outputCol="features")
finalized_data = va.transform(indexed_data).select(["features", "Rings"])



In [6]:
# Preview the assembled rows without truncating the feature vectors
# (n=20 is the default row count, spelled out here for clarity)
finalized_data.show(n=20, truncate=False)

+-------------------------------------------------+-----+
|features                                         |Rings|
+-------------------------------------------------+-----+
|[0.455,0.365,0.095,0.514,0.2245,0.101,0.15,0.0]  |15   |
|[0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,0.0]  |7    |
|[0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,2.0]   |9    |
|[0.44,0.365,0.125,0.516,0.2155,0.114,0.155,0.0]  |10   |
|[0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,1.0]  |7    |
|[0.425,0.3,0.095,0.3515,0.141,0.0775,0.12,1.0]   |8    |
|[0.53,0.415,0.15,0.7775,0.237,0.1415,0.33,2.0]   |20   |
|[0.545,0.425,0.125,0.768,0.294,0.1495,0.26,2.0]  |16   |
|[0.475,0.37,0.125,0.5095,0.2165,0.1125,0.165,0.0]|9    |
|[0.55,0.44,0.15,0.8945,0.3145,0.151,0.32,2.0]    |19   |
|[0.525,0.38,0.14,0.6065,0.194,0.1475,0.21,2.0]   |14   |
|[0.43,0.35,0.11,0.406,0.1675,0.081,0.135,0.0]    |10   |
|[0.49,0.38,0.135,0.5415,0.2175,0.095,0.19,0.0]   |11   |
|[0.535,0.405,0.145,0.6845,0.2725,0.171,0.205,2.0]|10   |
|[0.47,0.355,0

In [7]:
# Split the data into test (~30%) and train (~70%) sets.
# randomSplit assigns rows at random, so the resulting sizes are approximate.
# A fixed seed keeps the partition — and therefore every metric computed from
# it — reproducible across kernel restarts (for the same input and Spark setup).
SPLIT_SEED = 42
test_data, train_data = finalized_data.randomSplit([0.3, 0.7], seed=SPLIT_SEED)
# Sanity check: the two counts should add up to the full dataset size
print(test_data.count())
print(train_data.count())

1255
2922


In [10]:
# Fit a linear regression that predicts "Rings" from the default "features"
# column, then score it on the held-out test split
from pyspark.ml.regression import LinearRegression
linear_estimator = LinearRegression(labelCol="Rings")
# `regressor` ends up bound to the fitted model, not the unfitted estimator
regressor = linear_estimator.fit(train_data)
test_summary = regressor.evaluate(test_data)
# DataFrame of test rows with the model's prediction column attached
predictions = test_summary.predictions

In [17]:
# Evaluate the model's predictions on the test split
from pyspark.ml.evaluation import RegressionEvaluator
evaluator = RegressionEvaluator(labelCol="Rings")
# Printed one per line, in this order:
#   r2   - the higher the better
#   mse  - the lower the better
#   mae  - the lower the better
#   rmse - the lower the better
for metric in ("r2", "mse", "mae", "rmse"):
    print(evaluator.evaluate(predictions, {evaluator.metricName: metric}))

0.47867822002945615
5.468289381433705
1.6323583236769363
2.3384373802677945
