In [1]:
from pyspark.sql import SparkSession
from pyspark.ml.stat import Correlation
import pyspark.sql.functions as F

In [2]:
spark=SparkSession.builder.appName('Demo').getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
2022-03-16 20:00:20,238 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [4]:
df = spark.read.csv("file:///home/hdoop/python_operation/ML_Operation/headbrain.csv", inferSchema=True, header=True)

                                                                                

In [5]:
df.show()

+------+---------+---------------+-------------------+
|Gender|Age Range|Head Size(cm^3)|Brain Weight(grams)|
+------+---------+---------------+-------------------+
|     1|        1|           4512|               1530|
|     1|        1|           3738|               1297|
|     1|        1|           4261|               1335|
|     1|        1|           3777|               1282|
|     1|        1|           4177|               1590|
|     1|        1|           3585|               1300|
|     1|        1|           3785|               1400|
|     1|        1|           3559|               1255|
|     1|        1|           3613|               1355|
|     1|        1|           3982|               1375|
|     1|        1|           3443|               1340|
|     1|        1|           3993|               1380|
|     1|        1|           3640|               1355|
|     1|        1|           4208|               1522|
|     1|        1|           3832|               1208|
|     1|  

In [6]:
df.printSchema()

root
 |-- Gender: integer (nullable = true)
 |-- Age Range: integer (nullable = true)
 |-- Head Size(cm^3): integer (nullable = true)
 |-- Brain Weight(grams): integer (nullable = true)



In [7]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

In [9]:
df.columns

['Gender', 'Age Range', 'Head Size(cm^3)', 'Brain Weight(grams)']

In [12]:
assembler = VectorAssembler(inputCols=['Gender','Age Range'], outputCol='features')

In [18]:
output = assembler.transform(df)

In [22]:
final_df = output.select('features', 'Gender')

In [23]:
train_data, test_data = final_df.randomSplit([0.7, 0.3])

In [25]:
train_data.describe().show()

+-------+-------------------+
|summary|             Gender|
+-------+-------------------+
|  count|                157|
|   mean| 1.4203821656050954|
| stddev|0.49519988886943983|
|    min|                  1|
|    max|                  2|
+-------+-------------------+



In [26]:
test_data.describe().show()

+-------+------------------+
|summary|            Gender|
+-------+------------------+
|  count|                80|
|   mean|            1.4625|
| stddev|0.5017374874664179|
|    min|                 1|
|    max|                 2|
+-------+------------------+



In [27]:
from pyspark.ml.regression import LinearRegression

In [28]:
lm = LinearRegression(labelCol='Gender')

In [29]:
model = lm.fit(train_data)

2022-03-16 20:20:40,643 WARN util.Instrumentation: [9beb6469] regParam is zero, which might cause numerical instability and overfitting.
2022-03-16 20:20:40,977 WARN netlib.InstanceBuilder$NativeBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
2022-03-16 20:20:40,993 WARN netlib.InstanceBuilder$NativeBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.ForeignLinkerBLAS
2022-03-16 20:20:41,288 WARN netlib.InstanceBuilder$NativeLAPACK: Failed to load implementation from:dev.ludovic.netlib.lapack.JNILAPACK


In [30]:
import pandas as pd

In [32]:
pd.DataFrame({"Coefficients":model.coefficients}, index=['Gender','Age Range'])

Unnamed: 0,Coefficients
Gender,1.0
Age Range,1.774539e-15


In [33]:
res = model.evaluate(test_data)

In [34]:
res.residuals.show()



+--------------------+
|           residuals|
+--------------------+
|1.221245327087672...|
|1.221245327087672...|
|1.221245327087672...|
|1.221245327087672...|
|1.221245327087672...|
|1.221245327087672...|
|1.221245327087672...|
|1.221245327087672...|
|1.221245327087672...|
|1.221245327087672...|
|1.221245327087672...|
|1.221245327087672...|
|1.221245327087672...|
|1.221245327087672...|
|1.221245327087672...|
|1.221245327087672...|
|1.221245327087672...|
|1.221245327087672...|
|-6.66133814775093...|
|-6.66133814775093...|
+--------------------+
only showing top 20 rows



In [35]:
unlabeled_data = test_data.select('features')

In [36]:
predictions = model.transform(unlabeled_data)

In [37]:
predictions.show()

+---------+------------------+
| features|        prediction|
+---------+------------------+
|[1.0,1.0]|0.9999999999999988|
|[1.0,1.0]|0.9999999999999988|
|[1.0,1.0]|0.9999999999999988|
|[1.0,1.0]|0.9999999999999988|
|[1.0,1.0]|0.9999999999999988|
|[1.0,1.0]|0.9999999999999988|
|[1.0,1.0]|0.9999999999999988|
|[1.0,1.0]|0.9999999999999988|
|[1.0,1.0]|0.9999999999999988|
|[1.0,1.0]|0.9999999999999988|
|[1.0,1.0]|0.9999999999999988|
|[1.0,1.0]|0.9999999999999988|
|[1.0,1.0]|0.9999999999999988|
|[1.0,1.0]|0.9999999999999988|
|[1.0,1.0]|0.9999999999999988|
|[1.0,1.0]|0.9999999999999988|
|[1.0,1.0]|0.9999999999999988|
|[1.0,1.0]|0.9999999999999988|
|[1.0,2.0]|1.0000000000000007|
|[1.0,2.0]|1.0000000000000007|
+---------+------------------+
only showing top 20 rows



In [38]:
print("MAE: ", res.meanAbsoluteError)
print("MSE: ", res.meanSquaredError)
print("RMSE: "), res.rootMeanSquaredError
print("R2", res.r2)
print("Adj R2", res.r2)

MAE:  8.382183835919932e-16
MSE:  7.984135177451724e-31
RMSE: 
R2 1.0
Adj R2 1.0
