<a href="https://colab.research.google.com/github/Maxzor/training_ML_adameo/blob/main/linear_regression_demo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install pyspark
from pyspark.sql import SparkSession
from pyspark.ml.stat import Correlation
import pyspark.sql.functions as F

spark=SparkSession.builder.getOrCreate()

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [3]:
df=spark.read.csv('insurance.csv', inferSchema=True, header=True)
df.show()

+---+------+------+--------+------+---------+-----------+
|age|gender|   bmi|children|smoker|   region|    charges|
+---+------+------+--------+------+---------+-----------+
| 19|female|  27.9|       0|   yes|southwest|  16884.924|
| 18|  male| 33.77|       1|    no|southeast|  1725.5523|
| 28|  male|  33.0|       3|    no|southeast|   4449.462|
| 33|  male|22.705|       0|    no|northwest|21984.47061|
| 32|  male| 28.88|       0|    no|northwest|  3866.8552|
| 31|female| 25.74|       0|    no|southeast|  3756.6216|
| 46|female| 33.44|       1|    no|southeast|  8240.5896|
| 37|female| 27.74|       3|    no|northwest|  7281.5056|
| 37|  male| 29.83|       2|    no|northeast|  6406.4107|
| 60|female| 25.84|       0|    no|northwest|28923.13692|
| 25|  male| 26.22|       0|    no|northeast|  2721.3208|
| 62|female| 26.29|       0|   yes|southeast| 27808.7251|
| 23|  male|  34.4|       0|    no|southwest|   1826.843|
| 56|female| 39.82|       0|    no|southeast| 11090.7178|
| 27|  male| 4

In [8]:
from pyspark.ml.feature import StringIndexer
new_col=StringIndexer(inputCol='gender', outputCol='gender_num')
new_df=new_col.fit(df).transform(df)

new_col=StringIndexer(inputCol='smoker', outputCol='smoker_num')
new_df=new_col.fit(new_df).transform(new_df)

new_col=StringIndexer(inputCol='region', outputCol='region_num')
new_df=new_col.fit(new_df).transform(new_df)

new_df.show()

+---+------+------+--------+------+---------+-----------+----------+----------+----------+
|age|gender|   bmi|children|smoker|   region|    charges|gender_num|smoker_num|region_num|
+---+------+------+--------+------+---------+-----------+----------+----------+----------+
| 19|female|  27.9|       0|   yes|southwest|  16884.924|       1.0|       1.0|       2.0|
| 18|  male| 33.77|       1|    no|southeast|  1725.5523|       0.0|       0.0|       0.0|
| 28|  male|  33.0|       3|    no|southeast|   4449.462|       0.0|       0.0|       0.0|
| 33|  male|22.705|       0|    no|northwest|21984.47061|       0.0|       0.0|       1.0|
| 32|  male| 28.88|       0|    no|northwest|  3866.8552|       0.0|       0.0|       1.0|
| 31|female| 25.74|       0|    no|southeast|  3756.6216|       1.0|       0.0|       0.0|
| 46|female| 33.44|       1|    no|southeast|  8240.5896|       1.0|       0.0|       0.0|
| 37|female| 27.74|       3|    no|northwest|  7281.5056|       1.0|       0.0|       1.0|

In [10]:
from pyspark.ml.linalg import Vector
from pyspark.ml.feature import VectorAssembler

assembler=VectorAssembler(inputCols=['age','bmi','children','gender_num','smoker_num','region_num'],
                          outputCol='features')
output=assembler.transform(new_df)
output.show(truncate=False)

+---+------+------+--------+------+---------+-----------+----------+----------+----------+-----------------------------+
|age|gender|bmi   |children|smoker|region   |charges    |gender_num|smoker_num|region_num|features                     |
+---+------+------+--------+------+---------+-----------+----------+----------+----------+-----------------------------+
|19 |female|27.9  |0       |yes   |southwest|16884.924  |1.0       |1.0       |2.0       |[19.0,27.9,0.0,1.0,1.0,2.0]  |
|18 |male  |33.77 |1       |no    |southeast|1725.5523  |0.0       |0.0       |0.0       |[18.0,33.77,1.0,0.0,0.0,0.0] |
|28 |male  |33.0  |3       |no    |southeast|4449.462   |0.0       |0.0       |0.0       |[28.0,33.0,3.0,0.0,0.0,0.0]  |
|33 |male  |22.705|0       |no    |northwest|21984.47061|0.0       |0.0       |1.0       |[33.0,22.705,0.0,0.0,0.0,1.0]|
|32 |male  |28.88 |0       |no    |northwest|3866.8552  |0.0       |0.0       |1.0       |[32.0,28.88,0.0,0.0,0.0,1.0] |
|31 |female|25.74 |0       |no  

In [11]:
final_data=output.select('features','charges')
final_data.show(5)

+--------------------+-----------+
|            features|    charges|
+--------------------+-----------+
|[19.0,27.9,0.0,1....|  16884.924|
|[18.0,33.77,1.0,0...|  1725.5523|
|[28.0,33.0,3.0,0....|   4449.462|
|[33.0,22.705,0.0,...|21984.47061|
|[32.0,28.88,0.0,0...|  3866.8552|
+--------------------+-----------+
only showing top 5 rows



In [12]:
train_data, test_data = final_data.randomSplit([0.6,0.4])
from pyspark.ml.regression import LinearRegression

lr=LinearRegression(featuresCol='features',labelCol='charges')
trained_model=lr.fit(train_data)

score=trained_model.evaluate(train_data)
score.r2

0.7688237511934666

In [13]:
unlabeled_data=test_data.select('features')
pred=trained_model.transform(unlabeled_data)
pred.show()

+--------------------+-------------------+
|            features|         prediction|
+--------------------+-------------------+
|(6,[0,1],[18.0,37...|  4201.642154034673|
|(6,[0,1],[18.0,43...|  6207.397926164233|
|(6,[0,1],[18.0,53...|  9756.042753778065|
|(6,[0,1],[21.0,23...| 25.292233285179464|
|(6,[0,1],[21.0,31...| 2763.9203067697654|
|(6,[0,1],[21.0,36...|  4808.248305286432|
|(6,[0,1],[27.0,32...|  4864.293816176743|
|(6,[0,1],[33.0,30...|  5537.494949260761|
|(6,[0,1],[36.0,29...| 6105.5288741254135|
|(6,[0,1],[37.0,36...|   8634.92191656478|
|(6,[0,1],[42.0,24...|  5930.141026693082|
|(6,[0,1],[48.0,40...| 12813.470608101312|
|(6,[0,1],[49.0,36...| 11909.935502088169|
|(6,[0,1],[52.0,34...|  11706.52489921068|
|(6,[0,1],[58.0,36...| 13922.615087778975|
|(6,[0,1],[62.0,38...| 15901.447489856893|
|(6,[0,1],[62.0,39...| 16287.169753727963|
|(6,[0,1],[63.0,41...| 17080.812608747525|
|[18.0,15.96,0.0,0...|-1833.6790158438089|
|[18.0,20.79,0.0,1...|-1547.3921918216165|
+----------