In [40]:
# based on several features of individual such as age, physical/family condition and location against their existing medical expense to be used for predicting future medical expenses of individuals that help medical insurance to make decision on charging the premium.

In [41]:
import numpy as np
import pandas as pd
import pyspark

In [42]:
from pyspark import SparkConf, SparkContext, SQLContext
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf # @udf("integer") def myfunc(x,y): return x - y
from pyspark.sql import functions as F # stddev format_number date_format, dayofyear, when
from pyspark.sql.types import StructField, StringType, IntegerType, StructType


In [43]:
print([(x.__name__,x.__version__) for x in [np, pd, pyspark]])

spark = pyspark.sql.SparkSession.builder.appName('insurance').getOrCreate()
sc = spark.sparkContext
sqlContext = SQLContext(sc)
sc.setLogLevel("INFO")

[('numpy', '1.22.3'), ('pandas', '1.4.0'), ('pyspark', '3.2.1')]




In [44]:
from pyspark.ml.feature import (VectorAssembler, VectorIndexer,
                               OneHotEncoder, StringIndexer)


In [3]:
df = spark.read.csv('insurance (1).csv',header=True,inferSchema=True)
print(df.count())
df.show()

NameError: name 'spark' is not defined

In [2]:
df1=spark.sql("select * from df limit 5")
df1.show()

NameError: name 'spark' is not defined

In [46]:
df.printSchema()

root
 |-- age: integer (nullable = true)
 |-- sex: string (nullable = true)
 |-- bmi: double (nullable = true)
 |-- children: integer (nullable = true)
 |-- smoker: string (nullable = true)
 |-- region: string (nullable = true)
 |-- expenses: double (nullable = true)



In [47]:
from pyspark.sql.functions import col,sum
df.select(*(sum(col(c).isNull().cast("int")).alias(c) for c in df.columns)).show()

+---+---+---+--------+------+------+--------+
|age|sex|bmi|children|smoker|region|expenses|
+---+---+---+--------+------+------+--------+
|  0|  0|  0|       0|     0|     0|       0|
+---+---+---+--------+------+------+--------+



In [48]:
print(df.columns)

['age', 'sex', 'bmi', 'children', 'smoker', 'region', 'expenses']


In [49]:
from pyspark.ml.feature import (VectorAssembler, VectorIndexer,
                               OneHotEncoder, StringIndexer)

In [50]:
gender_indexer = StringIndexer(inputCol='sex', outputCol='Sex_index')

gender_encoder = OneHotEncoder(inputCol='Sex_index', outputCol='Sex_vec')

In [51]:
smoker_indexer = StringIndexer(inputCol='smoker', outputCol='smoker_index')

smoker_encoder = OneHotEncoder(inputCol='smoker_index', outputCol='smoker_vec')

In [52]:
region_indexer = StringIndexer(inputCol='region', outputCol='region_index')

region_encoder = OneHotEncoder(inputCol='region_index', outputCol='region_vec')

In [137]:
df.printSchema()

root
 |-- age: integer (nullable = true)
 |-- sex: string (nullable = true)
 |-- bmi: double (nullable = true)
 |-- children: integer (nullable = true)
 |-- smoker: string (nullable = true)
 |-- region: string (nullable = true)
 |-- expenses: double (nullable = true)



In [53]:
df.columns

['age', 'sex', 'bmi', 'children', 'smoker', 'region', 'expenses']

In [54]:
assembler =VectorAssembler(inputCols=["age","Sex_vec","bmi","children","smoker_vec","region_vec"],
                           outputCol="features")


In [55]:
from pyspark.ml.regression import LinearRegression

In [56]:
lr=LinearRegression(featuresCol="features",labelCol="expenses")

In [57]:
from pyspark.ml import Pipeline

In [58]:
pipeline=Pipeline(stages=[gender_indexer,smoker_indexer,region_indexer,
                          gender_encoder,smoker_encoder,region_encoder,
                          assembler,lr])

In [59]:
train, test = df.randomSplit([0.7, 0.3])

In [60]:
lr_model = pipeline.fit(train)

In [108]:
results = lr_model.transform(test)
results['expenses','prediction'].show()

+--------+------------------+
|expenses|        prediction|
+--------+------------------+
|21344.85| 2355.759450123973|
| 2203.74|2425.8105007416543|
| 1622.19|1942.5975089009225|
| 4561.19| 5278.309936281703|
| 2205.98| 2986.218905683112|
|11482.63|  2818.23564162195|
| 2219.45| 4625.908166408246|
| 1631.82| 4359.358755210958|
| 1633.04|  4674.58848299053|
|  2217.6| 5928.363031625766|
|13747.87| 23651.08904428072|
|11884.05|-57.10393701466455|
| 1121.87|-800.3559112622897|
|  1708.0|  873.724941079041|
| 1712.23|1924.4907003442768|
| 1719.44|1988.0313221715605|
| 3481.87| 4440.389674069354|
|33732.69| 27188.66710047367|
|34617.84| 27819.12655603281|
| 1727.54| 4019.511790084347|
+--------+------------------+
only showing top 20 rows



In [62]:
from pyspark.ml.evaluation import RegressionEvaluator

In [63]:

lr_evaluator = RegressionEvaluator(predictionCol="prediction", labelCol="expenses",metricName="r2")

In [64]:
auc =lr_evaluator.evaluate(results)
auc

0.7073391572746766

In [65]:
from pyspark.ml.regression import DecisionTreeRegressor

In [122]:
dt = DecisionTreeRegressor(featuresCol="features", labelCol = 'expenses', maxDepth = 10)
pipeline2 = Pipeline(stages=[gender_indexer,smoker_indexer,region_indexer,
                             gender_encoder,smoker_encoder,region_encoder,
                             assembler, dt])


In [123]:
dt_model = pipeline2.fit(train)

In [124]:
dt_predictions = dt_model.transform(test)
dt_evaluator = RegressionEvaluator(labelCol="expenses", predictionCol="prediction", metricName="r2")
auc1 =dt_evaluator.evaluate(dt_predictions)
auc1

0.7032640419862741

In [125]:
results1 = dt_model.transform(test)
results1['expenses','prediction'].show(5)

+--------+------------------+
|expenses|        prediction|
+--------+------------------+
|21344.85|3281.1500000000005|
| 2203.74|3281.1500000000005|
| 1622.19|         1618.7475|
| 4561.19|           3393.36|
| 2205.98|3281.1500000000005|
+--------+------------------+
only showing top 5 rows



In [None]:
from pyspark.ml.regression import RandomForestRegressor

In [102]:
rf=RandomForestRegressor(featuresCol="features", labelCol = 'expenses', numTrees = 100, maxDepth = 7)

In [103]:
pipeline3 = Pipeline(stages=[gender_indexer,smoker_indexer,region_indexer,
                             gender_encoder,smoker_encoder,region_encoder,
                             assembler, rf])


In [104]:
rf_model = pipeline3.fit(train)

In [105]:
rf_predictions = rf_model.transform(test)
rf_evaluator = RegressionEvaluator(labelCol="expenses", predictionCol="prediction", metricName="r2")
auc2 =rf_evaluator.evaluate(rf_predictions)
auc2

0.82197262374243

In [114]:
results2 = rf_model.transform(test)
results2['expenses','prediction'].show(5)

+--------+------------------+
|expenses|        prediction|
+--------+------------------+
|21344.85| 3826.799419105805|
| 2203.74| 3826.799419105805|
| 1622.19|3655.6983962400964|
| 4561.19|10339.310276641354|
| 2205.98|3999.4246894474477|
+--------+------------------+
only showing top 5 rows



In [115]:
from pyspark.ml.regression import GBTRegressor

In [132]:
gb=GBTRegressor(featuresCol="features", labelCol = 'expenses',maxIter=20)

In [133]:
pipeline4 = Pipeline(stages=[gender_indexer,smoker_indexer,region_indexer,
                             gender_encoder,smoker_encoder,region_encoder,
                             assembler, gb])


In [134]:
gb_model = pipeline4.fit(train)

In [135]:
gb_predictions = gb_model.transform(test)
gb_evaluator = RegressionEvaluator(labelCol="expenses", predictionCol="prediction", metricName="r2")
auc3 =gb_evaluator.evaluate(gb_predictions)
auc3

0.8034173591520071