# Insurance Risk Score prediction


In [15]:
import pandas as pd
from pyspark.ml import Pipeline
from pyspark.sql import SparkSession
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [16]:
spark = SparkSession.builder.appName("Insurance Prediction").getOrCreate()
spark

In [17]:
data = spark.read.csv("./data/insurance.csv", header=True, inferSchema=True)
data.show()

+-------------------+------------------+--------+------+-------------+------------------+-------+----------+
|Monthly Installment|Insurance Category|Diabetes|Smoker|User Category|               Job|College|Risk Score|
+-------------------+------------------+--------+------+-------------+------------------+-------+----------+
|                800|            Health|     Yes|    No|   Individual|          Engineer|    Yes|       Low|
|                950|              Life|      No|   Yes|       Family|           Teacher|    Yes|      High|
|               1200|              Auto|      No|    No|   Individual|            Doctor|    Yes|    Medium|
|                700|            Health|     Yes|   Yes|       Family| Marketing Manager|    Yes|      High|
|                850|              Life|      No|    No|       Family|        Accountant|    Yes|       Low|
|               1100|              Auto|     Yes|    No|   Individual|Software Developer|     No|    Medium|
|                75

In [18]:
# Exploratory Data Analysis (EDA) using pandas
df = pd.read_csv("./data/insurance.csv")
df.head()

Unnamed: 0,Monthly Installment,Insurance Category,Diabetes,Smoker,User Category,Job,College,Risk Score
0,800,Health,Yes,No,Individual,Engineer,Yes,Low
1,950,Life,No,Yes,Family,Teacher,Yes,High
2,1200,Auto,No,No,Individual,Doctor,Yes,Medium
3,700,Health,Yes,Yes,Family,Marketing Manager,Yes,High
4,850,Life,No,No,Family,Accountant,Yes,Low


In [19]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 69 entries, 0 to 68
Data columns (total 8 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   Monthly Installment  69 non-null     int64 
 1   Insurance Category   69 non-null     object
 2   Diabetes             69 non-null     object
 3   Smoker               69 non-null     object
 4   User Category        69 non-null     object
 5   Job                  69 non-null     object
 6   College              69 non-null     object
 7   Risk Score           69 non-null     object
dtypes: int64(1), object(7)
memory usage: 4.4+ KB


In [20]:
df.describe()

Unnamed: 0,Monthly Installment
count,69.0
mean,929.710145
std,153.011159
min,680.0
25%,800.0
50%,920.0
75%,1050.0
max,1220.0


In [21]:
df.isnull().sum()

Monthly Installment    0
Insurance Category     0
Diabetes               0
Smoker                 0
User Category          0
Job                    0
College                0
Risk Score             0
dtype: int64

In [22]:
data.printSchema()

root
 |-- Monthly Installment: integer (nullable = true)
 |-- Insurance Category: string (nullable = true)
 |-- Diabetes: string (nullable = true)
 |-- Smoker: string (nullable = true)
 |-- User Category: string (nullable = true)
 |-- Job: string (nullable = true)
 |-- College: string (nullable = true)
 |-- Risk Score: string (nullable = true)



In [23]:
# Perform StringIndexing for categorical variables
indexers = [StringIndexer(inputCol=column, outputCol=column+"_index").fit(data)
            for column in ["Insurance Category", "Diabetes", "Smoker", "User Category", "Job", "College", "Risk Score"]]
pipeline = Pipeline(stages=indexers)
data = pipeline.fit(data).transform(data)

In [24]:
# Create feature vector
assembler = VectorAssembler(inputCols=["Monthly Installment", "Insurance Category_index", "Diabetes_index",
                            "Smoker_index", "User Category_index", "Job_index", "College_index"], outputCol="features")
data = assembler.transform(data)

In [25]:
# Display the updated schema to identify the correct label column
data.printSchema()

root
 |-- Monthly Installment: integer (nullable = true)
 |-- Insurance Category: string (nullable = true)
 |-- Diabetes: string (nullable = true)
 |-- Smoker: string (nullable = true)
 |-- User Category: string (nullable = true)
 |-- Job: string (nullable = true)
 |-- College: string (nullable = true)
 |-- Risk Score: string (nullable = true)
 |-- Insurance Category_index: double (nullable = false)
 |-- Diabetes_index: double (nullable = false)
 |-- Smoker_index: double (nullable = false)
 |-- User Category_index: double (nullable = false)
 |-- Job_index: double (nullable = false)
 |-- College_index: double (nullable = false)
 |-- Risk Score_index: double (nullable = false)
 |-- features: vector (nullable = true)



In [26]:
# Split data into train and test sets
(train_data, test_data) = data.randomSplit([0.7, 0.3], seed=42)
print(f"Training Dataset Count: {train_data.count()}")
print(f"Test Dataset Count: {test_data.count()}")

Training Dataset Count: 40
Test Dataset Count: 29


In [27]:
data.columns

['Monthly Installment',
 'Insurance Category',
 'Diabetes',
 'Smoker',
 'User Category',
 'Job',
 'College',
 'Risk Score',
 'Insurance Category_index',
 'Diabetes_index',
 'Smoker_index',
 'User Category_index',
 'Job_index',
 'College_index',
 'Risk Score_index',
 'features']

In [28]:
# Train Random Forest Classifier
rf = RandomForestClassifier(
    labelCol="Risk Score_index", featuresCol="features", numTrees=10)
model = rf.fit(train_data)

In [29]:
# Make predictions on test data
predictions = model.transform(test_data)

In [31]:
# Evaluate model
evaluator = MulticlassClassificationEvaluator(
    labelCol="Risk Score_index", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print(f"\nModel accuracy: {accuracy}")


Model accuracy: 0.9655172413793104
