Dataset of Insurance

START OF DATA TRANSFORM

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score


In [13]:
# Load the data
data = pd.read_csv('insurance_premium_dataset.csv')

# Display the first 5 rows of the data
print(data.head())

    Age  Gender  Annual Income Marital Status  Number of Dependents  \
0  56.0    Male        99990.0        Married                   1.0   
1  46.0    Male         2867.0         Single                   1.0   
2  32.0  Female        30154.0       Divorced                   3.0   
3  60.0  Female        48371.0       Divorced                   0.0   
4  25.0  Female        54174.0       Divorced                   0.0   

  Education Level     Occupation  Health Score  Location    Policy Type  \
0        Master's            NaN     31.074627     Urban  Comprehensive   
1      Bachelor's            NaN     50.271335     Urban  Comprehensive   
2      Bachelor's            NaN     14.714909  Suburban  Comprehensive   
3             PhD  Self-Employed     25.346926     Rural  Comprehensive   
4     High School  Self-Employed      6.659499     Urban  Comprehensive   

   Previous Claims  Vehicle Age  Credit Score  Insurance Duration  \
0              NaN           13         320.0        

In [12]:
# Display the data info
print(data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 278860 entries, 0 to 278859
Data columns (total 20 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   Age                   274175 non-null  float64
 1   Gender                278860 non-null  object 
 2   Annual Income         264905 non-null  float64
 3   Marital Status        273841 non-null  object 
 4   Number of Dependents  250974 non-null  float64
 5   Education Level       278860 non-null  object 
 6   Occupation            197572 non-null  object 
 7   Health Score          268263 non-null  float64
 8   Location              278860 non-null  object 
 9   Policy Type           278860 non-null  object 
 10  Previous Claims       197572 non-null  float64
 11  Vehicle Age           278860 non-null  int64  
 12  Credit Score          250974 non-null  float64
 13  Insurance Duration    278860 non-null  int64  
 14  Premium Amount        277019 non-null  float64
 15  

In [None]:
# Display unique values of marital_status
print(data['Marital Status'].unique())

In [None]:
# Display Education level unique values
print(data['Education Level'].unique())

In [None]:
# Display the unique values of the occupation
print(data['Occupation'].unique())

In [None]:
# Display the unique values of the Health Score
print(data['Health Score'].unique())

In [None]:
# Display the unique values of the Policy Type
print(data['Policy Type'].unique())

In [14]:
# Display the unique values of the Location
print(data['Location'].unique())

['Urban' 'Suburban' 'Rural']


In [15]:
# Display the unique values of the Previous Claims
print(data['Previous Claims'].unique())

[nan  2.  1.  0.  3.  4.  5.  6.  7.  9.  8.]


In [18]:
# Display the unique values of the Premium Amount
print(data['Customer Feedback'].unique())

['Poor' 'Good' 'Average' nan]


In [20]:
# Display the unique values of the Smoking Status
print(data['Smoking Status'].unique())

['Yes' 'No']


In [21]:
# Display the unique values of the Exercise Frequency
print(data['Exercise Frequency'].unique())

['Daily' 'Monthly' 'Rarely' 'Weekly']


In [22]:
# Display the unique values of the Property Type
print(data['Property Type'].unique())

['Condo' 'House' 'Apartment']


Age to Int
Convert gender to binary
Convert marital status to 0, 1, 2
Convert number of dependents to int
Convert Education level to 0, 1, 2, 3
Convert Occupation to 0, 1, 2
Convert Policy Type to 0, 1, 2
Convert Location to 0, 1, 2
Convert Previous Claims to int
Credit Score to int
Premium Amount to int
Customer Feedback to int
Smoking Status to binary
Exercise Frequency to int
Property Type to int

In [28]:
# Check for missing values
print(data.isnull().sum())

Age                      4685
Gender                      0
Annual Income           13955
Marital Status           5019
Number of Dependents    27886
Education Level             0
Occupation              81288
Health Score            10597
Location                    0
Policy Type                 0
Previous Claims         81288
Vehicle Age                 0
Credit Score            27886
Insurance Duration          0
Premium Amount           1841
Policy Start Date           0
Customer Feedback       18349
Smoking Status              0
Exercise Frequency          0
Property Type               0
dtype: int64


In [32]:
new_data = data.dropna()

print(new_data.head())

     Age  Gender  Annual Income Marital Status  Number of Dependents  \
3   60.0  Female        48371.0       Divorced                   0.0   
5   38.0    Male        31769.0        Married                   1.0   
6   56.0  Female        26209.0         Single                   1.0   
8   40.0    Male        29471.0         Single                   4.0   
11  41.0  Female        36427.0         Single                   2.0   

   Education Level     Occupation  Health Score  Location    Policy Type  \
3              PhD  Self-Employed     25.346926     Rural  Comprehensive   
5       Bachelor's       Employed     14.736611     Rural  Comprehensive   
6         Master's       Employed     33.238300     Urban        Premium   
8      High School     Unemployed     31.565248     Rural          Basic   
11     High School  Self-Employed     13.264415  Suburban        Premium   

    Previous Claims  Vehicle Age  Credit Score  Insurance Duration  \
3               1.0           11        

In [34]:
print(new_data.describe())

                Age  Annual Income  Number of Dependents  Health Score  \
count  92985.000000   92985.000000          92985.000000  92985.000000   
mean      41.014121   42098.392095              2.000635     28.584177   
std       13.552250   35484.380179              1.412383     16.011769   
min       18.000000       0.000000              0.000000      0.116550   
25%       29.000000   13614.000000              1.000000     16.082971   
50%       41.000000   32138.000000              2.000000     26.448279   
75%       53.000000   62119.000000              3.000000     39.021673   
max       64.000000  149996.000000              4.000000     93.876090   

       Previous Claims   Vehicle Age  Credit Score  Insurance Duration  \
count     92985.000000  92985.000000  92985.000000        92985.000000   
mean          0.997720      9.488434    574.973544            5.013206   
std           1.000675      5.763703    158.820093            2.580619   
min           0.000000      0.000000 

In [36]:
# Convert Age from float64 to int
print(new_data['Age'])

new_data['Age'] = new_data['Age'].astype(int)

3         60.0
5         38.0
6         56.0
8         40.0
11        41.0
          ... 
278847    59.0
278850    56.0
278852    47.0
278853    28.0
278854    63.0
Name: Age, Length: 92985, dtype: float64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_data['Age'] = new_data['Age'].astype(int)
