In [1]:
# importing required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings

# Setting default options
warnings.filterwarnings("ignore")
pd.set_option("mode.copy_on_write", True)

In [2]:
# loading the Data
df = pd.read_csv("/Users/revanth/Downloads/insurance_data_cleaned.csv")
df.sample(5)

Unnamed: 0,YEAR,SEX,RACENEW,INCFAM97ON2,HINOTCOVE,CNLUNG,SMK,Occupation_Code
28202,2012,2,100.0,20.0,1.0,1.0,2,1
35163,2014,2,100.0,10.0,1.0,1.0,2,1
13752,2006,2,100.0,10.0,1.0,1.0,2,1
29656,2013,2,100.0,20.0,1.0,1.0,2,5
13700,2006,2,200.0,10.0,1.0,1.0,1,0


In [3]:
df.describe()

Unnamed: 0,YEAR,SEX,RACENEW,INCFAM97ON2,HINOTCOVE,CNLUNG,SMK,Occupation_Code
count,48854.0,48854.0,48854.0,48854.0,48854.0,48854.0,48854.0,48854.0
mean,2009.918226,1.603246,121.718078,19.889262,1.049597,1.032198,1.459185,6.947271
std,5.529163,0.489229,72.960737,18.287774,0.217113,0.176527,0.512591,7.390755
min,2000.0,1.0,100.0,0.0,1.0,1.0,0.0,0.0
25%,2005.0,1.0,100.0,10.0,1.0,1.0,1.0,1.0
50%,2011.0,2.0,100.0,20.0,1.0,1.0,1.0,4.0
75%,2015.0,2.0,100.0,30.0,1.0,1.0,2.0,10.0
max,2018.0,2.0,541.0,96.0,2.0,2.0,2.0,33.0


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48854 entries, 0 to 48853
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   YEAR             48854 non-null  int64  
 1   SEX              48854 non-null  int64  
 2   RACENEW          48854 non-null  float64
 3   INCFAM97ON2      48854 non-null  float64
 4   HINOTCOVE        48854 non-null  float64
 5   CNLUNG           48854 non-null  float64
 6   SMK              48854 non-null  int64  
 7   Occupation_Code  48854 non-null  int64  
dtypes: float64(4), int64(4)
memory usage: 3.0 MB


In [17]:
# check if any actual null values
df.isnull().sum()

YEAR               0
SEX                0
RACENEW            0
INCFAM97ON2        0
HINOTCOVE          0
CNLUNG             0
SMK                0
Occupation_Code    0
dtype: int64

Column by column checks

In [5]:
df["YEAR"].value_counts().sort_index()

YEAR
2000    2142
2001    2311
2002    2240
2003    2125
2004    2313
2005    2408
2006    1722
2007    1770
2008    1842
2009    2297
2010    2315
2011    2850
2012    3101
2013    3072
2014    3426
2015    3277
2016    3666
2017    3053
2018    2924
Name: count, dtype: int64

In [6]:
df["SEX"].value_counts()

SEX
2    29471
1    19383
Name: count, dtype: int64

In [7]:
df["RACENEW"].value_counts().sort_index()

RACENEW
100.0    43001
200.0     3886
300.0      268
400.0      911
520.0       75
530.0       38
541.0      675
Name: count, dtype: int64

consider 530 and 540 as the NaN values

In [8]:
df["INCFAM97ON2"].value_counts().sort_index()

INCFAM97ON2
0.0      3880
10.0    19753
20.0    12727
30.0     2018
31.0     3148
32.0     5344
96.0     1984
Name: count, dtype: int64

0 as Nan, should we consider 96 as well?

In [9]:
df["HINOTCOVE"].value_counts().sort_index()

HINOTCOVE
1.0    46431
2.0     2423
Name: count, dtype: int64

In [10]:
df["CNLUNG"].value_counts().sort_index()

CNLUNG
1.0    47281
2.0     1573
Name: count, dtype: int64

In [11]:
df["SMK"].value_counts().sort_index()

SMK
0      352
1    25717
2    22785
Name: count, dtype: int64

In [12]:
df["Occupation_Code"].value_counts().sort_index()

Occupation_Code
0     7773
1     6292
2     3947
3     3585
4     3393
5     3034
6     2093
7     2070
8     1650
9     1507
10    1346
11    1208
12    1161
13    1119
14     848
15     808
16     697
17     680
18     674
19     611
20     553
21     524
22     516
23     444
24     373
25     370
26     363
27     363
28     290
29     226
30     111
31      96
32      70
33      59
Name: count, dtype: int64

## Analysis

Only The following columns are required for the analysis:
- Race
- Income
- Smoke
- Occupation

In [13]:
# creating a subset of the data with only the columns we need
required_columns = ["RACENEW", "INCFAM97ON2", "SMK", "Occupation_Code"]

df_analysis = df[required_columns]

# rename the columns
col_map = {
    "RACENEW": "race",
    "INCFAM97ON2": "inc",
    "SMK": "smk",
    "Occupation_Code": "occ",
}
df_analysis.rename(columns=col_map, inplace=True)
df_analysis.sample(5)

Unnamed: 0,race,inc,smk,occ
48258,100.0,10.0,1,11
42280,100.0,20.0,2,0
2240,100.0,10.0,1,20
14963,200.0,10.0,1,8
46582,100.0,20.0,1,12


In [14]:
# Remapping rules,
# Add the values which need to be considered as 1 in remapping, rest will be considered as 0
remap_race = [530, 540]
remap_smk = [0]
remap_inc = [0]
remap_occ = [0]

In [15]:
def converter(val, remap_list):
    # takes in a value and  a list of values to consider as "missing" which will be marked as 1 and the rest will be marked as 0
    if val in remap_list:
        return 1
    else:
        return 0

In [16]:
# applying remapping rules
df_analysis["inc"] = df_analysis["inc"].apply(lambda x: converter(x, remap_inc))
df_analysis["occ"] = df_analysis["occ"].apply(lambda x: converter(x, remap_occ))
df_analysis["race"] = df_analysis["race"].apply(lambda x: converter(x, remap_race))
df_analysis["smk"] = df_analysis["smk"].apply(lambda x: converter(x, remap_smk))

In [18]:
def any_null(row):
    # check if any value in the row is 1 and returns 1 if any value is 1
    if 1 in row.values:
        return 1
    else:
        return 0

In [19]:
# applying funtion to the dataframe
df_analysis["any_null"] = df_analysis.apply(lambda x: any_null(x), axis=1)

In [20]:
df_analysis.sample(5)

Unnamed: 0,race,inc,smk,occ,any_null
37358,0,0,0,0,0
10276,0,0,0,0,0
46214,0,0,0,0,0
30238,0,0,0,0,0
17996,0,0,0,0,0


In [21]:
df_analysis.describe()

Unnamed: 0,race,inc,smk,occ,any_null
count,48854.0,48854.0,48854.0,48854.0,48854.0
mean,0.000778,0.07942,0.007205,0.159107,0.226512
std,0.027879,0.270396,0.084578,0.365779,0.418578
min,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0


In [23]:
# function to exclude columns and calculate the any_nul
def any_null_exclude(row, exclude_lst):
    # check if any value in the row is 1 and returns 1 if any value is 1
    if 1 in row.drop(exclude_lst).values:
        return 1
    else:
        return 0

In [24]:
# excluding the occ and any_null columns
df_analysis["any_null_exclude_occ"] = df_analysis.apply(
    lambda x: any_null_exclude(x, ["occ", "any_null"]), axis=1
)

In [25]:
df_analysis.describe()

Unnamed: 0,race,inc,smk,occ,any_null,any_null_exclude_occ
count,48854.0,48854.0,48854.0,48854.0,48854.0,48854.0
mean,0.000778,0.07942,0.007205,0.159107,0.226512,0.085766
std,0.027879,0.270396,0.084578,0.365779,0.418578,0.280021
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0


In [26]:
# excluding the inc and any_null columns
df_analysis["any_null_exclude_occ_inc"] = df_analysis.apply(
    lambda x: any_null_exclude(x, ["occ", "any_null", "inc", "any_null_exclude_occ"]),
    axis=1,
)

In [28]:
df_analysis.describe()

Unnamed: 0,race,inc,smk,occ,any_null,any_null_exclude_occ,any_null_exclude_occ_inc
count,48854.0,48854.0,48854.0,48854.0,48854.0,48854.0,48854.0
mean,0.000778,0.07942,0.007205,0.159107,0.226512,0.085766,0.007983
std,0.027879,0.270396,0.084578,0.365779,0.418578,0.280021,0.088991
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0
