In [1]:
# importing required libraries
import os
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Setting default options
# NOTE(review): with no category given this silences every warning from here on,
# including deprecation warnings — consider narrowing it to specific categories.
warnings.filterwarnings("ignore")
# Turn on pandas' copy-on-write mode for the whole session.
pd.set_option("mode.copy_on_write", True)

In [2]:
# Loading the data.
# The CSV location can be overridden with the INSURANCE_DATA_CSV environment variable,
# so the notebook also runs on machines where the default path below does not exist.
DATA_CSV = os.environ.get(
    "INSURANCE_DATA_CSV",
    "/Users/revanth/Downloads/insurance_data_cleaned (1).csv",
)
df = pd.read_csv(DATA_CSV)
# Peek at a few random rows (no random_state, so the rows differ on every run).
df.sample(5)

Unnamed: 0,YEAR,SEX,RACENEW,INCFAM97ON2,HINOTCOVE,CNLUNG,SMK,Occupation_Code
1388662,2015,2,100.0,10.0,1.0,1.0,2,1
849694,2009,2,100.0,10.0,2.0,1.0,2,28
815109,2009,2,100.0,20.0,1.0,1.0,1,1
381295,2004,1,100.0,0.0,1.0,1.0,0,0
1001021,2011,2,100.0,20.0,1.0,1.0,2,2


In [3]:
# Summary statistics (count, mean, std, min, quartiles, max) for every numeric column.
df.describe()

Unnamed: 0,YEAR,SEX,RACENEW,INCFAM97ON2,HINOTCOVE,CNLUNG,SMK,Occupation_Code
count,1697896.0,1697896.0,1697896.0,1697896.0,1697896.0,1697896.0,1697896.0,1697896.0
mean,2008.943,1.513977,147.8627,22.20655,1.153564,1.000926,0.4867907,2.696913
std,5.46955,0.4998047,104.7671,20.76822,0.3605306,0.0304234,0.7823743,6.277511
min,2000.0,1.0,100.0,0.0,1.0,1.0,0.0,0.0
25%,2004.0,1.0,100.0,10.0,1.0,1.0,0.0,0.0
50%,2009.0,2.0,100.0,20.0,1.0,1.0,0.0,0.0
75%,2014.0,2.0,100.0,31.0,1.0,1.0,1.0,1.0
max,2018.0,2.0,541.0,96.0,2.0,2.0,2.0,33.0


In [4]:
# Column dtypes, row count and memory footprint of the raw frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1697896 entries, 0 to 1697895
Data columns (total 8 columns):
 #   Column           Dtype  
---  ------           -----  
 0   YEAR             int64  
 1   SEX              int64  
 2   RACENEW          float64
 3   INCFAM97ON2      float64
 4   HINOTCOVE        float64
 5   CNLUNG           float64
 6   SMK              int64  
 7   Occupation_Code  int64  
dtypes: float64(4), int64(4)
memory usage: 103.6 MB


In [5]:
# Count the NaN entries in each column.
# This only detects real NaNs; codes that may stand for "missing" are looked at
# in the column-by-column checks below.
df.isnull().sum()

YEAR               0
SEX                0
RACENEW            0
INCFAM97ON2        0
HINOTCOVE          0
CNLUNG             0
SMK                0
Occupation_Code    0
dtype: int64

## Column-by-column checks

In [6]:
# Number of rows per YEAR value, listed in ascending year order.
df["YEAR"].value_counts().sort_index()

YEAR
2000     97463
2001     97425
2002     90187
2003     89081
2004     91281
2005     95362
2006     73159
2007     73312
2008     71748
2009     85511
2010     86990
2011     98195
2012    104116
2013    100617
2014    107565
2015     99511
2016     92693
2017     74441
2018     69239
Name: count, dtype: int64

In [7]:
# Number of rows per SEX code (left in count order; the other checks sort by code).
df["SEX"].value_counts()

SEX
2    872680
1    825216
Name: count, dtype: int64

In [8]:
# Number of rows per RACENEW code, sorted by code.
df["RACENEW"].value_counts().sort_index()

RACENEW
100.0    1285378
200.0     242032
300.0      17587
400.0      95747
520.0      16430
530.0       3443
541.0      37279
Name: count, dtype: int64

Treat the `RACENEW` codes 530 and 541 as missing values.

In [9]:
# Number of rows per INCFAM97ON2 code, sorted by code.
df["INCFAM97ON2"].value_counts().sort_index()

INCFAM97ON2
0.0     175037
10.0    519508
20.0    456265
30.0    114916
31.0    115156
32.0    219791
96.0     97223
Name: count, dtype: int64

Treat the `INCFAM97ON2` code 0 as a missing value. **TODO:** decide whether code 96 should be treated as missing as well.

In [10]:
# Number of rows per HINOTCOVE code, sorted by code.
df["HINOTCOVE"].value_counts().sort_index()

HINOTCOVE
1.0    1437160
2.0     260736
Name: count, dtype: int64

In [11]:
# Number of rows per CNLUNG code, sorted by code.
df["CNLUNG"].value_counts().sort_index()

CNLUNG
1.0    1696323
2.0       1573
Name: count, dtype: int64

In [12]:
# Number of rows per SMK code, sorted by code.
df["SMK"].value_counts().sort_index()

SMK
0    1178936
1     211400
2     307560
Name: count, dtype: int64

In [13]:
# Number of rows per Occupation_Code value, sorted by code.
df["Occupation_Code"].value_counts().sort_index()

Occupation_Code
0     1253029
1       52589
2       30911
3       36977
4       29478
5       23563
6       25299
7       18055
8       15435
9       19849
10      20819
11      17514
12      11752
13      13359
14      10436
15       6501
16       6320
17       7049
18       6934
19      12328
20      10480
21       8820
22      10194
23       3445
24       1614
25      10909
26       3606
27       7954
28       3632
29       7725
30       4012
31       2593
32       1799
33       2916
Name: count, dtype: int64

## Analysis

Only the following columns are required for the analysis:
- Race
- Income
- Smoke
- Occupation

In [14]:
# Build the analysis frame: keep only the columns we need and give them short names.
# col_map is the single place that lists them — keys are the raw column names in `df`,
# values are the short names used from here on.
col_map = {
    "RACENEW": "race",
    "INCFAM97ON2": "inc",
    "SMK": "smk",
    "Occupation_Code": "occ",
}
required_columns = list(col_map)

# rename() returns a new frame, so no in-place modification of a column subset is needed
# and `df` itself is left untouched.
df_analysis = df[required_columns].rename(columns=col_map)
df_analysis.sample(5)

Unnamed: 0,race,inc,smk,occ
738483,100.0,10.0,0,0
1362255,100.0,0.0,0,0
98392,100.0,96.0,1,0
126772,100.0,30.0,1,29
1550784,100.0,31.0,0,0


In [15]:
# Remapping rules.
# Each list holds the codes of one column that are to be treated as "missing":
# the remapping step turns a listed code into 1 and every other code into 0.
# NOTE(review): the INCFAM97ON2 notes above leave open whether 96 belongs in remap_inc.
remap_race = [530, 541]
remap_smk = [0]
remap_inc = [0]
remap_occ = [0]

In [16]:
def converter(val, remap_list):
    """Flag a single value as "missing".

    Parameters
    ----------
    val : the value to classify.
    remap_list : collection of values that count as "missing".

    Returns
    -------
    int : 1 when `val` is found in `remap_list`, 0 otherwise.
    """
    return int(val in remap_list)

In [17]:
# Applying the remapping rules: every analysis column becomes a 0/1 flag
# (1 = the raw code is listed in that column's remap_* list, 0 = any other code).
#
# The flags are computed from the raw columns of `df`, not from df_analysis itself, so
# re-running this cell reproduces the same flags instead of remapping the 0/1 values again.
# isin() tests the whole column at once, which avoids one Python call per value.
remap_rules = {
    "race": remap_race,
    "inc": remap_inc,
    "smk": remap_smk,
    "occ": remap_occ,
}
for raw_col, flag_col in col_map.items():
    df_analysis[flag_col] = df[raw_col].isin(remap_rules[flag_col]).astype("int64")

In [18]:
def any_null(row):
    """Return 1 if at least one entry of `row` equals 1, otherwise 0.

    `row` must expose a `.values` array, e.g. a pandas Series holding one record.
    """
    has_flag = 1 in row.values
    return 1 if has_flag else 0

In [19]:
# Flag the rows in which at least one of the remapped columns is marked 1.
# Only the columns named in col_map are inspected (not whatever else is in the frame),
# and the check runs column-wise on the whole frame instead of one Python call per row.
flag_cols = list(col_map.values())
df_analysis["any_null"] = df_analysis[flag_cols].eq(1).any(axis=1).astype("int64")

In [20]:
# Spot-check a few random rows of the flagged frame (no random_state, so rows differ per run).
df_analysis.sample(5)

Unnamed: 0,race,inc,smk,occ,any_null
912883,0,0,0,0,0
109374,0,0,1,1,1
94230,0,0,0,0,0
1449949,0,0,1,1,1
743520,0,0,0,0,0


In [21]:
# Every column is a 0/1 flag at this point, so each `mean` is the share of rows flagged 1.
df_analysis.describe()

Unnamed: 0,race,inc,smk,occ,any_null
count,1697896.0,1697896.0,1697896.0,1697896.0,1697896.0
mean,0.0239838,0.1030905,0.6943511,0.7379893,0.7587379
std,0.1529987,0.3040772,0.4606818,0.4397286,0.4278491
min,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,1.0
50%,0.0,0.0,1.0,1.0,1.0
75%,0.0,0.0,1.0,1.0,1.0
max,1.0,1.0,1.0,1.0,1.0


In [22]:
# Row-level "any value is 1" check that ignores a given set of columns.
def any_null_exclude(row, exclude_lst):
    """Return 1 if any entry of `row` outside `exclude_lst` equals 1, otherwise 0.

    Parameters
    ----------
    row : pandas Series for one record, indexed by column name.
    exclude_lst : labels to drop from `row` before checking; each label must be
        present in the row's index (Series.drop raises KeyError otherwise).
    """
    kept = row.drop(exclude_lst)
    return 1 if 1 in kept.values else 0

In [23]:
# Same flag as any_null but ignoring occ: 1 if race, inc or smk is marked 1.
# The columns to inspect are listed explicitly, so flag columns added to the frame
# before or after this cell can never leak into the result; the check is vectorised
# rather than run once per row.
cols_without_occ = [col for col in col_map.values() if col != "occ"]
df_analysis["any_null_exclude_occ"] = (
    df_analysis[cols_without_occ].eq(1).any(axis=1).astype("int64")
)

In [24]:
# Compare the `mean` row: the share of rows flagged with occ (any_null) vs. without it.
df_analysis.describe()

Unnamed: 0,race,inc,smk,occ,any_null,any_null_exclude_occ
count,1697896.0,1697896.0,1697896.0,1697896.0,1697896.0,1697896.0
mean,0.0239838,0.1030905,0.6943511,0.7379893,0.7587379,0.719782
std,0.1529987,0.3040772,0.4606818,0.4397286,0.4278491,0.4491058
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,1.0,0.0
50%,0.0,0.0,1.0,1.0,1.0,1.0
75%,0.0,0.0,1.0,1.0,1.0,1.0
max,1.0,1.0,1.0,1.0,1.0,1.0


In [25]:
# Ignoring both occ and inc: 1 if race or smk is marked 1.
# The columns to inspect are listed explicitly, so previously derived flag columns
# do not have to be excluded by hand; the check is vectorised rather than run once per row.
cols_without_occ_inc = [col for col in col_map.values() if col not in ("occ", "inc")]
df_analysis["any_null_exclude_occ_inc"] = (
    df_analysis[cols_without_occ_inc].eq(1).any(axis=1).astype("int64")
)

In [26]:
# Compare the `mean` row across the three any_null* columns to see how the flagged share
# changes as occ and then inc are left out.
df_analysis.describe()

Unnamed: 0,race,inc,smk,occ,any_null,any_null_exclude_occ,any_null_exclude_occ_inc
count,1697896.0,1697896.0,1697896.0,1697896.0,1697896.0,1697896.0,1697896.0
mean,0.0239838,0.1030905,0.6943511,0.7379893,0.7587379,0.719782,0.6998232
std,0.1529987,0.3040772,0.4606818,0.4397286,0.4278491,0.4491058,0.4583348
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,1.0,0.0,0.0
50%,0.0,0.0,1.0,1.0,1.0,1.0,1.0
75%,0.0,0.0,1.0,1.0,1.0,1.0,1.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0
