In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

import warnings
# Suppress every warning category for the remainder of the session.
# NOTE(review): this blanket filter also hides deprecation warnings raised by
# the libraries imported above — consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

In [2]:
# Load the fatalities table from a CSV file in the current working directory.
# NOTE(review): `Value` is read as object dtype rather than numeric (see the
# .info() output further down) — presumably it holds non-numeric placeholders
# or formatted numbers; confirm and convert before doing arithmetic on it.
fatalities_df = pd.read_csv("./fatalities.csv")

In [3]:
# Preview the first five rows of the loaded table.
fatalities_df.head(n=5)

Unnamed: 0,Year,ICD10 Code,ICD10 Diagnosis,Diagnosis Type,Metric,Sex,Value
0,2014,All codes,All deaths,All deaths,Number of observed deaths,,459087
1,2014,C33-C34 & C00-C14 & C15 & C32 & C53 & C67 & C6...,All deaths which can be caused by smoking,All deaths which can be caused by smoking,Number of observed deaths,,235820
2,2014,C00-D48,All cancers,All cancers,Number of observed deaths,,136312
3,2014,J00-J99,All respiratory diseases,All respiratory diseases,Number of observed deaths,,61744
4,2014,I00-I99,All circulatory diseases,All circulatory diseases,Number of observed deaths,,126101


In [4]:
# Dimensions of the loaded table as (rows, columns).
fatalities_df.shape

(1749, 7)

In [5]:
# List the column labels of the table.
fatalities_df.columns

Index(['Year', 'ICD10 Code', 'ICD10 Diagnosis', 'Diagnosis Type', 'Metric',
       'Sex', 'Value'],
      dtype='object')

In [6]:
# Per-column non-null counts and dtypes, plus the frame's memory usage.
fatalities_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1749 entries, 0 to 1748
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Year             1749 non-null   int64 
 1   ICD10 Code       1749 non-null   object
 2   ICD10 Diagnosis  1749 non-null   object
 3   Diagnosis Type   1749 non-null   object
 4   Metric           1749 non-null   object
 5   Sex              1166 non-null   object
 6   Value            1749 non-null   object
dtypes: int64(1), object(6)
memory usage: 95.8+ KB


### *Exploratory Data Analysis*

In [7]:
# Summary statistics of the numeric columns.
# Only `Year` is summarised here because `Value` is stored as object dtype,
# which describe() skips by default.
fatalities_df.describe()

Unnamed: 0,Year
count,1749.0
mean,2009.0
std,3.163182
min,2004.0
25%,2006.0
50%,2009.0
75%,2012.0
max,2014.0


In [8]:
# Summary statistics (count, unique, top, freq) of the object-dtype columns
fatalities_df.describe(include=['object'])

Unnamed: 0,ICD10 Code,ICD10 Diagnosis,Diagnosis Type,Metric,Sex,Value
count,1749,1749,1749,1749,1166,1749
unique,27,27,10,2,2,1024
top,All codes,All deaths,Cancers which can be caused by smoking,Number of observed deaths,Male,100
freq,66,66,726,891,583,70


### *Handling Missing Values*

In [11]:
# Count the missing entries in each column
fatalities_df.isna().sum()

Year                 0
ICD10 Code           0
ICD10 Diagnosis      0
Diagnosis Type       0
Metric               0
Sex                583
Value                0
dtype: int64

In [12]:
# Fraction of missing entries in each column
fatalities_df.isna().mean()

Year               0.000000
ICD10 Code         0.000000
ICD10 Diagnosis    0.000000
Diagnosis Type     0.000000
Metric             0.000000
Sex                0.333333
Value              0.000000
dtype: float64

In [14]:
# Replace missing 'Sex' entries with the placeholder label "Unknown".
# NOTE(review): the rows without a 'Sex' value make up exactly one third of
# the table and the previewed ones carry overall totals, so they may be
# combined male+female figures rather than unknowns — confirm before relying
# on this label in later analysis.
fatalities_df['Sex'] = fatalities_df['Sex'].where(fatalities_df['Sex'].notna(), "Unknown")

In [15]:
# Display the 'Sex' column after the missing-value fill.
fatalities_df['Sex']

0       Unknown
1       Unknown
2       Unknown
3       Unknown
4       Unknown
         ...   
1744     Female
1745     Female
1746     Female
1747     Female
1748     Female
Name: Sex, Length: 1749, dtype: object