# Handling missing values

In [49]:
import pandas as pd

In [50]:
# Read the landslide records from a CSV at a relative path and preview the top rows.
LANDSLIDES_CSV = "./data/landslides.csv"
df = pd.read_csv(LANDSLIDES_CSV)
df.head()

Unnamed: 0,id,date,time,country_name,state/province,population,landslide_type,trigger,fatalities
0,34,3/2/07,Night,United States,Virginia,16000,Landslide,Rain,
1,42,3/22/07,,United States,Ohio,17288,Landslide,Rain,
2,56,4/6/07,,United States,Pennsylvania,15930,Landslide,Rain,
3,59,4/14/07,,Canada,Quebec,42786,Riverbank collapse,Rain,
4,61,4/15/07,,United States,Kentucky,6903,Landslide,Downpour,0.0


## Check and handle missing values

In [51]:
# Show the whole frame; pandas truncates the middle rows in the rendered output.
df

Unnamed: 0,id,date,time,country_name,state/province,population,landslide_type,trigger,fatalities
0,34,3/2/07,Night,United States,Virginia,16000,Landslide,Rain,
1,42,3/22/07,,United States,Ohio,17288,Landslide,Rain,
2,56,4/6/07,,United States,Pennsylvania,15930,Landslide,Rain,
3,59,4/14/07,,Canada,Quebec,42786,Riverbank collapse,Rain,
4,61,4/15/07,,United States,Kentucky,6903,Landslide,Downpour,0.0
...,...,...,...,...,...,...,...,...,...
1688,7535,12/7/15,,United States,North Carolina,1646,Rockfall,,0.0
1689,7537,2/22/16,0:00,United States,West Virginia,51400,Mudslide,Unknown,0.0
1690,7539,2/23/16,,United States,West Virginia,2406,Landslide,Rain,0.0
1691,7540,2/26/16,21:06,United States,West Virginia,1048,Rockfall,Unknown,0.0


In [52]:
# Preview the frame without the rows whose `date` is missing. The result is only
# displayed, not assigned, so `df` itself still contains those rows afterwards.
df.dropna(subset=["date"])

Unnamed: 0,id,date,time,country_name,state/province,population,landslide_type,trigger,fatalities
0,34,3/2/07,Night,United States,Virginia,16000,Landslide,Rain,
1,42,3/22/07,,United States,Ohio,17288,Landslide,Rain,
2,56,4/6/07,,United States,Pennsylvania,15930,Landslide,Rain,
3,59,4/14/07,,Canada,Quebec,42786,Riverbank collapse,Rain,
4,61,4/15/07,,United States,Kentucky,6903,Landslide,Downpour,0.0
...,...,...,...,...,...,...,...,...,...
1688,7535,12/7/15,,United States,North Carolina,1646,Rockfall,,0.0
1689,7537,2/22/16,0:00,United States,West Virginia,51400,Mudslide,Unknown,0.0
1690,7539,2/23/16,,United States,West Virginia,2406,Landslide,Rain,0.0
1691,7540,2/26/16,21:06,United States,West Virginia,1048,Rockfall,Unknown,0.0


In [53]:
# Total number of missing cells across every column of the frame.
df.isna().sum().sum()

1318

In [54]:
# Disabled alternative: back-fill every column from the next valid row.
# It is left commented out, so the missing values are still present in `df`.
# NOTE(review): newer pandas releases deprecate `fillna(method=...)` in favour of
# `df.bfill()` — confirm against the installed version before re-enabling.
# df = df.fillna(method='bfill')

In [55]:
# How many entries of the `time` column are missing?
df["time"].isna().sum()

1064

In [56]:
# Re-check the overall number of missing cells in the frame.
df.isna().sum().sum()

1318

In [57]:
# Display the frame again before inspecting the per-column statistics.
df

Unnamed: 0,id,date,time,country_name,state/province,population,landslide_type,trigger,fatalities
0,34,3/2/07,Night,United States,Virginia,16000,Landslide,Rain,
1,42,3/22/07,,United States,Ohio,17288,Landslide,Rain,
2,56,4/6/07,,United States,Pennsylvania,15930,Landslide,Rain,
3,59,4/14/07,,Canada,Quebec,42786,Riverbank collapse,Rain,
4,61,4/15/07,,United States,Kentucky,6903,Landslide,Downpour,0.0
...,...,...,...,...,...,...,...,...,...
1688,7535,12/7/15,,United States,North Carolina,1646,Rockfall,,0.0
1689,7537,2/22/16,0:00,United States,West Virginia,51400,Mudslide,Unknown,0.0
1690,7539,2/23/16,,United States,West Virginia,2406,Landslide,Rain,0.0
1691,7540,2/26/16,21:06,United States,West Virginia,1048,Rockfall,Unknown,0.0


In [58]:
# Print column dtypes and non-null counts; a column whose non-null count is
# lower than the number of entries contains missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1693 entries, 0 to 1692
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   id              1693 non-null   int64  
 1   date            1690 non-null   object 
 2   time            629 non-null    object 
 3   country_name    1693 non-null   object 
 4   state/province  1692 non-null   object 
 5   population      1693 non-null   int64  
 6   landslide_type  1692 non-null   object 
 7   trigger         1691 non-null   object 
 8   fatalities      1446 non-null   float64
dtypes: float64(1), int64(2), object(6)
memory usage: 119.2+ KB


In [59]:
# Missing-value count for each column.
df.isna().sum()

id                   0
date                 3
time              1064
country_name         0
state/province       1
population           0
landslide_type       1
trigger              2
fatalities         247
dtype: int64

In [60]:
# Look at the frame once more; `fatalities` still has empty entries in the first rows.
df

Unnamed: 0,id,date,time,country_name,state/province,population,landslide_type,trigger,fatalities
0,34,3/2/07,Night,United States,Virginia,16000,Landslide,Rain,
1,42,3/22/07,,United States,Ohio,17288,Landslide,Rain,
2,56,4/6/07,,United States,Pennsylvania,15930,Landslide,Rain,
3,59,4/14/07,,Canada,Quebec,42786,Riverbank collapse,Rain,
4,61,4/15/07,,United States,Kentucky,6903,Landslide,Downpour,0.0
...,...,...,...,...,...,...,...,...,...
1688,7535,12/7/15,,United States,North Carolina,1646,Rockfall,,0.0
1689,7537,2/22/16,0:00,United States,West Virginia,51400,Mudslide,Unknown,0.0
1690,7539,2/23/16,,United States,West Virginia,2406,Landslide,Rain,0.0
1691,7540,2/26/16,21:06,United States,West Virginia,1048,Rockfall,Unknown,0.0


In [61]:
# Average of the recorded fatalities. skipna=True (the pandas default) keeps the
# missing entries out of the calculation.
mean = df["fatalities"].mean(skipna=True)

In [62]:
# Inspect the computed mean of the `fatalities` column.
mean

1.4591977869986168

In [63]:
# Replace every missing `fatalities` entry with the value held in `mean`.
df["fatalities"] = df["fatalities"].fillna(value=mean)

In [64]:
# Confirm the result: rows that had no `fatalities` value now show the filled-in mean.
df

Unnamed: 0,id,date,time,country_name,state/province,population,landslide_type,trigger,fatalities
0,34,3/2/07,Night,United States,Virginia,16000,Landslide,Rain,1.459198
1,42,3/22/07,,United States,Ohio,17288,Landslide,Rain,1.459198
2,56,4/6/07,,United States,Pennsylvania,15930,Landslide,Rain,1.459198
3,59,4/14/07,,Canada,Quebec,42786,Riverbank collapse,Rain,1.459198
4,61,4/15/07,,United States,Kentucky,6903,Landslide,Downpour,0.000000
...,...,...,...,...,...,...,...,...,...
1688,7535,12/7/15,,United States,North Carolina,1646,Rockfall,,0.000000
1689,7537,2/22/16,0:00,United States,West Virginia,51400,Mudslide,Unknown,0.000000
1690,7539,2/23/16,,United States,West Virginia,2406,Landslide,Rain,0.000000
1691,7540,2/26/16,21:06,United States,West Virginia,1048,Rockfall,Unknown,0.000000


In [None]:
# Sanity check after the imputation: count the missing values that remain in
# `fatalities` (expected to be 0 once the NaNs have been filled).
df["fatalities"].isna().sum()