# Handling missing values

In [1]:
import pandas as pd

In [2]:
df = pd.read_csv("./data/landslides.csv")
df.head()

Unnamed: 0,id,date,time,country_name,state/province,population,landslide_type,trigger,fatalities
0,34,3/2/07,Night,United States,Virginia,16000,Landslide,Rain,
1,42,3/22/07,,United States,Ohio,17288,Landslide,Rain,
2,56,4/6/07,,United States,Pennsylvania,15930,Landslide,Rain,
3,59,4/14/07,,Canada,Quebec,42786,Riverbank collapse,Rain,
4,61,4/15/07,,United States,Kentucky,6903,Landslide,Downpour,0.0


## Check and handle missing values

In [6]:
df_notnull = df[~pd.isnull(df.date)]

In [47]:
df['time'] = df['time'].fillna("Not Known")

In [7]:
df_notnull.time.value_counts()

time
Night            97
Morning          87
Afternoon        58
Early morning    36
3:00:00          12
                 ..
1:13              1
9:40:00           1
11:50:00          1
                  1
21:06             1
Name: count, Length: 159, dtype: int64

In [15]:
df_notnull.time.fillna(df_notnull.time.value_counts()[0], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_notnull.time.fillna(df_notnull.time.value_counts()[0], inplace=True)


In [16]:
df2 = df[~pd.isnull(df.date)]

In [21]:
df2[pd.isnull(df2.time)].time.iloc[:] = "Night"

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2[pd.isnull(df2.time)].time.iloc[:] = "Night"


In [22]:
df3 = df.copy()

In [23]:
mask = pd.isnull(df3.time)

In [24]:
df3.loc[mask, 'time'] = "Night"

In [26]:
df3.time.value_counts()

time
Night            1161
Morning            88
Afternoon          58
Early morning      36
3:00:00            12
                 ... 
1:13                1
9:40:00             1
11:50:00            1
                    1
21:06               1
Name: count, Length: 159, dtype: int64

In [28]:
df2.time.value_counts()

time
Night            97
Morning          87
Afternoon        58
Early morning    36
3:00:00          12
                 ..
1:13              1
9:40:00           1
11:50:00          1
                  1
21:06             1
Name: count, Length: 159, dtype: int64

In [29]:
df4 =  df3[~pd.isnull(df.date)]

In [30]:
mask = pd.isnull(df4.fatalities)

In [33]:
df4.loc[mask, 'fatalities'] = df4.fatalities.mean()

In [34]:
df4.head()

Unnamed: 0,id,date,time,country_name,state/province,population,landslide_type,trigger,fatalities
0,34,3/2/07,Night,United States,Virginia,16000,Landslide,Rain,1.462231
1,42,3/22/07,Night,United States,Ohio,17288,Landslide,Rain,1.462231
2,56,4/6/07,Night,United States,Pennsylvania,15930,Landslide,Rain,1.462231
3,59,4/14/07,Night,Canada,Quebec,42786,Riverbank collapse,Rain,1.462231
4,61,4/15/07,Night,United States,Kentucky,6903,Landslide,Downpour,0.0


## Try from scratch

In [35]:
sdf = pd.read_csv("./data/landslides.csv")

In [36]:
sdf2 = sdf[~pd.isnull(sdf.date)]

In [37]:
mask = pd.isnull(sdf2.time)

In [38]:
sdf2.loc[mask, 'time'] = "Night"

In [39]:
mask = pd.isnull(sdf2.fatalities)

In [40]:
sdf2.loc[mask, 'fatalities'] = sdf2.fatalities.mean()

In [45]:
def print_full(x):
    pd.set_option('display.max_rows', len(x))
    display(x)
    pd.reset_option('display.max_rows')

In [46]:
print_full(sdf2)

Unnamed: 0,id,date,time,country_name,state/province,population,landslide_type,trigger,fatalities
0,34,3/2/07,Night,United States,Virginia,16000,Landslide,Rain,1.462231
1,42,3/22/07,Night,United States,Ohio,17288,Landslide,Rain,1.462231
2,56,4/6/07,Night,United States,Pennsylvania,15930,Landslide,Rain,1.462231
3,59,4/14/07,Night,Canada,Quebec,42786,Riverbank collapse,Rain,1.462231
4,61,4/15/07,Night,United States,Kentucky,6903,Landslide,Downpour,0.0
5,64,4/20/07,Night,United States,Kentucky,6903,Landslide,Rain,1.462231
6,67,4/24/07,Night,United States,South Dakota,2540,Landslide,Rain,1.462231
7,77,5/21/07,Night,Colombia,Risaralda,440118,Mudslide,Rain,13.0
8,105,6/27/07,Night,Ecuador,Zamora-Chinchipe,15276,Landslide,Downpour,1.462231
9,106,6/27/07,Night,Ecuador,Loja,117796,Landslide,Downpour,1.462231
