In [4]:
#Handling Missing Data 

import numpy as np
import pandas as pd

vals2 = np.array([1, np.nan, 3, 4])
vals2.dtype

dtype('float64')

In [6]:
1 + np.nan

nan

In [8]:
1 - np.nan

nan

In [10]:
# aggregates over values involving NaN
vals2.sum(), vals.min(), vals.max()

(nan, nan, nan)

In [12]:

# Using np.nansum, np.nanmin, np.nanmax to ignore NaN values
print("Sum (ignoring NaN):", np.nansum(vals2))
print("Min (ignoring NaN):", np.nanmin(vals2))
print("Max (ignoring NaN):", np.nanmax(vals2))

Sum (ignoring NaN): 8.0
Min (ignoring NaN): 1.0
Max (ignoring NaN): 4.0


In [16]:
x = [1, 2, 3, 4]
x[0] = None
print(x)


[None, 2, 3, 4]


In [22]:
# operating on Null Values
#isnull : Generate boolean mask where values are NaN
#notnull Generate boolean mask for non-NaN values (notnull is opposite of isnull)
#dropna # Drop rows with any NaN values
#isnull()*dropna(): returns filtered version of  

import numpy as np
import pandas as pd
data = pd.Series([1, np.nan, 'hello', None])
data

0        1
1      NaN
2    hello
3     None
dtype: object

In [24]:
data.isnull()

0    False
1     True
2    False
3     True
dtype: bool

In [26]:
data[data.notnull()]

0        1
2    hello
dtype: object

In [57]:
import numpy as np
import pandas as pd

# Create a 3x3 array with some NaN values
data = np.array([
    [1.0, np.nan, 2.0],
    [2.0, 3.0, 5.0],
    [np.nan, 4.0, 6.0]
])

# Convert to DataFrame for better presentation
df = pd.DataFrame(data)

# Display the DataFrame
print([df.dropna()])


[     0    1    2
1  2.0  3.0  5.0]


In [59]:
df.dropna(axis="columns")

Unnamed: 0,2
0,2.0
1,5.0
2,6.0


In [65]:
df[3] = np.nan
df

Unnamed: 0,0,1,2,3
0,1.0,,2.0,
1,2.0,3.0,5.0,
2,,4.0,6.0,


In [69]:
df.dropna(axis = 'columns', how = 'all') #drops a column with null value

Unnamed: 0,0,1,2
0,1.0,,2.0
1,2.0,3.0,5.0
2,,4.0,6.0


In [71]:
df

Unnamed: 0,0,1,2,3
0,1.0,,2.0,
1,2.0,3.0,5.0,
2,,4.0,6.0,


In [73]:
df.dropna(axis = 'rows', thresh = 3) #keeps only rows with a minimum of 3 non-null values)

Unnamed: 0,0,1,2,3
1,2.0,3.0,5.0,


In [78]:
#Filling null values

data = pd.Series([1, np.nan, 2, None, 3], index = list ('abcde'))
data

a    1.0
b    NaN
c    2.0
d    NaN
e    3.0
dtype: float64

In [82]:
data.fillna(0) #fill NA with a single value , e.g., zero

a    1.0
b    0.0
c    2.0
d    0.0
e    3.0
dtype: float64

In [88]:
data.fillna(method='ffill')  # or method='bfill' #forward fill


  data.fillna(method='ffill')  # or method='bfill'


a    1.0
b    1.0
c    2.0
d    2.0
e    3.0
dtype: float64

In [90]:
df

Unnamed: 0,0,1,2,3
0,1.0,,2.0,
1,2.0,3.0,5.0,
2,,4.0,6.0,


In [92]:
df.fillna(method = 'ffill', axis = 1)

  df.fillna(method = 'ffill', axis = 1)


Unnamed: 0,0,1,2,3
0,1.0,1.0,2.0,2.0
1,2.0,3.0,5.0,5.0
2,,4.0,6.0,6.0
