In [19]:
import numpy as np
import pandas as pd

In [16]:
# A None among the values forces NumPy to fall back to a generic
# Python-object array (see the dtype in the output below).
a = np.array([1, 2, 3, None, 5], dtype=object)
a

array([1, 2, 3, None, 5], dtype=object)

In [17]:
# Any mathematical operation on None results in an error: reducing the
# object array has to evaluate `int + None`, which raises TypeError.
# The error is caught and printed (same text as the traceback's last line)
# so that "Restart & Run All" can continue past this demonstration cell.
try:
    a.sum()
except TypeError as exc:
    sum_error = f"{type(exc).__name__}: {exc}"
    print(sum_error)

TypeError: unsupported operand type(s) for +: 'int' and 'NoneType'

In [None]:
# Let's add a number to None directly: plain Python raises the same kind of
# TypeError. It is caught and printed so the notebook keeps running when
# executed top to bottom.
try:
    None + 3
except TypeError as exc:
    none_add_error = f"{type(exc).__name__}: {exc}"
    print(none_add_error)

TypeError: unsupported operand type(s) for +: 'NoneType' and 'int'

# NumPy and NaN

In [None]:
# np.nan is itself a float, so the array keeps a native floating-point dtype
# instead of falling back to object.
a = np.array([1, 2, 3, np.nan, 5], dtype=float)
a

array([ 1.,  2.,  3., nan,  5.])

In [None]:
# Any mathematical operation involving NaN results in nan (no error is
# raised, unlike with None).
print(a.sum(axis=None))

nan


In [None]:
# NumPy provides NaN-aware aggregations that ignore the missing values:
# sum, min, max and median, printed in that order.
print(*(aggregate(a) for aggregate in (np.nansum, np.nanmin, np.nanmax, np.nanmedian)))

11.0 1.0 5.0 2.5


# Identifying Null

In [None]:
# Both np.nan and None are reported as null by pandas.
data_with_missing_val = pd.Series([1, np.nan, 'hello', None])
missing_mask = data_with_missing_val.isnull()
print(missing_mask)

0    False
1     True
2    False
3     True
dtype: bool


In [None]:
# Keep only the entries that are not null (boolean-mask selection).
data_with_missing_val.loc[~data_with_missing_val.isnull()]

0        1
2    hello
dtype: object

In [20]:
# Column name -> list of values; missing entries are written as np.nan and,
# in 'Fifth', as None.
dict_ = dict(
    First=[90, 90, np.nan, 95],
    Second=[33, 45, 56, np.nan],
    Third=[np.nan, 45, 80, 98],
    Fourth=[50, 60, 60, 70],
    Fifth=[33, 45, 56, None],
)

# Build the frame from the dictionary of lists.
df = pd.DataFrame(data=dict_)

# Boolean frame marking which cells are missing.
df.isnull()


Unnamed: 0,First,Second,Third,Fourth,Fifth
0,False,False,True,False,False
1,False,False,False,False,False
2,True,False,False,False,False
3,False,True,False,False,True


# Working with NaN

In [None]:
# True where the 'First' column holds an actual value.
df.loc[:, 'First'].notna()

0     True
1     True
2    False
3     True
Name: First, dtype: bool

In [None]:
# Replace every missing value with 0 and show the filled frame.
df = df.fillna(value=0)
df

Unnamed: 0,First,Second,Third,Fourth,Fifth
0,90.0,33.0,0.0,50,33.0
1,90.0,45.0,45.0,60,45.0
2,0.0,56.0,80.0,60,56.0
3,95.0,0.0,98.0,70,0.0


In [None]:
# Start again from the raw dictionary and print the frame, then forward-fill:
# each NaN takes the last valid value above it in the same column (a NaN in
# the first row has nothing above it and stays NaN).
df = pd.DataFrame(data=dict_)
print(df)

df.ffill(axis=0)

   First  Second  Third  Fourth  Fifth
0   90.0    33.0    NaN      50   33.0
1   90.0    45.0   45.0      60   45.0
2    NaN    56.0   80.0      60   56.0
3   95.0     NaN   98.0      70    NaN


Unnamed: 0,First,Second,Third,Fourth,Fifth
0,90.0,33.0,,50,33.0
1,90.0,45.0,45.0,60,45.0
2,90.0,56.0,80.0,60,56.0
3,95.0,56.0,98.0,70,56.0


In [None]:
# Rebuild the frame from the raw dictionary so the next example starts from
# the original missing values.
df = pd.DataFrame(data=dict_)
print(df)


   First  Second  Third  Fourth  Fifth
0   90.0    33.0    NaN      50   33.0
1   90.0    45.0   45.0      60   45.0
2    NaN    56.0   80.0      60   56.0
3   95.0     NaN   98.0      70    NaN


In [None]:
# Use -99 as a sentinel in place of every NaN and show the result.
df = df.replace(np.nan, -99)
df

Unnamed: 0,First,Second,Third,Fourth,Fifth
0,90.0,33.0,-99.0,50,33.0
1,90.0,45.0,45.0,60,45.0
2,-99.0,56.0,80.0,60,56.0
3,95.0,-99.0,98.0,70,-99.0


In [None]:
# Rebuild the frame from the raw dictionary so the next example starts from
# the original missing values.
df = pd.DataFrame(data=dict_)
print(df)

   First  Second  Third  Fourth  Fifth
0   90.0    33.0    NaN      50   33.0
1   90.0    45.0   45.0      60   45.0
2    NaN    56.0   80.0      60   56.0
3   95.0     NaN   98.0      70    NaN


In [None]:
# Keep only the rows that have no missing value in any column.
df = df.dropna()
print('After Dropping')
df

After Dropping


Unnamed: 0,First,Second,Third,Fourth,Fifth
1,90.0,45.0,45.0,60,45.0


In [None]:
# Rebuild the frame from the raw dictionary so the next example starts from
# the original missing values.
df = pd.DataFrame(data=dict_)
print(df)

   First  Second  Third  Fourth  Fifth
0   90.0    33.0    NaN      50   33.0
1   90.0    45.0   45.0      60   45.0
2    NaN    56.0   80.0      60   56.0
3   95.0     NaN   98.0      70    NaN


In [None]:
# Impute each column's missing entries with that column's own mean.
df = df.fillna(value=df.mean())
df

Unnamed: 0,First,Second,Third,Fourth,Fifth
0,90.0,33.0,74.333333,50,33.0
1,90.0,45.0,45.0,60,45.0
2,91.666667,56.0,80.0,60,56.0
3,95.0,44.666667,98.0,70,44.666667


In [22]:
# Rebuild the frame from the raw dictionary so the next example starts from
# the original missing values.
df = pd.DataFrame(data=dict_)
print(df)

   First  Second  Third  Fourth  Fifth
0   90.0    33.0    NaN      50   33.0
1   90.0    45.0   45.0      60   45.0
2    NaN    56.0   80.0      60   56.0
3   95.0     NaN   98.0      70    NaN


In [21]:
# Mean of the 'First' column; the NaN entry does not take part in it.
means = df.loc[:, 'First'].mean()
print(means)

91.66666666666667


In [23]:
# Impute only the 'First' column with its mean; the other columns keep
# their NaNs.
filled = df['First'].fillna(value=means)
df.loc[:, 'First'] = filled

df

Unnamed: 0,First,Second,Third,Fourth,Fifth
0,90.0,33.0,,50,33.0
1,90.0,45.0,45.0,60,45.0
2,91.666667,56.0,80.0,60,56.0
3,95.0,,98.0,70,


In [24]:
# Rebuild the frame from the raw dictionary so the next example starts from
# the original missing values.
df = pd.DataFrame(data=dict_)
print(df)

   First  Second  Third  Fourth  Fifth
0   90.0    33.0    NaN      50   33.0
1   90.0    45.0   45.0      60   45.0
2    NaN    56.0   80.0      60   56.0
3   95.0     NaN   98.0      70    NaN


In [25]:
# Fill with column means only for the columns 'Third' through 'Fifth'
# (label slice, inclusive); 'First' and 'Second' keep their NaNs.
df.fillna(df.loc[:, 'Third':'Fifth'].mean())

Unnamed: 0,First,Second,Third,Fourth,Fifth
0,90.0,33.0,74.333333,50,33.0
1,90.0,45.0,45.0,60,45.0
2,,56.0,80.0,60,56.0
3,95.0,,98.0,70,44.666667


# Imputation using KNN

In [26]:
# Small 4x3 feature matrix with one missing value in the first row and one
# in the third row.
nan = np.nan
X = [[1, 2, nan], [3, 4, 3], [nan, 6, 5], [8, 8, 7]]

In [28]:
from sklearn.impute import KNNImputer

In [29]:
# Fill each missing entry from the 2 nearest rows and display the completed
# matrix.
imputer = KNNImputer(n_neighbors=2)
X_imputed = imputer.fit_transform(X)
X_imputed

array([[1. , 2. , 4. ],
       [3. , 4. , 3. ],
       [5.5, 6. , 5. ],
       [8. , 8. , 7. ]])

# Drop Demo

In [30]:
import pandas as pd
import numpy as np

# 1. Build a sample dataset with missing values (NaN): every column except
#    'Name' has gaps, and 'Score' is missing entirely.
data = dict(
    Name=['Alice', 'Bob', 'Charlie', 'David', 'Eve'],
    Age=[25, np.nan, 30, 35, np.nan],
    City=['New York', 'LA', np.nan, 'Chicago', 'Miami'],
    Salary=[50000, 60000, 70000, np.nan, 80000],
    Score=[np.nan] * 5,
)

df = pd.DataFrame(data=data)
print("--- Original DataFrame ---", df, sep="\n")

--- Original DataFrame ---
      Name   Age      City   Salary  Score
0    Alice  25.0  New York  50000.0    NaN
1      Bob   NaN        LA  60000.0    NaN
2  Charlie  30.0       NaN  70000.0    NaN
3    David  35.0   Chicago      NaN    NaN
4      Eve   NaN     Miami  80000.0    NaN


In [None]:
# Row drop: discard every row that contains at least one missing value.
df_rows_dropped = df.dropna(axis=0, how='any')

In [None]:
# Nothing is left: 'Score' is NaN in every row, so dropping any row that
# contains a missing value removes them all.
df_rows_dropped

Unnamed: 0,Name,Age,City,Salary,Score


In [31]:
# Column drop: remove every column that contains ANY missing value.
df_cols_dropped = df.dropna(axis='columns')
df_cols_dropped

Unnamed: 0,Name
0,Alice
1,Bob
2,Charlie
3,David
4,Eve


In [None]:
# Look for NaNs only in 'Age' and 'City' and drop the rows that have one
# there; gaps in the other columns are tolerated.
df_subset_drop = df.dropna(axis=0, how='any', subset=['Age', 'City'])
df_subset_drop

Unnamed: 0,Name,Age,City,Salary,Score
0,Alice,25.0,New York,50000.0,
3,David,35.0,Chicago,,


In [None]:
# Keep a column only if at least 40% of its values are present, i.e. drop
# columns with more than 60% missing data. `thresh` is documented as an
# integer count of required non-NA values, so the fractional count is rounded
# up; this gives exactly the same cut-off as comparing counts to the float.
MIN_PRESENT_FRACTION = 0.4
limit = int(np.ceil(len(df) * MIN_PRESENT_FRACTION))
df_thresh = df.dropna(axis=1, thresh=limit)

print("\n--- After Dropping columns with > 60% Missing Data ---")
print(df_thresh)


--- After Dropping columns with > 60% Missing Data ---
      Name   Age      City   Salary
0    Alice  25.0  New York  50000.0
1      Bob   NaN        LA  60000.0
2  Charlie  30.0       NaN  70000.0
3    David  35.0   Chicago      NaN
4      Eve   NaN     Miami  80000.0


In [None]:
# Exam scores with exactly one missing mark per subject.
data = dict(
    Math=[90, 85, 70, np.nan, 95],
    Science=[88, np.nan, 82, 98, 92],
    English=[95, 80, np.nan, 85, 88],
)

df = pd.DataFrame(data=data)

print("--- Original DataFrame ---", df, sep="\n")
# Blank line, a 40-character rule, blank line.
print("", "=" * 40, "", sep="\n")

--- Original DataFrame ---
   Math  Science  English
0  90.0     88.0     95.0
1  85.0      NaN     80.0
2  70.0     82.0      NaN
3   NaN     98.0     85.0
4  95.0     92.0     88.0




In [None]:
# Pairwise deletion: the correlation matrix is computed on the frame that
# still contains NaNs, rather than on a frame with incomplete rows removed.
pairwise_corr = df.corr(method='pearson')
print("--- Correlation Matrix (Pairwise Deletion) ---", pairwise_corr, sep="\n")

--- Correlation Matrix (Pairwise Deletion) ---
             Math   Science   English
Math     1.000000  0.976221  0.532939
Science  0.976221  1.000000 -0.942112
English  0.532939 -0.942112  1.000000


In [None]:
listwise_df = df.dropna()
listwise_corr = listwise_df.corr()