## How to deal with missing data in Python (pandas)

### 1. Detect

In [None]:
# Both calls return a boolean DataFrame of the same shape as df:
# True where a value is missing, False otherwise.
df.isna()
df.isnull()  # Alias of isna(); the two are interchangeable


### 2. Remove (Option 1)

In [None]:
# Each call returns a new DataFrame; df itself is not modified.
df.dropna()  # Remove rows with any missing values
df.dropna(axis=1)  # Remove columns with any missing values
df.dropna(how='all')  # Remove rows where all elements are missing
df.dropna(thresh=2)  # Keep only rows with at least 2 non-NA values (drop rows with fewer)


### 3. Fill (Option 2)

In [None]:
# The DataFrame-level calls below return a new object; assign the result to keep it.
df.fillna(0)  # Fill all NA values with 0
# fillna(method=...) is deprecated since pandas 2.1; use the dedicated methods.
df.ffill()  # Forward fill: propagate the last valid value downwards
df.bfill()  # Backward fill: propagate the next valid value upwards

# Column-wise fills (alternatives -- pick one). Assign the result back instead
# of calling fillna(..., inplace=True) on df['column']: that chained in-place
# call acts on an intermediate object, raises a FutureWarning in pandas 2.2
# and does not update df under copy-on-write (pandas 3.0).
df['column'] = df['column'].fillna(df['column'].mean())  # Fill with mean
df['column'] = df['column'].fillna(df['column'].median())  # Fill with median
df['column'] = df['column'].fillna(df['column'].mode()[0])  # Fill with mode


### 4. Replace (Option 3)

In [None]:
# Returns a new DataFrame; df itself is not modified.
df.replace(to_replace=np.nan, value=0)  # Replace NaN with 0


### 5. Interpolate (Option 4)

In [None]:
df.interpolate()  # Default is linear interpolation
# method='polynomial' is delegated to SciPy, so scipy must be installed,
# and `order` must be given explicitly.
df.interpolate(method='polynomial', order=2)  # Polynomial interpolation

In [1]:
import pandas as pd
import numpy as np

# Sample data: three columns of four values, each with exactly one gap
data = dict(
    A=[1, 2, np.nan, 4],
    B=[np.nan, 2, 3, 4],
    C=[1, 2, 3, np.nan],
)
df = pd.DataFrame(data=data)

# Show a boolean mask that is True wherever a value is missing
print(df.isna())


       A      B      C
0  False   True  False
1  False  False  False
2   True  False  False
3  False  False   True


In [2]:
# Remove rows with any missing values
# (in the sample frame only row 1 is complete, so it is the only row kept)
df_dropped = df.dropna()
print(df_dropped)

     A    B    C
1  2.0  2.0  2.0


In [3]:
# Fill missing values with 0
# fillna returns a new DataFrame; df itself keeps its NaNs
df_filled = df.fillna(0)
print(df_filled)

     A    B    C
0  1.0  0.0  1.0
1  2.0  2.0  2.0
2  0.0  3.0  3.0
3  4.0  4.0  0.0


In [4]:
# Forward fill missing values: each NaN takes the last valid value above it.
# A leading NaN (B[0] in the sample frame) has nothing above it and stays NaN.
# df.ffill() replaces fillna(method='ffill'), which is deprecated since pandas 2.1.
df_ffill = df.ffill()
print(df_ffill)

     A    B    C
0  1.0  NaN  1.0
1  2.0  2.0  2.0
2  2.0  3.0  3.0
3  4.0  4.0  3.0


In [5]:
# Interpolate missing values (linear by default)
# Interior gaps are filled linearly; a leading NaN stays NaN, while a trailing
# NaN is filled with the last valid value (see columns B and C in the output)
df_interpolated = df.interpolate()
print(df_interpolated)

     A    B    C
0  1.0  NaN  1.0
1  2.0  2.0  2.0
2  3.0  3.0  3.0
3  4.0  4.0  3.0


In [6]:
# Fill missing values in column 'A' with the mean
# Assign the result back rather than calling fillna(..., inplace=True) on
# df['A']: that chained in-place call acts on an intermediate object, raises a
# FutureWarning in pandas 2.2 and no longer updates df under copy-on-write.
df['A'] = df['A'].fillna(df['A'].mean())
print(df)

          A    B    C
0  1.000000  NaN  1.0
1  2.000000  2.0  2.0
2  2.333333  3.0  3.0
3  4.000000  4.0  NaN
