## Handling Missing Values

In [2]:
# Missing values: build a small demo frame in which every column has at least one gap
import pandas as pd

# None entries are stored by pandas as NaN, which turns the columns into floats
data = {
    'A': [1, 2, None, 4],
    'B': [None, 2, 3, 4],
    'C': [1, None, None, 4],
}
df = pd.DataFrame(data)

# Show the starting point that the following cells clean up
print("Original DataFrame:", df, sep="\n")

Original DataFrame:
     A    B    C
0  1.0  NaN  1.0
1  2.0  2.0  NaN
2  NaN  3.0  NaN
3  4.0  4.0  4.0


In [4]:
# Row-wise clean-up: discard every row that contains at least one NaN
df_cleaned_rows = df.dropna(axis=0, how='any')
print("\nDataFrame after dropping rows with any missing values:", df_cleaned_rows, sep="\n")


DataFrame after dropping rows with any missing values:
     A    B    C
3  4.0  4.0  4.0


In [6]:
# Column-wise clean-up: discard every column that contains at least one NaN
df_cleaned_columns = df.dropna(axis='columns', how='any')
print("\nDataFrame after dropping columns with any missing values:", df_cleaned_columns, sep="\n")


DataFrame after dropping columns with any missing values:
Empty DataFrame
Columns: []
Index: [0, 1, 2, 3]


In [8]:
# Imputation with a constant: every NaN is replaced by 0
df_filled = df.fillna(value=0)
print("\nDataFrame after filling missing values with 0:", df_filled, sep="\n")



DataFrame after filling missing values with 0:
     A    B    C
0  1.0  0.0  1.0
1  2.0  2.0  0.0
2  0.0  3.0  0.0
3  4.0  4.0  4.0


In [10]:
# Imputation with the column mean: each NaN takes the mean of its own column
# (df.mean() skips NaN values, and fillna aligns the resulting Series on column labels)
column_means = df.mean()
df_filled_mean = df.fillna(value=column_means)
print("\nDataFrame after filling missing values with the mean of the column:", df_filled_mean, sep="\n")


DataFrame after filling missing values with the mean of the column:
          A    B    C
0  1.000000  3.0  1.0
1  2.000000  2.0  2.5
2  2.333333  3.0  2.5
3  4.000000  4.0  4.0


In [12]:
# Duplicate rows: demo frame in which rows 1 and 2 are identical
import pandas as pd

data = {
    'A': [1, 2, 2, 4],
    'B': [1, 2, 2, 4],
    'C': [1, 2, 2, 4],
}
df = pd.DataFrame(data)

In [14]:
# Display the frame before any de-duplication
print("Original DataFrame:", df, sep="\n")

Original DataFrame:
   A  B  C
0  1  1  1
1  2  2  2
2  2  2  2
3  4  4  4


In [16]:
# Flag rows that repeat an earlier row (the first occurrence is not flagged), then select them
is_repeat = df.duplicated(keep='first')
duplicate_rows = df[is_repeat]
print("\nDuplicate Rows:", duplicate_rows, sep="\n")


Duplicate Rows:
   A  B  C
2  2  2  2


In [18]:
# Keep the first occurrence of each distinct row and drop the later repeats
df_no_duplicates = df.drop_duplicates(keep='first')
print("\nDataFrame after removing duplicate rows:", df_no_duplicates, sep="\n")


DataFrame after removing duplicate rows:
   A  B  C
0  1  1  1
1  2  2  2
3  4  4  4


## Handling Outliers

### Identifying Outliers

Before handling outliers, you need to identify them. Common methods include:

Z-Score: Data points with a Z-score greater than 3 or less than -3 are often considered outliers.

IQR (Interquartile Range): Data points that fall below $Q_1 - 1.5 \times IQR$ or above $Q_3 + 1.5 \times IQR$, where $IQR = Q_3 - Q_1$, are considered outliers.

Visualization: Use box plots or scatter plots to spot outliers by eye.

In [24]:
import pandas as pd
import numpy as np
from scipy import stats


In [26]:
# Outlier demo frame: column A ends with an extreme value (1000), column B is well behaved
data = {
    'A': [1, 2, 3, 1000],
    'B': [2, 3, 4, 5],
}
df = pd.DataFrame(data)

# Show the frame before any outlier handling
print("Original DataFrame:", df, sep="\n")

Original DataFrame:
      A  B
0     1  2
1     2  3
2     3  4
3  1000  5


In [28]:
# Detecting outliers using the Z-score.
# Standardise each column first, then take magnitudes with np.abs() so that values
# far above and far below the mean are treated alike.
standardized = stats.zscore(df)
z_scores = np.abs(standardized)
print("\nZ-scores for each data point:", z_scores, sep="\n")


Z-scores for each data point:
          A         B
0  0.579664  1.341641
1  0.577349  0.447214
2  0.575035  0.447214
3  1.732048  1.341641


In [30]:
# Removing outliers: keep only the rows whose absolute Z-score is below 3 in every column.
# NOTE: scipy's zscore uses the population standard deviation by default, which caps |Z|
# at sqrt(n - 1) for n rows. For the 4-row frame above that is about 1.73, so no row can
# reach the threshold of 3 and nothing is dropped here.
rows_within_threshold = (z_scores < 3).all(axis=1)
df_no_outliers = df[rows_within_threshold]
print("\nDataFrame after removing outliers:", df_no_outliers, sep="\n")


DataFrame after removing outliers:
      A  B
0     1  2
1     2  3
2     3  4
3  1000  5


Interpreting Z-scores:
Z = 0: The data point is exactly at the mean.

Z > 0: The data point is above the mean.

Z < 0: The data point is below the mean.

Z = 1: The data point is one standard deviation above the mean.

Z = -1: The data point is one standard deviation below the mean.

Z > 2 or Z < -2: The data point is relatively far from the mean, and could be considered unusual.

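Z > 3 or Z < -3: The data point is very far from the mean, and is typically considered an outlier. Note that this threshold only works for reasonably large samples: with $n$ observations and the population standard deviation (SciPy's default), $|Z|$ can never exceed $\sqrt{n-1}$, which is why the value 1000 in the 4-row example above only reaches $|Z| \approx 1.73$ and is not removed.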

In [32]:
# IQR method: demo frame whose last row is extreme in both columns
import pandas as pd

data = {
    'A': [10, 12, 14, 16, 18, 20, 1000],
    'B': [5, 6, 7, 8, 9, 10, 500],
}
df = pd.DataFrame(data)

In [34]:
# Per-column first and third quartiles, and the interquartile range between them
Q1, Q3 = (df.quantile(q) for q in (0.25, 0.75))
IQR = Q3 - Q1

In [36]:
# Filtering out outliers: flag every cell outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR],
# then keep only the rows in which no column was flagged
lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR
outside_fences = (df < lower_fence) | (df > upper_fence)
df_no_outliers = df[~outside_fences.any(axis=1)]



In [38]:
# Before: the full frame, including the extreme last row
print("Original DataFrame:", df, sep="\n")

# After: the rows that survived the IQR filter
print("\nDataFrame after removing outliers:", df_no_outliers, sep="\n")

Original DataFrame:
      A    B
0    10    5
1    12    6
2    14    7
3    16    8
4    18    9
5    20   10
6  1000  500

DataFrame after removing outliers:
    A   B
0  10   5
1  12   6
2  14   7
3  16   8
4  18   9
5  20  10
