# Global Constant

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Toy table for the missing-value demos: 'Age' and 'City' each contain two NaNs,
# while 'Name' is fully populated.
data = dict(
    Name=['Ali', 'Sara', 'John', 'Ayesha', 'Tom'],
    Age=[25, np.nan, 30, np.nan, 40],
    City=['Lahore', 'Karachi', np.nan, 'Islamabad', np.nan],
)

df = pd.DataFrame(data)
print("Original Dataset:\n\n", df)

Original Dataset:

      Name   Age       City
0     Ali  25.0     Lahore
1    Sara   NaN    Karachi
2    John  30.0        NaN
3  Ayesha   NaN  Islamabad
4     Tom  40.0        NaN


In [None]:
# isna() (an alias of isnull()) yields a DataFrame of booleans with the same shape
# as df: True marks a missing (NaN) cell, False a populated one.
null_mask = df.isna()
print("\n\nCell Wise :\n\n", null_mask)



Cell Wise :

     Name    Age   City
0  False  False  False
1  False   True  False
2  False  False   True
3  False   True  False
4  False  False   True


In [None]:
# True counts as 1 when summed, so adding the mask down each column (axis=0)
# gives the number of missing cells per column.
nulls_per_column = df.isna().sum(axis=0)
print("\n\nColumn Wise null values :\n\n", nulls_per_column)



Column Wise null values :

 Name    0
Age     2
City    2
dtype: int64


In [None]:

# Summing twice collapses the mask completely: the first .sum() gives one count per
# column, the second adds those counts into a single total for the whole dataset.
nulls_per_column = df.isna().sum()
total_nulls = nulls_per_column.sum()
print("\n\nTotal null values in the data set :\n\n", total_nulls)



Total null values in the data set :

 4


In [None]:
# Summing across the columns instead of down them counts the missing cells in each row.
nulls_per_row = df.isna().sum(axis="columns")
print("\n\nNumber of missing values in each row : \n\n", nulls_per_row)




Number of missing values in each row : 

 0    0
1    1
2    1
3    1
4    1
dtype: int64


In [None]:
# any(axis="columns") is True for a row as soon as one of its cells is missing;
# using that boolean Series as a row selector keeps only the incomplete rows.
has_missing = df.isna().any(axis="columns")
rows_with_nulls = df.loc[has_missing]
print("\n\nMissing values exists in a row or not : \n\n", rows_with_nulls)



Missing values exists in a row or not : 

      Name   Age       City
1    Sara   NaN    Karachi
2    John  30.0        NaN
3  Ayesha   NaN  Islamabad
4     Tom  40.0        NaN


In [None]:

# Missing count per column divided by the number of rows, scaled to a percentage.
row_count = len(df)
missing_pct = df.isna().sum().div(row_count).mul(100)
print("\n\n percentage of missing value per column : \n\n", missing_pct)




 percentage of missing value per column : 

 Name     0.0
Age     40.0
City    40.0
dtype: float64


# Fill missing values with a global constant

In [None]:
# A single sentinel value replaces every NaN, whichever column it sits in
# (so the numeric 'Age' column receives the string as well).
# fillna() returns a new frame; df itself is left unchanged.
global_constant = "Unknown"
df_filled = df.fillna(value=global_constant)

print("\nDataset After Filling Missing Values with Global Constant:\n", df_filled)



Dataset After Filling Missing Values with Global Constant:
      Name      Age       City
0     Ali     25.0     Lahore
1    Sara  Unknown    Karachi
2    John     30.0    Unknown
3  Ayesha  Unknown  Islamabad
4     Tom     40.0    Unknown


# Filling with Mean, Median, and Mode

In [None]:
# Sample dataset with missing values: the numeric columns 'Age' and 'Salary'
# and the categorical column 'City' all contain NaNs.
data = dict(
    Name=['Ali', 'Sara', 'John', 'Ayesha', 'Tom'],
    Age=[25, np.nan, 30, np.nan, 40],
    Salary=[50000, 60000, np.nan, 55000, np.nan],
    City=['Lahore', 'Karachi', np.nan, 'Karachi', 'Lahore'],
)

# Alternative dataset with repeated values; uncomment to experiment with it instead.
# data = {
#     'Name': ['Ali', 'Ali', 'John', 'John', 'Tom'],
#     'Age': [25, 25, 30, np.nan, 30],
#     'Salary': [55000, 60000, 60000, 55000, np.nan],
#     'City': ['Lahore', 'Karachi', np.nan, 'Karachi', 'Lahore']
# }

df = pd.DataFrame(data)
print("Original Dataset:\n", df)

Original Dataset:
      Name   Age   Salary     City
0     Ali  25.0  50000.0   Lahore
1    Sara   NaN  60000.0  Karachi
2    John  30.0      NaN      NaN
3  Ayesha   NaN  55000.0  Karachi
4     Tom  40.0      NaN   Lahore


In [None]:
# -------------------------
# 1. Fill with Mean (for numeric columns)
# -------------------------
# fillna() returns a NEW object by default, so its result must be stored;
# calling it and discarding the return value leaves every NaN in place.
# Passing a {column: value} mapping to DataFrame.fillna() fills each numeric
# column with its own mean (NaNs are skipped when the mean is computed) and
# returns a new frame, so `df` itself stays untouched.
df_mean = df.fillna({
    'Age': df['Age'].mean(),
    'Salary': df['Salary'].mean(),
})
print("\nAfter Filling with Mean:\n", df_mean)


After Filling with Mean:
      Name   Age   Salary       City
0     Ali  25.0  50000.0     Lahore
1    Sara   NaN  60000.0    Karachi
2    John  30.0      NaN        NaN
3  Ayesha   NaN  55000.0  Islamabad
4     Tom  40.0      NaN     Lahore


In [None]:
# -------------------------
# 2. Fill with Median (for numeric columns)
# -------------------------
# fillna() returns a NEW object by default, so its result must be stored;
# calling it and discarding the return value leaves every NaN in place.
# The {column: value} mapping fills each numeric column with its own median
# and returns a new frame, so `df` itself stays untouched.
df_median = df.fillna({
    'Age': df['Age'].median(),
    'Salary': df['Salary'].median(),
})
print("\nAfter Filling with Median:\n", df_median)



After Filling with Median:
      Name   Age   Salary       City
0     Ali  25.0  50000.0     Lahore
1    Sara   NaN  60000.0    Karachi
2    John  30.0      NaN        NaN
3  Ayesha   NaN  55000.0  Islamabad
4     Tom  40.0      NaN     Lahore


In [None]:
# Work on an independent (deep) copy so `df` is not affected, then show the column labels.
df_mode = df.copy(deep=True)
df_mode.columns

Index(['Name', 'Age', 'Salary', 'City'], dtype='object')

In [None]:
# mode() returns every value tied for "most frequent", so the result is a Series
# that can hold more than one entry.
city_modes = df["City"].mode()
print(city_modes)

0    Karachi
1     Lahore
Name: City, dtype: object


In [None]:



# -------------------------
# 3. Fill with Mode (for both numeric & categorical columns)
# -------------------------

# .mode(): calculates the mode, i.e. the most common value(s) in that column.
# It returns a Series and can contain several values if there is a tie.

# .iloc[0]: selects the first mode (in case there is more than one). Asking for the
# second entry instead would fail for any column that has a single, unique mode.

df_mode = df.copy()
for column in df_mode.columns:
    column_modes = df_mode[column].mode()
    # Nothing to do when the column is complete, and nothing to fill with when the
    # column has no non-null value at all (mode() is empty in that case).
    if df_mode[column].isnull().any() and not column_modes.empty:
        # Assign the result back: chained `df[col].fillna(..., inplace=True)` acts on
        # an intermediate object and is announced to stop working in pandas 3.0.
        df_mode[column] = df_mode[column].fillna(column_modes.iloc[0])

# -------------------------
# Show results
# -------------------------


print("\nAfter Filling with Mode:\n", df_mode)



After Filling with Mode:
      Name   Age   Salary     City
0     Ali  25.0  50000.0   Lahore
1    Sara  30.0  60000.0  Karachi
2    John  30.0  55000.0   Lahore
3  Ayesha  30.0  55000.0  Karachi
4     Tom  40.0  55000.0   Lahore


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_mode[column].fillna(df_mode[column].mode()[1], inplace=True)


# Filling with Mean, Median, and Mode by Class

In [None]:
# How a group-wise fill such as
# df_mean.groupby('Gender')['Age'].transform(lambda x: x.fillna(x.mean())) works:

# 1. df_mean.groupby('Gender') groups the rows of the DataFrame by the 'Gender' column. For example,
# with two genders, 'Male' and 'Female', this creates two groups, one for each gender.

# 2. ['Age'] then selects only the 'Age' column within each group, so the ages of males are
# handled separately from the ages of females.

# 3. .transform(func) applies a function to each group and returns a Series that has the same shape
# (same number of rows) as the original column. Here the function is a lambda (an inline function).

# 4. lambda x: x.fillna(x.mean())
# x represents the 'Age' values within one group (e.g., all ages of males).
# x.mean() calculates the mean age of that group.
# x.fillna(x.mean()) fills all missing (NaN) values in that group with the group's mean age.

# 5. In simple words:
# for each gender, replace missing ages with the average age of that gender.

In [None]:
# Sample dataset for the class-wise fills: 'Gender' is the grouping column,
# 'Age' has one NaN and 'Salary' has two.
data = dict(
    Name=['Ali', 'Sara', 'John', 'Ayesha', 'Tom', 'Sophia'],
    Gender=['Male', 'Female', 'Male', 'Female', 'Male', 'Female'],
    Age=[25, np.nan, 30, 55, 40, 35],
    Salary=[50000, 60000, np.nan, 55000, np.nan, 65000],
)

df = pd.DataFrame(data)
print("Original Dataset:\n", df)


Original Dataset:
      Name  Gender   Age   Salary
0     Ali    Male  25.0  50000.0
1    Sara  Female   NaN  60000.0
2    John    Male  30.0      NaN
3  Ayesha  Female  55.0  55000.0
4     Tom    Male  40.0      NaN
5  Sophia  Female  35.0  65000.0


In [None]:


# -------------------------
# Fill missing values by class (Gender)
# -------------------------
# 1. Fill with Mean by class
# For each numeric column, every gender group has its NaNs replaced by that
# group's own mean; transform() hands back a Series aligned with the frame's rows.
df_mean = df.copy()
for numeric_col in ('Age', 'Salary'):
    per_gender = df_mean.groupby('Gender')[numeric_col]
    df_mean[numeric_col] = per_gender.transform(
        lambda values: values.fillna(values.mean())
    )
print("\nAfter Filling with Mean by Class:\n", df_mean)




After Filling with Mean by Class:
      Name  Gender   Age   Salary
0     Ali    Male  25.0  50000.0
1    Sara  Female  45.0  60000.0
2    John    Male  30.0  50000.0
3  Ayesha  Female  55.0  55000.0
4     Tom    Male  40.0  50000.0
5  Sophia  Female  35.0  65000.0


In [None]:




def fill_with_group_mode(group_values):
    """Fill the NaNs of one group with that group's first mode.

    `group_values` is the Series of one column restricted to a single group.
    mode() is empty when the group holds no non-null value at all; indexing its
    first element would then raise, so such a group is returned unchanged.
    """
    group_modes = group_values.mode()
    if group_modes.empty:
        return group_values
    return group_values.fillna(group_modes.iloc[0])


# 2. Fill with Median by class
# Each gender group has its NaNs replaced by that group's own median.
df_median = df.copy()
df_median['Age'] = df_median.groupby('Gender')['Age'].transform(lambda x: x.fillna(x.median()))
df_median['Salary'] = df_median.groupby('Gender')['Salary'].transform(lambda x: x.fillna(x.median()))

# 3. Fill with Mode by class
# Each gender group has its NaNs replaced by that group's first mode.
df_mode = df.copy()
df_mode['Age'] = df_mode.groupby('Gender')['Age'].transform(fill_with_group_mode)
df_mode['Salary'] = df_mode.groupby('Gender')['Salary'].transform(fill_with_group_mode)

# -------------------------
# Show results
# -------------------------

print("\nAfter Filling with Median by Class:\n", df_median)
print("\nAfter Filling with Mode by Class:\n", df_mode)


Original Dataset:
      Name  Gender   Age   Salary
0     Ali    Male  25.0  50000.0
1    Sara  Female   NaN  60000.0
2    John    Male  30.0      NaN
3  Ayesha  Female   NaN  55000.0
4     Tom    Male  40.0      NaN
5  Sophia  Female  35.0  65000.0

After Filling with Mean by Class:
      Name  Gender   Age   Salary
0     Ali    Male  25.0  50000.0
1    Sara  Female  35.0  60000.0
2    John    Male  30.0  50000.0
3  Ayesha  Female  35.0  55000.0
4     Tom    Male  40.0  50000.0
5  Sophia  Female  35.0  65000.0

After Filling with Median by Class:
      Name  Gender   Age   Salary
0     Ali    Male  25.0  50000.0
1    Sara  Female  35.0  60000.0
2    John    Male  30.0  50000.0
3  Ayesha  Female  35.0  55000.0
4     Tom    Male  40.0  50000.0
5  Sophia  Female  35.0  65000.0

After Filling with Mode by Class:
      Name  Gender   Age   Salary
0     Ali    Male  25.0  50000.0
1    Sara  Female  35.0  60000.0
2    John    Male  30.0  50000.0
3  Ayesha  Female  35.0  55000.0
4     Tom    

# Filling Missing Values Using Machine Learning

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
# Sample dataset for the regression-based fill: only 'Age' has missing values,
# while 'Experience' and 'Salary' are fully populated.
data = dict(
    Age=[25, 30, 35, 40, np.nan, 50, 55, np.nan],
    Experience=[1, 3, 5, 7, 9, 11, 13, 15],
    Salary=[20000, 30000, 40000, 50000, 60000, 70000, 80000, 90000],
)

df = pd.DataFrame(data)
print("Original Dataset:\n", df)

Original Dataset:
     Age  Experience  Salary
0  25.0           1   20000
1  30.0           3   30000
2  35.0           5   40000
3  40.0           7   50000
4   NaN           9   60000
5  50.0          11   70000
6  55.0          13   80000
7   NaN          15   90000


In [None]:
# Boolean Series: True where 'Age' is present, False where it is missing
# (notna() is an alias of notnull()).
df['Age'].notna()

Unnamed: 0,Age
0,True
1,True
2,True
3,True
4,False
5,True
6,True
7,False


In [None]:
# -------------------------
# Fill missing 'Age' using Linear Regression
# -------------------------

# df['Age'].isnull() returns a Boolean Series: True where 'Age' is missing (NaN),
# False where it is present. It is computed once and reused for splitting and filling.
age_missing = df['Age'].isnull()

# Split dataset into rows with and without missing Age
train_data = df[~age_missing]
test_data = df[age_missing]

# Define predictors (X) and target (y)
X_train = train_data[['Experience', 'Salary']]
y_train = train_data['Age']

X_test = test_data[['Experience', 'Salary']]

# Train Linear Regression model on the rows whose Age is known
model = LinearRegression()
model.fit(X_train, y_train)

# Predict and fill the missing values.
# This cell writes into `df`, so after one run no Age is missing any more. The guard
# keeps a re-run from calling predict() on zero rows, which scikit-learn rejects.
if len(X_test) > 0:
    predicted_values = model.predict(X_test)

    # df.loc[rows, column] is a label-based indexer: the part before the comma selects
    # the rows (here: those where Age is missing), the part after it selects the column.
    df.loc[age_missing, 'Age'] = predicted_values

print("\nDataset After Filling Missing Values with Linear Regression:\n", df)



Dataset After Filling Missing Values with Linear Regression:
     Age  Experience  Salary
0  25.0           1   20000
1  30.0           3   30000
2  35.0           5   40000
3  40.0           7   50000
4  45.0           9   60000
5  50.0          11   70000
6  55.0          13   80000
7  60.0          15   90000


# Filling Missing Values Using Interpolation

In [None]:
import pandas as pd
import numpy as np

# Sample dataset with missing values
data = dict(
    Day=[1, 2, 3, 4, 5, 6, 7, 8],
    Temperature=[30, np.nan, 32, np.nan, np.nan, 35, 36, np.nan],
)

df = pd.DataFrame(data)
print("Original Dataset:\n", df)

# -------------------------
# Fill missing values using Interpolation
# -------------------------
# Linear interpolation (default)
# Formula for a point x lying between two known points (x1, y1) and (x2, y2):
#     y = y1 + ((x - x1) / (x2 - x1)) * (y2 - y1)
# A single missing value is computed from the known values just above and below it,
# e.g. Day 2 lies between (1, 30) and (3, 32):
#     y = 30 + ((2 - 1) / (3 - 1)) * (32 - 30) = 31
# A run of two or more consecutive missing values is anchored on the nearest known
# value on each side of the gap, e.g. Days 4 and 5 lie between (3, 32) and (6, 35):
#     Day 4: y = 32 + ((4 - 3) / (6 - 3)) * (35 - 32) = 33
#     Day 5: y = 32 + ((5 - 3) / (6 - 3)) * (35 - 32) = 34
# A missing value at the very end (Day 8) has no later neighbour, so the linear
# method fills it with the last known value above it (36).
df_linear = df.assign(
    Temperature=df['Temperature'].interpolate(method='linear')
)

# Polynomial interpolation (order=2, quadratic)
# In the output the trailing missing value (Day 8) stays NaN with this method.
df_poly = df.assign(
    Temperature=df['Temperature'].interpolate(method='polynomial', order=2)
)

# Time interpolation (if 'Day' were datetime index)
# df['Temperature'] = df['Temperature'].interpolate(method='time')

# -------------------------
# Show results
# -------------------------
print("\nAfter Linear Interpolation:\n", df_linear)
print("\nAfter Polynomial (order=2) Interpolation:\n", df_poly)


Original Dataset:
    Day  Temperature
0    1         30.0
1    2          NaN
2    3         32.0
3    4          NaN
4    5          NaN
5    6         35.0
6    7         36.0
7    8          NaN

After Linear Interpolation:
    Day  Temperature
0    1         30.0
1    2         31.0
2    3         32.0
3    4         33.0
4    5         34.0
5    6         35.0
6    7         36.0
7    8         36.0

After Polynomial (order=2) Interpolation:
    Day  Temperature
0    1         30.0
1    2         31.0
2    3         32.0
3    4         33.0
4    5         34.0
5    6         35.0
6    7         36.0
7    8          NaN


In [None]:
# A gap of three consecutive NaNs between two known temperatures (10 and 40).
data = dict(
    Day=[1, 2, 3, 4, 5],
    Temperature=[10, np.nan, np.nan, np.nan, 40],
)

df = pd.DataFrame(data)
print("Original Dataset:\n", df)

# Linear interpolation spreads the gap evenly between its two known endpoints;
# assign() returns a new frame, so df keeps its NaNs.
df_linear = df.assign(
    Temperature=df['Temperature'].interpolate(method='linear')
)
print(df_linear)

Original Dataset:
    Day  Temperature
0    1         10.0
1    2          NaN
2    3          NaN
3    4          NaN
4    5         40.0
   Day  Temperature
0    1         10.0
1    2         17.5
2    3         25.0
3    4         32.5
4    5         40.0


# Forward Fill (ffill) and Backward Fill (bfill)

In [None]:
# Temperature series with single, consecutive and trailing missing values.
data = {
    'Day': [1, 2, 3, 4, 5, 6, 7, 8],
    'Temperature': [30, np.nan, 32, np.nan, np.nan, 35, 36, np.nan]
}

df = pd.DataFrame(data)
print("Original Dataset:\n\n", df)

# Forward fill: copies the previous non-null value downward.
# DataFrame.ffill() replaces the deprecated fillna(method='ffill') spelling and
# returns a new frame, so no copy()/inplace step is needed and df is left unchanged.
df_ffill = df.ffill()
print("\n\nDataset after forward fill : \n\n",df_ffill)



# Backward fill: copies the next non-null value upward.
# A trailing NaN (Day 8 here) has no later value to copy, so it stays NaN.
df_bfill = df.bfill()
print("\n\nDataset after backward fill : \n\n",df_bfill)

Original Dataset:

    Day  Temperature
0    1         30.0
1    2          NaN
2    3         32.0
3    4          NaN
4    5          NaN
5    6         35.0
6    7         36.0
7    8          NaN


Dataset after forward fill : 

    Day  Temperature
0    1         30.0
1    2         30.0
2    3         32.0
3    4         32.0
4    5         32.0
5    6         35.0
6    7         36.0
7    8         36.0


Dataset after backward fill : 

    Day  Temperature
0    1         30.0
1    2         32.0
2    3         32.0
3    4         35.0
4    5         35.0
5    6         35.0
6    7         36.0
7    8          NaN


# Dropping Missing Values (not filling, but important alternative)

In [None]:
# Same temperature series as before: 'Day' is complete, 'Temperature' has four NaNs.
data = dict(
    Day=[1, 2, 3, 4, 5, 6, 7, 8],
    Temperature=[30, np.nan, 32, np.nan, np.nan, 35, 36, np.nan],
)


df = pd.DataFrame(data)
print("Original Dataset:\n\n", df)

# Drop rows with any NaN (dropna() returns a new frame; df is left as it was)
df_drop_rows = df.dropna(axis="index")
print("\n\nDataset after drop rows : \n\n",df_drop_rows)

# Drop columns with any NaN
df_drop_cols = df.dropna(axis="columns")
print("\n\nDataset after drop cols : \n\n",df_drop_cols)


Original Dataset:

    Day  Temperature
0    1         30.0
1    2          NaN
2    3         32.0
3    4          NaN
4    5          NaN
5    6         35.0
6    7         36.0
7    8          NaN


Dataset after drop rows : 

    Day  Temperature
0    1         30.0
2    3         32.0
5    6         35.0
6    7         36.0


Dataset after drop cols : 

    Day
0    1
1    2
2    3
3    4
4    5
5    6
6    7
7    8
