In [1]:
import pandas as pd
from io import StringIO

# 1. Sample CSV data with missing values
csv_data = """
Name,Age,Salary
Alice,25,50000
Bob,,60000
Charlie,30,
David,22,45000
Eve,28,55000
"""

# 2. Load the data into a DataFrame
df = pd.read_csv(StringIO(csv_data))

# 3. Check for missing values (True/False mask)
missing_mask = df.isnull()

# 4. Summarize missing values count per column
missing_summary = df.isnull().sum()

print("Missing Values Mask:\n", missing_mask)
print("\nMissing Values Summary:\n", missing_summary)


Missing Values Mask:
     Name    Age  Salary
0  False  False   False
1  False   True   False
2  False  False    True
3  False  False   False
4  False  False   False

Missing Values Summary:
 Name      0
Age       1
Salary    1
dtype: int64


In [2]:
import pandas as pd
from io import StringIO

# Sample CSV data with missing values
csv_data = """
Name,Age,Salary
Alice,25,50000
Bob,,60000
Charlie,30,
David,22,45000
Eve,28,55000
"""

# Load the data into a DataFrame
df = pd.read_csv(StringIO(csv_data))

# Drop rows with any missing values
df_cleaned = df.dropna()

print(df_cleaned)


    Name   Age   Salary
0  Alice  25.0  50000.0
3  David  22.0  45000.0
4    Eve  28.0  55000.0


In [3]:
import pandas as pd
from io import StringIO

# Sample CSV data with missing values
csv_data = """
Name,Age,Salary
Alice,25,50000
Bob,,60000
Charlie,30,
David,22,45000
Eve,28,55000
"""

# Load the data into a DataFrame
df = pd.read_csv(StringIO(csv_data))

# Drop columns with any missing values
df_cleaned = df.dropna(axis=1)

print(df_cleaned)


      Name
0    Alice
1      Bob
2  Charlie
3    David
4      Eve


In [4]:
import pandas as pd
from io import StringIO

csv_data = """
Name,Age,Salary
Alice,25,50000
Bob,,60000
Charlie,30,
David,22,45000
Eve,28,55000
"""

df = pd.read_csv(StringIO(csv_data))

# Calculate mean of 'Age' column excluding missing values
age_mean = df['Age'].mean()

# Fill missing values in 'Age' with the mean
df['Age'] = df['Age'].fillna(age_mean)

print(df)


      Name    Age   Salary
0    Alice  25.00  50000.0
1      Bob  26.25  60000.0
2  Charlie  30.00      NaN
3    David  22.00  45000.0
4      Eve  28.00  55000.0


In [5]:
import pandas as pd
from io import StringIO

csv_data = """
Name,Department,Salary
Alice,HR,50000
Bob,,60000
Charlie,IT,55000
David,HR,45000
Eve,,52000
"""

df = pd.read_csv(StringIO(csv_data))

# Calculate mode of 'Department' column
dept_mode = df['Department'].mode()[0]

# Fill missing values in 'Department' with the mode
df['Department'] = df['Department'].fillna(dept_mode)

print(df)











      Name Department  Salary
0    Alice         HR   50000
1      Bob         HR   60000
2  Charlie         IT   55000
3    David         HR   45000
4      Eve         HR   52000


In [6]:
import pandas as pd
from io import StringIO

csv_data = """
Product,Price
A,10
B,15
C,200
D,
E,12
F,14
G,
"""

df = pd.read_csv(StringIO(csv_data))

# Calculate median of 'Price' column (ignoring NaN)
price_median = df['Price'].median()

# Fill missing values in 'Price' with median
df['Price'] = df['Price'].fillna(price_median)

print(df)


  Product  Price
0       A   10.0
1       B   15.0
2       C  200.0
3       D   14.0
4       E   12.0
5       F   14.0
6       G   14.0


In [7]:
import pandas as pd
from sklearn.impute import KNNImputer

# Sample data with missing values
data = {
    'Feature1': [1.0, 2.0, 3.0, None, 5.0],
    'Feature2': [5.0, None, 1.0, 2.0, 4.0],
    'Feature3': [None, 1.0, 2.0, 3.0, 4.0]
}
df = pd.DataFrame(data)

# Initialize KNNImputer with k=2 neighbors
imputer = KNNImputer(n_neighbors=2)

# Perform imputation
df_imputed = pd.DataFrame(imputer.fit_transform(df), columns=df.columns)

print(df_imputed)


   Feature1  Feature2  Feature3
0       1.0       5.0       2.5
1       2.0       3.0       1.0
2       3.0       1.0       2.0
3       4.0       2.0       3.0
4       5.0       4.0       4.0


In [8]:
import pandas as pd

# Sample categorical data with missing values
data = {
    'Category': ['apple', 'banana', None, 'banana', 'apple', None, 'orange', 'banana']
}
df = pd.DataFrame(data)

# Identify missing values
missing_mask = df['Category'].isnull()
print("Missing values detected:\n", missing_mask)

# Find the most frequent category (mode)
mode_value = df['Category'].mode()[0]

# Fill missing values with the most frequent category
df['Category_filled'] = df['Category'].fillna(mode_value)

print(df)


Missing values detected:
 0    False
1    False
2     True
3    False
4    False
5     True
6    False
7    False
Name: Category, dtype: bool
  Category Category_filled
0    apple           apple
1   banana          banana
2     None          banana
3   banana          banana
4    apple           apple
5     None          banana
6   orange          orange
7   banana          banana


In [9]:
import pandas as pd
from sklearn.linear_model import LinearRegression

# Sample data with missing values in 'target' column
data = {
    'feature1': [1, 2, 3, 4, 5, 6, 7],
    'feature2': [7, 6, 5, 4, 3, 2, 1],
    'target': [10, 15, None, 25, None, 35, 40]
}
df = pd.DataFrame(data)

# Split data into train (non-missing target) and predict (missing target)
train_df = df[df['target'].notna()]
predict_df = df[df['target'].isna()]

# Train regression model on non-missing rows
model = LinearRegression()
X_train = train_df[['feature1', 'feature2']]
y_train = train_df['target']
model.fit(X_train, y_train)

# Predict missing target values
X_predict = predict_df[['feature1', 'feature2']]
predicted_values = model.predict(X_predict)

# Impute missing values with predictions
df.loc[df['target'].isna(), 'target'] = predicted_values

print(df)


   feature1  feature2  target
0         1         7    10.0
1         2         6    15.0
2         3         5    20.0
3         4         4    25.0
4         5         3    30.0
5         6         2    35.0
6         7         1    40.0


In [10]:
import pandas as pd
import numpy as np

# Sample time series data with missing values
data = {
    'date': pd.date_range(start='2023-01-01', periods=7, freq='D'),
    'value': [10, np.nan, np.nan, 40, np.nan, 60, np.nan]
}
df = pd.DataFrame(data)

# Step 1: Sort by date (if not already sorted)
df = df.sort_values('date')

# Step 2: Forward fill missing values
df['value_ffill'] = df['value'].fillna(method='ffill')

# Step 3: Backward fill missing values
df['value_bfill'] = df['value'].fillna(method='bfill')

print(df)










        date  value  value_ffill  value_bfill
0 2023-01-01   10.0         10.0         10.0
1 2023-01-02    NaN         10.0         40.0
2 2023-01-03    NaN         10.0         40.0
3 2023-01-04   40.0         40.0         40.0
4 2023-01-05    NaN         40.0         60.0
5 2023-01-06   60.0         60.0         60.0
6 2023-01-07    NaN         60.0          NaN


  df['value_ffill'] = df['value'].fillna(method='ffill')
  df['value_bfill'] = df['value'].fillna(method='bfill')
