In [None]:
# Question: Introduction to Missing Data in a DataFrame
# Description: Load a simple CSV file into a DataFrame and identify missing values
import pandas as pd
import numpy as np
# Build a small demo table in which every column has exactly one missing entry.
data = {
    'Name': ['John', 'Anna', 'Peter', 'Linda', None],
    'Age': [28, 34, None, 41, 37],
    'City': ['New York', None, 'Boston', 'Chicago', 'Miami'],
    'Salary': [75000, 65000, 80000, None, 90000]
}
df = pd.DataFrame(data)

# Persist the table so the rest of the cell can demonstrate loading from CSV.
# Missing entries are written as empty fields and come back as NaN on read.
df.to_csv('sample_data.csv', index=False)
print("Sample CSV file created with the following data:")
print(df)
print("\n")

# read_csv already returns a DataFrame; no extra pd.DataFrame(...) wrapper is needed.
df = pd.read_csv('sample_data.csv')
print("Data loaded from CSV:")
print(df)
print("\n")

# Compute the cell-level mask and the per-row flag once and reuse them below
# instead of re-scanning the frame for every report.
missing_mask = df.isnull()
row_has_missing = missing_mask.any(axis=1)

print("Missing values (True indicates missing):")
print(missing_mask)
print("\n")
print("Count of missing values in each column:")
print(missing_mask.sum())
print("\n")
print("Total missing values in the DataFrame:", missing_mask.sum().sum())
print("\n")
# The mean of a boolean column is the fraction of True values.
print("Percentage of missing values in each column:")
print(missing_mask.mean() * 100)
print("\n")
print("Rows with at least one missing value:")
print(df[row_has_missing])
print("\n")
print("Rows with complete data:")
print(df[~row_has_missing])

In [None]:
# corrected Question: Dropping Rows with Missing Values
# Description: Practice the deletion method by removing rows with any missing values from a dataset
import pandas as pd
import numpy as np
# Small roster: Bob has no Age and Charlie has no Salary.
data = dict(
    Name=['Alice', 'Bob', 'Charlie', 'David'],
    Age=[25, np.nan, 30, 28],
    Salary=[50000, 60000, np.nan, 70000],
)
df = pd.DataFrame(data)
print("Original DataFrame:", df, sep="\n")

# Listwise deletion: a row is removed as soon as any of its columns is missing.
# The original frame is left untouched; the surviving rows keep their index labels.
df_cleaned = df.dropna(axis=0, how='any')
print("\nDataFrame After Dropping Rows with Missing Values:", df_cleaned, sep="\n")

In [None]:
# corrected Question: Dropping Columns with Missing Values
# Description: Practice deleting entire columns that contain missing values.
import pandas as pd
import numpy as np
# Roster where Age and Salary each have one gap and Department is entirely empty.
data = dict(
    Name=['Alice', 'Bob', 'Charlie', 'David'],
    Age=[25, np.nan, 30, 28],
    Salary=[50000, 60000, np.nan, 70000],
    Department=[None, None, None, None],
)
df = pd.DataFrame(data)
print("Original DataFrame:", df, sep="\n")

# Column-wise deletion: any column holding at least one missing value is removed,
# so only 'Name' survives for this data.
df_cleaned = df.dropna(axis='columns', how='any')
print("\nDataFrame After Dropping Columns with Missing Values:", df_cleaned, sep="\n")

In [None]:
# corrected Question: Mean Imputation for Numerical Data
# Description: Fill missing values in a numerical column with the mean of that column.
import pandas as pd
import numpy as np
# Four people, one of whom (Bob) has no recorded Age.
data = dict(
    Name=['Alice', 'Bob', 'Charlie', 'David'],
    Age=[25, np.nan, 30, 28],
)
df = pd.DataFrame(data)
print("Original DataFrame:", df, sep="\n")

# Series.mean() skips NaN by default, so the average is taken over the known ages only.
mean_age = df['Age'].mean()

# Replace the gap in 'Age' with that average; all other values are left as they were.
df = df.assign(Age=df['Age'].fillna(mean_age))
print("\nDataFrame After Mean Imputation:", df, sep="\n")

In [None]:
# corrected Question: Mode Imputation for Categorical Data
# Description: Fill missing values in a categorical column with the mode of that column.
import pandas as pd
import numpy as np
# Four employees; Bob's Department is unknown.
data = {
    'Name': ['Alice', 'Bob', 'Charlie', 'David'],
    'Department': ['HR', np.nan, 'IT', 'HR']
}
df = pd.DataFrame(data)
print("Original DataFrame:")
print(df)

# Series.mode() ignores NaN and returns every most-frequent value, sorted;
# taking element 0 picks a single category to impute with.
mode_value = df['Department'].mode()[0]
df['Department'] = df['Department'].fillna(mode_value)
print("\nDataFrame After Mode Imputation:")
# Fixed: the original line read `print(df)0`, a syntax error that stopped the
# whole cell from running.
print(df)

In [None]:
# corrected Question: Median Imputation for Skewed Data
# Description: Handle missing values in columns with a skewed distribution using the median.
import pandas as pd
import numpy as np
# Income contains two very large values and two gaps; Age has a single gap.
data = dict(
    Income=[50000, 60000, 200000, 250000, np.nan, 80000, np.nan],
    Age=[25, 30, 35, 40, 28, np.nan, 30],
)
df = pd.DataFrame(data)
print("Original DataFrame:", df, sep="\n")

# Medians are computed over the non-missing entries of each column.
income_median, age_median = df['Income'].median(), df['Age'].median()

# Fill each column with its own median in one call (dict maps column -> fill value).
df = df.fillna({'Income': income_median, 'Age': age_median})
print("\nDataFrame After Median Imputation:", df, sep="\n")

In [None]:
# corrected Question: KNN Imputation
# Description: Use K-Nearest Neighbors to impute missing values in a dataset.
import pandas as pd
import numpy as np
from sklearn.impute import KNNImputer
# Five records; Age is missing in one row and Salary in another, Experience is complete.
data = {
    'Age': [25, np.nan, 30, 35, 40],
    'Salary': [50000, 60000, np.nan, 80000, 85000],
    'Experience': [2, 5, 10, 7, 12]
}
df = pd.DataFrame(data)
print("Original DataFrame with Missing Values:")
print(df)

# Each missing entry is estimated from the 2 rows that are closest on the
# features both rows have observed.
# NOTE(review): features are used in their raw units, so Salary (tens of
# thousands) likely dominates the distance - consider scaling first.
imputer = KNNImputer(n_neighbors=2)

# fit_transform returns a bare NumPy array, so re-attach both the column names
# and the row index; without `index=` the result would silently fall back to a
# fresh 0..n-1 index and no longer align with `df` if its index is not the default.
df_imputed = pd.DataFrame(
    imputer.fit_transform(df),
    columns=df.columns,
    index=df.index,
)
print("\nDataFrame After KNN Imputation:")
print(df_imputed)

In [None]:
# corrected Question: Detecting and Handling Missing Categorical Data
# Description: Detect missing categorical data and handle it by filling with the next frequent category.
import pandas as pd
import numpy as np
# Ten observations; 'Category' is unknown for two of them.
data = dict(
    Category=['A', 'B', 'A', np.nan, 'B', 'C', 'A', np.nan, 'C', 'C'],
    Value=[10, 15, 10, 25, 30, 35, 40, 50, 60, 70],
)
df = pd.DataFrame(data)
print("Original DataFrame:", df, sep="\n")

# Detection: boolean flag per row, True where 'Category' is missing.
missing_categories = df['Category'].isna()
print("\nMissing Values in 'Category' Column:", missing_categories, sep="\n")

# Handling: fill with the most frequent category (the mode).
# 'A' and 'C' are tied here; mode() returns the tied values sorted, so
# element 0 resolves the tie to 'A'.
mode_value = df['Category'].mode()[0]
df = df.assign(Category=df['Category'].fillna(mode_value))
print("\nDataFrame After Filling Missing Categorical Values:", df, sep="\n")

In [None]:
# corrected Question: Predictive Modeling for Imputation
# Description: Use a predictive model to impute missing values for a particular feature using other features.
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
# Eight records; Age is missing in two rows while Salary and Experience are complete.
data = dict(
    Age=[25, 30, 35, np.nan, 40, 45, np.nan, 50],
    Salary=[50000, 60000, 70000, 80000, 90000, 100000, 110000, 120000],
    Experience=[2, 5, 8, 10, 12, 15, 17, 20],
)
df = pd.DataFrame(data)
print("Original DataFrame with Missing Values:", df, sep="\n")

# Split on the missing-Age flag: rows with a known Age train the model,
# rows without one are the ones to be predicted.
age_missing = df['Age'].isna()
train_df = df[~age_missing]
test_df = df[age_missing]

# Age is regressed on the two fully observed columns.
feature_cols = ['Salary', 'Experience']
X_train, y_train = train_df[feature_cols], train_df['Age']
X_test = test_df[feature_cols]

model = LinearRegression()
model.fit(X_train, y_train)
predicted_ages = model.predict(X_test)

# Write the predictions back into exactly the rows that were missing;
# the predictions come out in the same row order as the mask selects them.
df.loc[age_missing, 'Age'] = predicted_ages
print("\nDataFrame After Predictive Imputation:", df, sep="\n")

In [None]:
# corrected Question: Handling Time Series Data with Forward and Backward Fill
# Description: Impute missing values in a time series dataset using forward and backward fill methods.
import pandas as pd
import numpy as np
# Ten consecutive daily readings with four gaps (two of them back-to-back).
data = {
    'Date': pd.date_range(start='2025-01-01', periods=10, freq='D'),
    'Temperature': [22, np.nan, np.nan, 24, 25, np.nan, 27, np.nan, 28, 29]
}
df = pd.DataFrame(data)
print("Original Time Series DataFrame with Missing Values:")
print(df)

# Directional fills depend on row order, so make the chronological order explicit.
df = df.sort_values(by='Date')

# Forward fill: each gap takes the most recent earlier observation.
# Uses Series.ffill() because fillna(method='ffill') is deprecated in recent pandas.
df_ffill = df.copy()
df_ffill['Temperature'] = df_ffill['Temperature'].ffill()

# Backward fill: each gap takes the next later observation.
# Uses Series.bfill() because fillna(method='bfill') is deprecated in recent pandas.
df_bfill = df.copy()
df_bfill['Temperature'] = df_bfill['Temperature'].bfill()

print("\nTime Series After Forward Fill:")
print(df_ffill)
print("\nTime Series After Backward Fill:")
print(df_bfill)