In [2]:
# Question: Advanced Data Cleaning with Multiple Issues
# Objective: Handle multiple issues in one dataset, including missing values, duplicates, and outliers.
# Description: Given a dataset with various data quality issues, employ multiple data cleaning techniques.




In [3]:
import pandas as pd
import numpy as np
from sklearn.impute import KNNImputer
from scipy import stats

# Sample dataset with missing values, duplicates, and potential outliers
data = {
    'Age': [25, 30, 35, None, 45, None, 50, 1000],
    'Salary': [50000, 60000, None, 80000, None, 100000, 110000, 120000],
    'Gender': ['Male', 'Female', 'Female', 'Male', None, 'Female', 'Male', 'Male']
}

# Create DataFrame
df = pd.DataFrame(data)

# ---- Handling Missing Values ----
# Impute missing numerical values with KNN (K=2)
knn_imputer = KNNImputer(n_neighbors=2)
df[['Age', 'Salary']] = knn_imputer.fit_transform(df[['Age', 'Salary']])

# Impute missing categorical values with mode
df['Gender'].fillna(df['Gender'].mode()[0], inplace=True)

# ---- Handling Duplicates ----
# Remove duplicate rows
df.drop_duplicates(inplace=True)

# ---- Handling Outliers with Z-Scores ----
# Calculate Z-scores for numerical columns
z_scores = np.abs(stats.zscore(df[['Age', 'Salary']]))
outliers = (z_scores > 3)

# Replace outliers with the column mean (for simplicity)
df[['Age', 'Salary']] = df[['Age', 'Salary']].mask(outliers, df[['Age', 'Salary']].mean(), axis=1)

# ---- Log Transformation for Skewed Data ----
# Applying log transformation to Salary column
df['Salary'] = np.log(df['Salary'])

# ---- Feature Engineering ----
# Create a new feature called 'Age_to_Salary_Ratio' (Age / Salary)
df['Age_to_Salary_Ratio'] = df['Age'] / df['Salary']

# ---- Display the cleaned and transformed DataFrame ----
print("Cleaned and Transformed DataFrame:")
print(df)


Cleaned and Transformed DataFrame:
      Age     Salary  Gender  Age_to_Salary_Ratio
0    25.0  10.819778    Male             2.310583
1    30.0  11.002100  Female             2.726752
2    35.0  10.915088  Female             3.206570
3    27.5  11.289782    Male             2.435831
4    45.0  11.350407    Male             3.964616
5   525.0  11.512925  Female            45.600921
6    50.0  11.608236    Male             4.307287
7  1000.0  11.695247    Male            85.504821


In [4]:
# Question: Data Transformation Techniques
# Objective: Transform skewed data using log transformation.
# Description: Perform a log transformation to handle skewness in a dataset, which is particularly useful for
# certain machine learning models.



In [5]:
# Question: Feature Engineering by Creating New Features
# Objective: Create a new feature based on existing features to add predictive power.
# Description: Generate additional features from existing data to potentially improve the performance of
# prediction models.




In [6]:
# Question: Handling Complex Outliers with Z-Scores
# Objective: Detect and handle outliers using Z-score method.
# Description: Use the Z-score method to identify outliers which significantly differ from the rest of the data points.




In [7]:
# Question: Data Imputation with K-Nearest Neighbors (KNN)
# Objective: Impute missing numerical values using the KNN method.
# Description: Use the K-nearest neighbors algorithm to fill in missing values, which considers the values of
# nearest neighbors for imputation.


