In [1]:
import pandas as pd
import numpy as np
from scipy import stats

# Sample data with missing values, duplicates, and outliers
data = {
    'ID': [1, 2, 2, 3, 4, 5, 6, 7, 8, 9],
    'Name': ['Alice', 'Bob', 'Bob', 'Charlie', 'David', 'Eva', 'Frank', 'Grace', 'Hank', 'Ivy'],
    'Age': [25, np.nan, np.nan, 35, 120, 28, 27, 29, 22, 1000],  # 120 and 1000 are outliers
    'Salary': [50000, 60000, 60000, 58000, 62000, np.nan, 56000, 59000, 61000, 1000000]  # 1,000,000 outlier
}

df = pd.DataFrame(data)

# 1. Remove duplicates based on all columns
df = df.drop_duplicates()

# 2. Handle missing values
# Fill missing 'Age' with median age
df['Age'] = df['Age'].fillna(df['Age'].median())

# Fill missing 'Salary' with mean salary
df['Salary'] = df['Salary'].fillna(df['Salary'].mean())

# 3. Detect and remove outliers using z-score (threshold 3)
z_scores = np.abs(stats.zscore(df[['Age', 'Salary']]))
df_clean = df[(z_scores < 3).all(axis=1)]

print("Cleaned DataFrame:")
print(df_clean)


Cleaned DataFrame:
   ID     Name     Age     Salary
0   1    Alice    25.0    50000.0
1   2      Bob    28.5    60000.0
3   3  Charlie    35.0    58000.0
4   4    David   120.0    62000.0
5   5      Eva    28.0   175750.0
6   6    Frank    27.0    56000.0
7   7    Grace    29.0    59000.0
8   8     Hank    22.0    61000.0
9   9      Ivy  1000.0  1000000.0


In [2]:
import pandas as pd
import numpy as np

# Sample skewed data (e.g., income)
data = {
    'Income': [20000, 22000, 25000, 27000, 30000, 35000, 1000000, 40000, 45000, 48000]
}
df = pd.DataFrame(data)

# Apply log transformation (adding 1 to avoid log(0))
df['Income_log'] = np.log1p(df['Income'])

print("Original Data:")
print(df['Income'])
print("\nLog-transformed Data:")
print(df['Income_log'])


Original Data:
0      20000
1      22000
2      25000
3      27000
4      30000
5      35000
6    1000000
7      40000
8      45000
9      48000
Name: Income, dtype: int64

Log-transformed Data:
0     9.903538
1     9.998843
2    10.126671
3    10.203629
4    10.308986
5    10.463132
6    13.815512
7    10.596660
8    10.714440
9    10.778977
Name: Income_log, dtype: float64


In [3]:
import pandas as pd

# Sample data with existing features
data = {
    'Age': [25, 32, 47, 51, 62],
    'Annual_Income': [50000, 60000, 80000, 90000, 120000],
    'Years_at_Job': [1, 3, 10, 6, 12]
}
df = pd.DataFrame(data)

# Create new feature: Income per Year at Job
df['Income_per_Year_at_Job'] = df['Annual_Income'] / df['Years_at_Job']

print(df)


   Age  Annual_Income  Years_at_Job  Income_per_Year_at_Job
0   25          50000             1                 50000.0
1   32          60000             3                 20000.0
2   47          80000            10                  8000.0
3   51          90000             6                 15000.0
4   62         120000            12                 10000.0


In [4]:
import pandas as pd
import numpy as np

# Sample data
data = {'Age': [22, 25, 27, 30, 120, 26, 28, 24, 27, 29]}
df = pd.DataFrame(data)

# Calculate Z-scores
df['z_score'] = (df['Age'] - df['Age'].mean()) / df['Age'].std()

# Define threshold for outliers
threshold = 3

# Identify outliers
outliers = df[np.abs(df['z_score']) > threshold]

# Remove outliers
df_no_outliers = df[np.abs(df['z_score']) <= threshold].drop(columns='z_score')

print("Outliers detected:")
print(outliers)

print("\nData without outliers:")
print(df_no_outliers)


Outliers detected:
Empty DataFrame
Columns: [Age, z_score]
Index: []

Data without outliers:
   Age
0   22
1   25
2   27
3   30
4  120
5   26
6   28
7   24
8   27
9   29


In [5]:
import pandas as pd
import numpy as np
from sklearn.impute import KNNImputer

# Sample data with missing values
data = {
    'Feature1': [1.0, 2.0, np.nan, 4.0, 5.0],
    'Feature2': [5.0, 4.0, 3.0, np.nan, 1.0],
    'Feature3': [np.nan, 2.0, 3.0, 4.0, 5.0]
}
df = pd.DataFrame(data)

# Initialize KNN Imputer with 2 neighbors
imputer = KNNImputer(n_neighbors=2)

# Fit and transform the data to impute missing values
df_imputed = pd.DataFrame(imputer.fit_transform(df), columns=df.columns)

print(df_imputed)


   Feature1  Feature2  Feature3
0       1.0       5.0       2.5
1       2.0       4.0       2.0
2       3.0       3.0       3.0
3       4.0       2.0       4.0
4       5.0       1.0       5.0
