In [18]:
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier

In [19]:
# Load data
df = pd.read_csv("../Datum/data2.csv")

In [20]:
# Introduct some NaN values in 'salary' and 'department'
df.loc[2, "salary"] = np.nan
df.loc[5, "department"] = np.nan

In [21]:
# Introduce a duplicate row
df = pd.concat([df, df.iloc[[0]]], ignore_index=True)
df

Unnamed: 0,age,salary,department,gender,target
0,25,50000.0,HR,M,0
1,32,60000.0,IT,F,1
2,47,,IT,M,1
3,51,90000.0,Finance,F,0
4,23,48000.0,HR,F,0
5,44,75000.0,,M,1
6,36,62000.0,IT,M,0
7,52,95000.0,Finance,F,1
8,29,52000.0,HR,F,0
9,41,70000.0,IT,M,1


In [22]:
# Separate features and target
X = df.drop("target", axis=1)
y = df["target"]

In [23]:
# Preprocessing pipeline
numeric_features = X.select_dtypes(include=["int64", "float64"]).columns
categorical_features = X.select_dtypes(include=["object", "str"]).columns

In [24]:
# Define imputers
numeric_imputer = SimpleImputer(strategy="mean")
categorical_imputer = SimpleImputer(strategy="most_frequent")

In [25]:
# Apply imputers
df[numeric_features] = numeric_imputer.fit_transform(df[numeric_features])
df[categorical_features] = categorical_imputer.fit_transform(df[categorical_features])

In [26]:
# Drop duplicate rows
df = df.drop_duplicates()
df

Unnamed: 0,age,salary,department,gender,target
0,25.0,50000.0,HR,M,0
1,32.0,60000.0,IT,F,1
2,47.0,65200.0,IT,M,1
3,51.0,90000.0,Finance,F,0
4,23.0,48000.0,HR,F,0
5,44.0,75000.0,HR,M,1
6,36.0,62000.0,IT,M,0
7,52.0,95000.0,Finance,F,1
8,29.0,52000.0,HR,F,0
9,41.0,70000.0,IT,M,1


In [27]:
# Detecting Outliers

# Calculate z-scores
z_scores = np.abs(stats.zscore(df[numeric_features]))

# Only keep rows with z-score < 3 for all numeric features
df = df[(z_scores < 3).all(axis=1)]
df

Unnamed: 0,age,salary,department,gender,target
0,25.0,50000.0,HR,M,0
1,32.0,60000.0,IT,F,1
2,47.0,65200.0,IT,M,1
3,51.0,90000.0,Finance,F,0
4,23.0,48000.0,HR,F,0
5,44.0,75000.0,HR,M,1
6,36.0,62000.0,IT,M,0
7,52.0,95000.0,Finance,F,1
8,29.0,52000.0,HR,F,0
9,41.0,70000.0,IT,M,1


In [28]:
# Re-run Preprocessing Pipeline

# Separate features and target
X = df.drop("target", axis=1)
y = df['target']

numeric_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown="ignore")

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features)
    ]
)

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit and transform
X_train_scaled = preprocessor.fit_transform(X_train)
X_test_scaled = preprocessor.transform(X_test)

In [29]:
X_train_scaled[:5]

array([[ 0.39764809,  0.35257801,  0.        ,  1.        ,  0.        ,
         0.        ,  1.        ],
       [-1.4339431 , -1.22143097,  0.        ,  1.        ,  0.        ,
         0.        ,  1.        ],
       [ 1.16884438,  1.61178519,  1.        ,  0.        ,  0.        ,
         1.        ,  0.        ],
       [ 0.6868467 , -0.26443351,  0.        ,  0.        ,  1.        ,
         0.        ,  1.        ],
       [ 0.10844948,  0.03777622,  0.        ,  0.        ,  1.        ,
         0.        ,  1.        ]])

In [30]:
# Adding New Features

# Feature 1: salary per age
df["salary_per_age"] = df["salary"] / df["age"]

# Feature 2: seniority flag
df["is_senior"] = df["age"].apply(lambda x: 1 if x > 40 else 0)

# Feature 3: combined categorical feature
df["dept_gender"] = df["department"] + "_" + df["gender"]

In [31]:
df.head()

Unnamed: 0,age,salary,department,gender,target,salary_per_age,is_senior,dept_gender
0,25.0,50000.0,HR,M,0,2000.0,0,HR_M
1,32.0,60000.0,IT,F,1,1875.0,0,IT_F
2,47.0,65200.0,IT,M,1,1387.234043,1,IT_M
3,51.0,90000.0,Finance,F,0,1764.705882,1,Finance_F
4,23.0,48000.0,HR,F,0,2086.956522,0,HR_F
