In [1]:
import pandas as pd
import numpy as np

# Target and Features selection

In [2]:
# Load the processed dataset from its pickled form.
# Alternative CSV source, kept for reference:
#   df = pd.read_csv("../data/processed/full_df.csv", index_col=0)
processed_pickle_path = "../data/processed/df_pickle"
df = pd.read_pickle(processed_pickle_path)

In [6]:
# Label column to predict; it contains NaN values that are handled downstream.
target = "Attrition"

# Predictor columns selected for modelling.
features = [
    "DistanceFromHome",
    "JobSatisfaction",
    "NumCompaniesWorked",
    "MonthlyIncome",
    "Gender",
    "TotalWorkingYears",
    "WorkLifeBalance",
    "YearsInCurrentRole",
    "YearlyIncome",
]

In [14]:
from sklearn.model_selection import train_test_split

# Feature matrix and label vector taken from the loaded frame.
X = df[features]
y = df[target]

# Column groups by dtype; used further down to route columns to the
# numeric and categorical preprocessing pipelines.
num_attr = X.select_dtypes(include='number').columns
cat_attr = X.select_dtypes(exclude='number').columns

# Fixed seed keeps the 80/20 split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

# Fill missing labels with the most frequent class of the TRAINING split.
# The fill value is derived from y_train only and reused for y_test, so no
# statistic of the test split leaks into preprocessing and both splits are
# imputed consistently.
# NOTE(review): imputing missing *labels* (especially in the test split) can
# distort evaluation; consider dropping rows with a missing target instead.
most_frequent_label = y_train.value_counts().index[0]
y_train = y_train.fillna(most_frequent_label)
y_test = y_test.fillna(most_frequent_label)


# TODO: move everything up to this point into build_features

In [8]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from src.helping_functions import TrimOutliers


# Preprocessing for numeric columns, applied in order:
#   1. fill missing values with the column median,
#   2. run the project's TrimOutliers transformer
#      (behaviour defined in src.helping_functions — presumably limits
#      extreme values; verify there),
#   3. standardise with StandardScaler.
numeric_steps = [
    ("imputer", SimpleImputer(strategy="median")),
    ("trimer", TrimOutliers()),
    ("std_scaler", StandardScaler()),
]
numeric_pipeline = Pipeline(steps=numeric_steps)

# Preprocessing for non-numeric columns: fill missing values with the most
# frequent category, then one-hot encode.
category_steps = [
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("ohe", OneHotEncoder()),
]
category_pipeline = Pipeline(steps=category_steps)

# Combined preprocessor: numeric columns (num_attr) go through
# numeric_pipeline, the remaining columns (cat_attr) through category_pipeline.
column_routes = [
    ("numerical", numeric_pipeline, num_attr),
    ("categorical", category_pipeline, cat_attr),
]
cat_num_pipeline = ColumnTransformer(transformers=column_routes)

