In [None]:
# !pip install boruta
# !pip install pandas
# !pip install numpy
# !pip install sklearn

In [9]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from boruta import BorutaPy
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from category_encoders import TargetEncoder

In [3]:
#Loading data
df = pd.read_csv('data.csv', encoding='utf-8', engine='python', on_bad_lines='warn')

In [4]:
#Checking if 'pretermalg' column exists
if 'pretermalg' in df.columns:
    y = df['pretermalg']
else:
    print("Column 'pretermalg' not found. Check your dataset columns.")
    print("Available columns:", df.columns)

In [14]:
y

0         2.0
1         2.0
2         2.0
3         2.0
4         2.0
         ... 
148984    2.0
148985    2.0
148986    2.0
148987    2.0
148988    2.0
Name: pretermalg, Length: 148989, dtype: float64

In [10]:
# Separate categorical and numerical columns
categorical_cols = df.select_dtypes(include=['object', 'category']).columns
numerical_cols = df.select_dtypes(include=[np.number]).columns

# Split data into training and testing sets to avoid data leakage in target encoding
X_train, X_test, y_train, y_test = train_test_split(df, y, test_size=0.2, random_state=42)

# Apply Target Encoding to categorical data
te = TargetEncoder()
X_train[categorical_cols] = te.fit_transform(X_train[categorical_cols], y_train)
X_test[categorical_cols] = te.transform(X_test[categorical_cols])

# Combine encoded categorical data with numerical data
X_train_combined = pd.concat([X_train[categorical_cols], X_train[numerical_cols]], axis=1)
X_test_combined = pd.concat([X_test[categorical_cols], X_test[numerical_cols]], axis=1)

# Now X_train_combined and X_test_combined can be used for further analysis/modeling

In [13]:
categorical_cols

Index(['fid', 'infantid', 'usga', 'ustiming', 'schyears', 'priorcsec',
       'priorcsecnum', 'ancvisits', 'cstime', 'obslabor', 'antehem',
       'transverse', 'oblique', 'breech', 'induction', 'infdeliv', 'inffu',
       'unplanhosp', 'seizures', 'mantibiotics', 'corticosteroid', 'oxytocics',
       'bldtrans', 'dcsuction', 'magsulfate', 'hysterectomy', 'episiotomy',
       'jaundice', 'cpap', 'medeyecare', 'vitk', 'water30min', 'mheight_r2',
       'mweight_r2', 'bmi_r2', 'delivmode_r2', 'hemoglobin1_r2'],
      dtype='object')

In [12]:
numerical_cols

Index(['bord', 'site', 'cid', 'mnhyear', 'mn00ver', 'mn01ver', 'mn02ver',
       'mn03ver', 'mn05ver', 'mn06ver',
       ...
       'fuout', 'ltfdeliv', 'ltffu', 'ltf', 'clusterclosed', 'clusterresized',
       'closedenrl', 'medu_r2', 'meducat_r2', 'prevlb_r2'],
      dtype='object', length=107)