In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Read the CSV file (path is relative to the working directory) into a DataFrame
DATA_PATH = 'processed_data_final.csv'
data = pd.read_csv(DATA_PATH)

In [2]:
# Split the frame: the target column becomes y, every other column stays in X.
# drop() returns a new frame, so `data` itself is left unchanged.
target_column = 'JobSatisfaction_O'
y = data[target_column]
X = data.drop(columns=[target_column])

In [15]:
# Identify columns by category.
# Columns are split by name suffix ('_F' -> fixed, '_A' -> actionable) and by
# dtype (float -> numerical, integer -> binary).
# The dtype is tested with pandas' dtype predicates rather than compared to the
# literal 'float64' / 'int64', so columns stored as e.g. float32, int32 or
# nullable Int64 are categorised too instead of silently falling into no list.
def _columns_matching(frame, suffix, dtype_check):
    """Return the column names of `frame` that end with `suffix` and whose dtype satisfies `dtype_check`.

    Parameters:
        frame: DataFrame whose columns are inspected.
        suffix: string the column name must end with.
        dtype_check: callable taking a dtype and returning True/False.

    Returns:
        List of matching column names, in the frame's column order.
    """
    return [
        col
        for col, dtype in frame.dtypes.items()
        if col.endswith(suffix) and dtype_check(dtype)
    ]


numerical_fixed_columns = _columns_matching(X, '_F', pd.api.types.is_float_dtype)
numerical_actionable_columns = _columns_matching(X, '_A', pd.api.types.is_float_dtype)
binary_fixed_columns = _columns_matching(X, '_F', pd.api.types.is_integer_dtype)
binary_actionable_columns = _columns_matching(X, '_A', pd.api.types.is_integer_dtype)


Count of numerical fixed columns: 11
Count of numerical actionable columns: 8
Count of binary fixed columns: 339
Count of binary actionable columns: 205
Total count of columns: 563


In [16]:
# Group the binary actionable columns under a shared key.
# The key is the column name with its last two '_'-separated tokens removed
# (presumably the answer option and the '_A' suffix — confirm against the
# column naming scheme); columns that yield the same key share one list.
binary_actionable_groups = {}
for column_name in binary_actionable_columns:
    name_tokens = column_name.split('_')
    group_key = '_'.join(name_tokens[:-2])
    binary_actionable_groups.setdefault(group_key, []).append(column_name)

In [18]:
# Normalize the numerical columns (StandardScaler: per-column zero mean, unit variance).
# The scaler is fitted on the untouched values in `data`, not on X: X is
# overwritten below, so fitting on X would make a re-run of this cell fit on
# already-scaled values and leave `scaler` holding statistics of ~0 / ~1
# instead of the original column means and standard deviations.
# NOTE(review): the scaler is fitted on every row and no train/test split
# appears before this cell — confirm that is intended.
numerical_columns = numerical_fixed_columns + numerical_actionable_columns
scaler = StandardScaler()
X[numerical_columns] = scaler.fit_transform(data[numerical_columns])

In [19]:

# Bundle the column groupings and the fitted scaler into a single dictionary.
# The binary actionable columns are stored in their grouped form only; the
# flat binary_actionable_columns list is not included.
preprocessing_info = dict(
    numerical_fixed_columns=numerical_fixed_columns,
    numerical_actionable_columns=numerical_actionable_columns,
    binary_fixed_columns=binary_fixed_columns,
    binary_actionable_groups=binary_actionable_groups,
    scaler=scaler,
)