In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Read the preprocessed dataset from the CSV file into a DataFrame
data = pd.read_csv(filepath_or_buffer='processed_data_final.csv')

In [2]:
# Split the frame: 'JobSatisfaction_O' is the target, every other column is a feature
y = data['JobSatisfaction_O']
X = data.drop('JobSatisfaction_O', axis=1)

In [15]:
# Sort the feature columns into four lists using two criteria:
#   * name suffix: '_F' -> "fixed", '_A' -> "actionable"
#   * storage dtype: float64 -> "numerical", int64 -> "binary"
# Columns matching neither suffix or neither dtype end up in no list.
numerical_fixed_columns = [
    name for name, dtype in X.dtypes.items()
    if name.endswith('_F') and dtype == 'float64'
]
numerical_actionable_columns = [
    name for name, dtype in X.dtypes.items()
    if name.endswith('_A') and dtype == 'float64'
]
binary_fixed_columns = [
    name for name, dtype in X.dtypes.items()
    if name.endswith('_F') and dtype == 'int64'
]
binary_actionable_columns = [
    name for name, dtype in X.dtypes.items()
    if name.endswith('_A') and dtype == 'int64'
]


Count of numerical fixed columns: 11
Count of numerical actionable columns: 8
Count of binary fixed columns: 339
Count of binary actionable columns: 205
Total count of columns: 563


In [16]:
# Bucket the binary actionable columns by question.
# The group key is the column name with its last two '_'-separated parts removed.
# NOTE(review): a name with fewer than three '_'-separated parts yields the
# empty-string key, so all such columns share one group - confirm this is intended.
binary_actionable_groups = {}
for col in binary_actionable_columns:
    name_parts = col.split('_')
    question_prefix = '_'.join(name_parts[:-2])
    binary_actionable_groups.setdefault(question_prefix, []).append(col)

In [18]:
# Standardize the numerical columns (StandardScaler: zero mean, unit variance per column).
#
# The scaler is fitted on the untouched `data` frame instead of on X itself.
# X is overwritten in place below, so fitting on X would make this cell
# non-idempotent: a second run would re-fit on already-standardized values and
# the `scaler` object would no longer describe the raw -> scaled mapping.
#
# NOTE(review): the scaler is fitted on every row. If a train/test split is done
# later, fit on the training rows only to avoid leaking test statistics - confirm.
numerical_columns = numerical_fixed_columns + numerical_actionable_columns

# The positional assignment below requires X to still have one row per row of `data`.
assert len(X) == len(data), "X and data must have the same number of rows"

scaler = StandardScaler()
X[numerical_columns] = scaler.fit_transform(data[numerical_columns])

In [19]:

# Bundle the column groupings and the scaler object into a single dict
preprocessing_info = dict(
    numerical_fixed_columns=numerical_fixed_columns,
    numerical_actionable_columns=numerical_actionable_columns,
    binary_fixed_columns=binary_fixed_columns,
    binary_actionable_groups=binary_actionable_groups,
    scaler=scaler,
)

In [21]:
# Preview the first five rows of the feature frame
X.head(n=5)



Unnamed: 0,Age_F,LearningDataScienceTime_F,TimeSpentStudying_A,Tenure_F,LearningCategorySelftTaught_F,LearningCategoryOnlineCourses_F,LearningCategoryWork_F,LearningCategoryUniversity_F,LearningCategoryKaggle_F,LearningCategoryOther_F,...,JobFactorIndustry_Very Important_F,JobFactorLeaderReputation_Not important_F,JobFactorLeaderReputation_Somewhat important_F,JobFactorLeaderReputation_Very Important_F,JobFactorDiversity_Not important_F,JobFactorDiversity_Somewhat important_F,JobFactorDiversity_Very Important_F,JobFactorPublishingOpportunity_Not important_F,JobFactorPublishingOpportunity_Somewhat important_F,JobFactorPublishingOpportunity_Very Important_F
0,-0.693575,-0.240598,-0.065745,2.369833,-1.297792,-0.816639,4.906107,-0.603233,-0.431284,-0.16919,...,0,0,1,0,0,1,0,0,1,0
1,-0.213715,0.571729,-0.065745,-0.784747,-0.872042,0.323057,-0.664947,0.75377,2.547555,-0.16919,...,0,0,1,0,0,1,0,0,1,0
2,-0.405659,0.571729,-0.065745,-0.023297,-0.446292,1.082854,-0.664947,0.75377,-0.431284,-0.16919,...,1,0,0,1,0,0,1,0,0,1
3,2.281556,-0.240598,-0.065745,2.369833,-0.020541,-0.816639,1.563475,0.75377,-0.431284,-0.16919,...,0,0,1,0,0,1,0,0,1,0
4,0.554061,-0.240598,-0.065745,2.369833,1.25671,-0.626689,-0.386394,0.75377,-0.431284,-0.16919,...,0,0,1,0,0,1,0,0,1,0
