In [52]:
import pandas as pd
import numpy as np
from sklearn.decomposition import FactorAnalysis
from factor_analyzer import FactorAnalyzer, calculate_kmo
import os
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder

In [53]:
# Set up file paths
os.chdir('/Users/nsusser/Desktop/Github/happyDB/')
# Load data 
input_path = 'dataframes/tests/gpt40-mini/ratings_matrix.csv'

results = pd.read_csv(input_path, nrows=1000)

# print the last row
print(results.tail(1))

     Unnamed: 0   hmid                          cleaned_hm  \
999         999  28673  \nDaughter committed to a college.   

     PERMA_Accomplishment_the_speaker_felt_they_were_making_progress_towards_accomplishing_their_goals?  \
999                                                6.0                                                    

     PERMA_Engagement_the_speaker_became_absorbed_in_what_they_were_doing?  \
999                                                3.0                       

     PERMA_Positive_Emotion_the_speaker_felt_joyful?  \
999                                              6.0   

     PERMA_Negative_emotion_the_speaker_felt_anxious?  \
999                                               2.0   

     PERMA_Accomplishment_the_speaker_achieved_the_important_goals_they_set_for_themselves?  \
999                                                5.0                                        

     PERMA_Health_the_speaker_perceived_their_health_positively?  \
999             

In [54]:
items_path = 'dataframes/scales_clean.csv'
items = pd.read_csv(items_path)

# Clean and sanitize column names
items['Scale'] = items['Scale'].str.strip().str.replace(r"\s+", "_", regex=True)
items['Dimension'] = items['Dimension'].str.strip().str.replace(r"\s+", "_", regex=True)
items['Items'] = items['Items'].str.strip().str.replace(r"\s+", "_", regex=True)

# Create flattened column names
cols = [f"{scale}_{dimension}_{item}" for scale, dimension, item in zip(
    items['Scale'], items['Dimension'], items['Items']
)]

In [55]:
#subset df with cols
df_subset = results[cols]
print(df_subset.columns)

Index(['PERMA_Accomplishment_the_speaker_felt_they_were_making_progress_towards_accomplishing_their_goals?',
       'PERMA_Engagement_the_speaker_became_absorbed_in_what_they_were_doing?',
       'PERMA_Positive_Emotion_the_speaker_felt_joyful?',
       'PERMA_Negative_emotion_the_speaker_felt_anxious?',
       'PERMA_Accomplishment_the_speaker_achieved_the_important_goals_they_set_for_themselves?',
       'PERMA_Health_the_speaker_perceived_their_health_positively?',
       'PERMA_Meaning_the_speaker_felt_their_life_was_purposeful_and_meaningful?',
       'PERMA_Relationships_the_speaker_received_help_and_support_from_others_when_needed?',
       'PERMA_Meaning_the_speaker_felt_what_they_did_in_life_was_valuable_and_worthwhile?',
       'PERMA_Engagement_the_speaker_felt_excited_and_interested_in_things?',
       ...
       'CIT_Optimism_the_speaker_expected_more_good_things_in_their_life_than_bad?',
       'CIT_Subjective_Well-Being_-_Life_Satisfaction_the_speaker_felt_in_most_ways_t

In [56]:
print(df_subset.head())
#standardize df_subset
scaler = StandardScaler()
df_subset = pd.DataFrame(scaler.fit_transform(df_subset), columns=df_subset.columns)

#calculate kmo and bartlett's test
kmo_all, kmo_model = calculate_kmo(df_subset)

#determine optimal number of factors
fa = FactorAnalyzer(rotation=None)
fa.fit(df_subset)
eigenvalues, _ = fa.get_eigenvalues()

#fit factor analysis with optimal factors
optimal_factors = sum(eigenvalues > 1) # or based on scree plot
fa = FactorAnalyzer(n_factors=optimal_factors, rotation='varimax')
fa.fit(df_subset)

#project cols onto factors
factor_scores = fa.transform(df_subset)
for i in range(factor_scores.shape[1]):
    results[f'Factor_{i+1}'] = factor_scores[:, i]




   PERMA_Accomplishment_the_speaker_felt_they_were_making_progress_towards_accomplishing_their_goals?  \
0                                                6.0                                                    
1                                                6.0                                                    
2                                                6.0                                                    
3                                                6.0                                                    
4                                                3.0                                                    

   PERMA_Engagement_the_speaker_became_absorbed_in_what_they_were_doing?  \
0                                                5.0                       
1                                                2.0                       
2                                                4.0                       
3                                                4.0             



In [57]:

#save csv
results.to_csv('results 1000 - factors.csv')

In [58]:
for i in range(factor_loadings.shape[1]):
    sorted_loadings = factor_loadings.iloc[:, i].abs().sort_values(ascending=False)
    top_variables = sorted_loadings.head(10).index.tolist()
    print(f'Factor {i+1}: {top_variables}')

NameError: name 'factor_loadings' is not defined