In [14]:
import os

import numpy as np
import pandas as pd
from factor_analyzer import FactorAnalyzer, calculate_kmo
from factor_analyzer import calculate_bartlett_sphericity
from sklearn.decomposition import FactorAnalysis
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

In [27]:
# Work from the repository root so the relative paths below resolve.
os.chdir('/Users/nsusser/Desktop/Github/happyDB/')

# Main data table: at most the first 1000 rows of the merged file are read.
input_path = 'dataframes/tests/gpt40-mini/merged_data_1000.csv'
results = pd.read_csv(input_path, nrows=1000)

# Lookup table of the reverse-coded items (columns: Scale, Dimension, Items).
reverse_coded_items_df = pd.read_csv('profiles/merged_reverse_coded_items_only.csv')
print(reverse_coded_items_df.head())

  Scale                                  Dimension  \
0   CIT                         Autonomy - Control   
1   CIT                         Autonomy - Control   
2   CIT                         Autonomy - Control   
3   CIT  Subjective Well-Being - Negative Feelings   
4   CIT  Subjective Well-Being - Negative Feelings   

                                               Items  
0  other people decided most of the speaker's lif...  
1  the life choices the speaker made were not rea...  
2  other people decided what the speaker could an...  
3        the speaker felt negative most of the time?  
4  the speaker experienced unhappy feelings most ...  


In [28]:
# Normalise the three label columns of the reverse-coded lookup table:
# trim both ends, then collapse every run of whitespace into one underscore.
for label_column in ('Scale', 'Dimension', 'Items'):
    trimmed = reverse_coded_items_df[label_column].str.strip()
    reverse_coded_items_df[label_column] = trimmed.str.replace(r"\s+", "_", regex=True)
print(reverse_coded_items_df.head())

  Scale                                  Dimension  \
0   CIT                         Autonomy_-_Control   
1   CIT                         Autonomy_-_Control   
2   CIT                         Autonomy_-_Control   
3   CIT  Subjective_Well-Being_-_Negative_Feelings   
4   CIT  Subjective_Well-Being_-_Negative_Feelings   

                                               Items  
0  other_people_decided_most_of_the_speaker's_lif...  
1  the_life_choices_the_speaker_made_were_not_rea...  
2  other_people_decided_what_the_speaker_could_an...  
3        the_speaker_felt_negative_most_of_the_time?  
4  the_speaker_experienced_unhappy_feelings_most_...  


In [38]:
# Build the flattened "<Scale>_<Dimension>_<Item>" column name of every
# reverse-coded item. dict.fromkeys drops repeated names while keeping their
# order: reversing the same column twice would silently restore the original
# values.
reversed_columns = list(dict.fromkeys(
    f"{scale}_{dimension}_{item}" for scale, dimension, item in zip(
        reverse_coded_items_df['Scale'], reverse_coded_items_df['Dimension'], reverse_coded_items_df['Items']
    )
))

# Highest response option. A value x is reversed to max_value + 1 - x
# (assumes responses run from 1 to max_value -- TODO confirm against the data).
max_value = 7

# Fail before touching any data, and report every missing column at once,
# instead of raising part-way through and leaving `results` half-reversed.
missing_columns = [col for col in reversed_columns if col not in results.columns]
if missing_columns:
    raise KeyError(f"Reverse-coded columns not found in results: {missing_columns}")

# The reversal is its own inverse, so running this cell twice on the same
# frame would undo it. The columns already reversed are recorded on the frame
# itself (DataFrame.attrs); a freshly loaded `results` starts with none.
already_reversed = set(results.attrs.get('reverse_coded_columns', []))
for col in reversed_columns:
    if col in already_reversed:
        print(f"Skipping {col} (already reversed)")
        continue
    print(f"Reversing {col}")
    results[col] = max_value + 1 - results[col]
    already_reversed.add(col)
    # Updated after every column so an interrupted run is still tracked.
    results.attrs['reverse_coded_columns'] = sorted(already_reversed)
    print(f"Reversed {col}")

Reversing CIT_Autonomy_-_Control_other_people_decided_most_of_the_speaker's_life_decisions?
Reversed CIT_Autonomy_-_Control_other_people_decided_most_of_the_speaker's_life_decisions?
Reversing CIT_Autonomy_-_Control_the_life_choices_the_speaker_made_were_not_really_theirs?
Reversed CIT_Autonomy_-_Control_the_life_choices_the_speaker_made_were_not_really_theirs?
Reversing CIT_Autonomy_-_Control_other_people_decided_what_the_speaker_could_and_could_not_do?
Reversed CIT_Autonomy_-_Control_other_people_decided_what_the_speaker_could_and_could_not_do?
Reversing CIT_Subjective_Well-Being_-_Negative_Feelings_the_speaker_felt_negative_most_of_the_time?
Reversed CIT_Subjective_Well-Being_-_Negative_Feelings_the_speaker_felt_negative_most_of_the_time?
Reversing CIT_Subjective_Well-Being_-_Negative_Feelings_the_speaker_experienced_unhappy_feelings_most_of_the_time?
Reversed CIT_Subjective_Well-Being_-_Negative_Feelings_the_speaker_experienced_unhappy_feelings_most_of_the_time?
Reversing CIT_Subje

In [39]:
# Item catalogue: one row per questionnaire item (Scale, Dimension, Items).
items_path = 'dataframes/scales_clean.csv'
items = pd.read_csv(items_path)

# Trim each label and replace every run of whitespace with one underscore.
for label_column in ('Scale', 'Dimension', 'Items'):
    items[label_column] = items[label_column].str.strip().str.replace(r"\s+", "_", regex=True)

# Flattened "<Scale>_<Dimension>_<Item>" names, one per catalogue row, in row order.
cols = [
    f"{scale}_{dimension}_{item}"
    for scale, dimension, item in items[['Scale', 'Dimension', 'Items']].itertuples(index=False, name=None)
]

In [40]:
# Keep only the questionnaire item columns, in the order listed in `cols`.
df_subset = results.loc[:, cols]
print(df_subset.columns)

Index(['PERMA_Accomplishment_the_speaker_felt_they_were_making_progress_towards_accomplishing_their_goals?',
       'PERMA_Engagement_the_speaker_became_absorbed_in_what_they_were_doing?',
       'PERMA_Positive_Emotion_the_speaker_felt_joyful?',
       'PERMA_Negative_emotion_the_speaker_felt_anxious?',
       'PERMA_Accomplishment_the_speaker_achieved_the_important_goals_they_set_for_themselves?',
       'PERMA_Health_the_speaker_perceived_their_health_positively?',
       'PERMA_Meaning_the_speaker_felt_their_life_was_purposeful_and_meaningful?',
       'PERMA_Relationships_the_speaker_received_help_and_support_from_others_when_needed?',
       'PERMA_Meaning_the_speaker_felt_what_they_did_in_life_was_valuable_and_worthwhile?',
       'PERMA_Engagement_the_speaker_felt_excited_and_interested_in_things?',
       ...
       'CIT_Optimism_the_speaker_expected_more_good_things_in_their_life_than_bad?',
       'CIT_Subjective_Well-Being_-_Life_Satisfaction_the_speaker_felt_in_most_ways_t

In [41]:
print(df_subset.head())

# Standardize every item (zero mean, unit variance). The row labels of the
# input frame are kept instead of being replaced by a fresh 0..n-1 index.
scaler = StandardScaler()
df_subset = pd.DataFrame(
    scaler.fit_transform(df_subset), columns=df_subset.columns, index=df_subset.index
)

# Factorability checks: KMO sampling adequacy and Bartlett's test of
# sphericity. Both are printed so the checks are visible in the notebook
# rather than computed and discarded.
kmo_all, kmo_model = calculate_kmo(df_subset)
bartlett_chi_square, bartlett_p_value = calculate_bartlett_sphericity(df_subset)
print(f"KMO (overall): {kmo_model:.3f}")
print(f"Bartlett's test of sphericity: chi2 = {bartlett_chi_square:.3f}, p = {bartlett_p_value:.3g}")

# Unrotated fit, used only to obtain the eigenvalues.
fa = FactorAnalyzer(rotation=None)
fa.fit(df_subset)
eigenvalues, _ = fa.get_eigenvalues()

# Kaiser criterion: keep the factors whose eigenvalue exceeds 1 (a scree plot
# is the alternative). Cast to a plain int so n_factors is not a NumPy scalar.
optimal_factors = int(np.sum(eigenvalues > 1))
if optimal_factors < 1:
    # Also reached when the eigenvalues are NaN, since NaN > 1 is False.
    raise ValueError("No eigenvalue is greater than 1; cannot choose a number of factors.")

# Final model: the selected number of factors with a varimax rotation.
fa = FactorAnalyzer(n_factors=optimal_factors, rotation='varimax')
fa.fit(df_subset)

# Store each row's factor scores on `results` as Factor_1 .. Factor_k.
# The scores are a NumPy array, so the assignment is positional.
factor_scores = fa.transform(df_subset)
for i in range(factor_scores.shape[1]):
    results[f'Factor_{i+1}'] = factor_scores[:, i]




   PERMA_Accomplishment_the_speaker_felt_they_were_making_progress_towards_accomplishing_their_goals?  \
0                                                6.0                                                    
1                                                6.0                                                    
2                                                6.0                                                    
3                                                6.0                                                    
4                                                3.0                                                    

   PERMA_Engagement_the_speaker_became_absorbed_in_what_they_were_doing?  \
0                                                5.0                       
1                                                2.0                       
2                                                4.0                       
3                                                4.0             



In [42]:

# Write `results`, including its Factor_* score columns, to disk.
# to_csv is called with its defaults, so the row index is written as well.
factors_output_path = 'results 1000 - factors.csv'
results.to_csv(factors_output_path)

In [44]:
# One (Scale, Dimension, Item) label per catalogue row, built from the same
# three columns and in the same row order as the flattened names in `cols`.
item_labels = list(zip(items['Scale'], items['Dimension'], items['Items']))

# Flattened column name -> (Scale, Dimension, Item) label.
# (Zipping `cols` against the `items` DataFrame itself would pair the first
# three names with the frame's column labels, not with its rows.)
variable_name_mapping = dict(zip(cols, item_labels))

# Loadings of every item (rows) on every factor (columns). The model was fit
# on results[cols], so loading rows follow the order of `cols` / `item_labels`.
# tupleize_cols=False keeps a flat index of tuples instead of a MultiIndex.
factor_loadings = pd.DataFrame(
    fa.loadings_, index=pd.Index(item_labels, tupleize_cols=False)
)

# Number of strongest items reported per factor.
top_n = 10

# Single pass: print the top items of each factor and collect them for export.
factor_loading_results = []
for i in range(factor_loadings.shape[1]):
    # Rank by absolute loading but keep the signed values, so the direction
    # of each item's relationship with the factor is not lost.
    ranked_loadings = factor_loadings.iloc[:, i].sort_values(
        ascending=False, key=lambda loadings: loadings.abs()
    )
    top_loadings = ranked_loadings.head(top_n)
    top_variables = top_loadings.index.tolist()
    print(f'Factor {i+1}: {top_variables}')

    for var, loading in zip(top_variables, top_loadings.tolist()):
        factor_loading_results.append({
            'Factor': f'Factor {i+1}',
            'Variable': var,
            'Loading': abs(loading),     # absolute loading (the ranking key)
            'Signed_Loading': loading    # same loading with its sign
        })

# Convert the results to a DataFrame
factor_loading_df = pd.DataFrame(factor_loading_results)

# Save to a CSV file
factor_loading_df.to_csv('factor_loadings.csv', index=False)

print("Factor loadings saved to 'factor_loadings.csv'.")


Factor 1: [('WBP', 'Competence', 'the_speaker_was_competent_and_capable_in_important_activities?'), ('CIT', 'Mastery_-_Self-Efficacy', 'the_speaker_succeeded_when_they_put_their_mind_to_it?'), ('CIT', 'Mastery_-_Self-Efficacy', 'the_speaker_believed_that_they_are_capable_in_most_things?'), ('WBP', 'Meaning', 'the_speaker_felt_a_sense_of_direction?'), ('WBP', 'Meaning', 'the_speaker_had_a_clear_sense_of_purpose?'), ('PWB', 'Purpose_in_Life', 'the_speaker_felt_a_sense_of_direction_and_purpose?'), ('CIT', 'Mastery_-_Accomplishment', 'the_speaker_felt_they_fulfilled_their_ambitions?'), ('CIT', 'Mastery_-_Skills', 'the_speaker_used_their_skills_a_lot_in_their_everyday_life?'), ('PANAS', 'Positive', 'the_speaker_was_determined?'), ('WBP', 'Clear_Thinking', 'the_speaker_stayed_focused_when_they_needed_to?')]
Factor 2: [('PERMA', 'Positive_Emotion', 'the_speaker_felt_joyful?'), ('WBP', 'Positive_Emotions', 'the_speaker_felt_cheerful?'), ('WBP', 'Optimism', 'the_speaker_felt_very_optimistic_in_

In [45]:

# For every factor, rank all items by absolute loading (largest first),
# keep the full ranking and print its first ten entries.
factor_items = {}
for factor_position in range(factor_loadings.shape[1]):
    factor_label = f'Factor {factor_position+1}'
    abs_loadings = factor_loadings.iloc[:, factor_position].abs()
    ranked_items = abs_loadings.sort_values(ascending=False).index.tolist()
    factor_items[factor_label] = ranked_items
    print(f'{factor_label}: {ranked_items[:10]}')

# One export row per factor; its ranked items are joined into a single
# comma-separated string (each item label is converted with str()).
factor_loading_results = []
for factor_label, ranked_items in factor_items.items():
    factor_loading_results.append({
        'Factor': factor_label,
        'Variables': ', '.join(map(str, ranked_items))
    })

factor_loading_df = pd.DataFrame(factor_loading_results)
factor_loading_df.to_csv('factor_loadings_grouped.csv', index=False)

print("Factor loadings grouped by factor saved to 'factor_loadings_grouped.csv'.")

# Also dump the factor -> ranked items mapping as JSON.
import json
with open('factor_items.json', 'w') as f:
    json.dump(factor_items, f)

print("Factor items saved to 'factor_items.json'.")


Factor 1: [('WBP', 'Competence', 'the_speaker_was_competent_and_capable_in_important_activities?'), ('CIT', 'Mastery_-_Self-Efficacy', 'the_speaker_succeeded_when_they_put_their_mind_to_it?'), ('CIT', 'Mastery_-_Self-Efficacy', 'the_speaker_believed_that_they_are_capable_in_most_things?'), ('WBP', 'Meaning', 'the_speaker_felt_a_sense_of_direction?'), ('WBP', 'Meaning', 'the_speaker_had_a_clear_sense_of_purpose?'), ('PWB', 'Purpose_in_Life', 'the_speaker_felt_a_sense_of_direction_and_purpose?'), ('CIT', 'Mastery_-_Accomplishment', 'the_speaker_felt_they_fulfilled_their_ambitions?'), ('CIT', 'Mastery_-_Skills', 'the_speaker_used_their_skills_a_lot_in_their_everyday_life?'), ('PANAS', 'Positive', 'the_speaker_was_determined?'), ('WBP', 'Clear_Thinking', 'the_speaker_stayed_focused_when_they_needed_to?')]
Factor 2: [('PERMA', 'Positive_Emotion', 'the_speaker_felt_joyful?'), ('WBP', 'Positive_Emotions', 'the_speaker_felt_cheerful?'), ('WBP', 'Optimism', 'the_speaker_felt_very_optimistic_in_