## 1. Setup and Data Loading

In [1]:
# Import required modules
import sys
sys.path.append('../src')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings

# EFA modules
from soc_opros_loader import SocOprosLoader
from efa_analyzer import EFAAnalyzer, FactorSolution
from factor_validator import FactorValidator
from efa_error_handling import get_warning_manager, configure_warnings

# Configure warnings
configure_warnings(show_warnings=True, level="WARNING")
warnings.filterwarnings('ignore', category=UserWarning, module='efa_analyzer')

print("‚úì Modules imported successfully")
print("‚úì EFA environment ready")

‚úì Modules imported successfully
‚úì EFA environment ready




In [2]:
# Load soc opros survey data
try:
    loader = SocOprosLoader()
    
    # Try to load real data first
    print("Attempting to load soc opros data from Google Sheets...")
    data = loader.load_data()
    responses_matrix = loader.get_responses_matrix()
    
    print(f"‚úì Real data loaded successfully")
    print(f"Data shape: {responses_matrix.shape} (statements √ó respondents)")
    
except Exception as e:
    print(f"Could not load real data: {e}")
    print("Using mock data for demonstration...")
    
    # Create mock data for demonstration
    from efa_integration_test import create_mock_soc_opros_data
    responses_matrix = create_mock_soc_opros_data()
    print(f"‚úì Mock data created: {responses_matrix.shape} (statements √ó respondents)")

# For EFA, we need observations √ó variables format
efa_data = responses_matrix.T  # Transpose to get respondents √ó statements
print(f"‚úì EFA data prepared: {efa_data.shape} (observations √ó variables)")

# Display basic statistics
print("\nData Overview:")
print(f"- Number of respondents (observations): {efa_data.shape[0]}")
print(f"- Number of statements (variables): {efa_data.shape[1]}")
print(f"- Missing data: {efa_data.isnull().sum().sum()} values ({(efa_data.isnull().sum().sum()/efa_data.size)*100:.1f}%)")
print(f"- Response range: {efa_data.min().min():.0f} to {efa_data.max().max():.0f}")

INFO:soc_opros_loader:Loading data from: https://docs.google.com/spreadsheets/d/17oJL-hVMqOehHFugKHDJBmGtbWkp7e1y4ccJFnxwapk/export?format=csv&gid=992488085


Attempting to load soc opros data from Google Sheets...


INFO:soc_opros_loader:Data loaded successfully with encoding: utf-8 - Shape: (265, 16)


INFO:soc_opros_loader:Column names: ['statements', '–ê–º–µ–ª–∏—è', '–ò—Ç–∞–Ω–∏–æ', '–û—Ç–µ—Ü', '–í–∞–∞–ª', '–ü–∏–∫', '–°—Ç–∏–ª—à–µ–π–¥', '–ü–ò–ü–ò–ü–ò', '–ê–π—Å–∞', '–ö–µ–ª—å', '–ò–µ–∑–µ–∫–∏–ª—å', '–ù–µ–ª–æ—Ç', '–ò–Ω–Ω–µ–∞–¥', '–î–∂–∞–∑–∞—Ä', '–ö–∞—è', '–ê–π—à–∞']


INFO:soc_opros_loader:Structure parsed - 265 statements, 15 respondents


INFO:soc_opros_loader:Filling 667 missing values with neutral response (3.0)


INFO:soc_opros_loader:Responses matrix created - Shape: (265, 15)


‚úì Real data loaded successfully
Data shape: (265, 15) (statements √ó respondents)
‚úì EFA data prepared: (15, 265) (observations √ó variables)

Data Overview:
- Number of respondents (observations): 15
- Number of statements (variables): 265
- Missing data: 0 values (0.0%)
- Response range: 1 to 5


## 2. Data Validation and Initial Assessment

In [3]:
# Initialize EFA analyzer and validator
efa_analyzer = EFAAnalyzer(
    n_factors=None,  # Auto-determine using eigenvalue > 1.0 criterion
    extraction_method='principal',
    rotation_method='oblimin'  # Default oblimin rotation
)

validator = FactorValidator()

print("‚úì EFA Analyzer initialized:")
print(f"  - Extraction method: {efa_analyzer.extraction_method}")
print(f"  - Rotation method: {efa_analyzer.rotation_method}")
print(f"  - Factor count: {'Auto-determine' if efa_analyzer.n_factors is None else efa_analyzer.n_factors}")

‚úì EFA Analyzer initialized:
  - Extraction method: principal
  - Rotation method: oblimin
  - Factor count: Auto-determine


In [4]:
# Perform data validation
print("Data Validation Results:")
print("=" * 40)

# Basic data validation
validation_results = efa_analyzer.validate_data(efa_data)

print(f"‚úì Data validity: {'VALID' if validation_results.is_valid else 'INVALID'}")

if validation_results.warnings:
    print("\nWarnings:")
    for i, warning in enumerate(validation_results.warnings, 1):
        print(f"  {i}. {warning}")
        
if validation_results.errors:
    print("\nErrors:")
    for i, error in enumerate(validation_results.errors, 1):
        print(f"  {i}. {error}")

# Data adequacy assessment
adequacy_results = validator.check_data_adequacy(efa_data)

print(f"\n‚úì Sample adequacy: {'ADEQUATE' if adequacy_results.is_adequate else 'INADEQUATE'}")
print(f"  - Sample size: {adequacy_results.sample_size}")
print(f"  - Variables: {adequacy_results.n_variables}")
print(f"  - Ratio: {adequacy_results.ratio:.2f} (recommended: ‚â•5.0)")

if adequacy_results.recommendations:
    print("\nRecommendations:")
    for rec in adequacy_results.recommendations:
        print(f"  ‚Ä¢ {rec}")

Data Validation Results:
‚úì Data validity: VALID

  1. More variables (265) than observations (15). Results may be unstable - consider dimensionality reduction.
  2. Small sample: 15 observations for 265 variables. Recommended: ‚â•1325
  3. factor_analyzer not available - install dependencies for full validation

‚úì Sample adequacy: INADEQUATE
  - Sample size: 15
  - Variables: 265
  - Ratio: 0.06 (recommended: ‚â•5.0)

Recommendations:
  ‚Ä¢ Increase sample size to at least 1325


## 2.1. Statistical Validation Tests

### KMO (Kaiser-Meyer-Olkin) Test
Measures sampling adequacy - should be ‚â• 0.6 for factor analysis

In [5]:
# KMO (Kaiser-Meyer-Olkin) Test
print("KMO Measure of Sampling Adequacy:")
print("=" * 40)

try:
    kmo_results = validator.calculate_kmo(efa_data)
    
    print(f"Overall KMO: {kmo_results['overall_kmo']:.3f}")
    print(f"Interpretation: {kmo_results['interpretation']}")
    print(f"Threshold: ‚â• {kmo_results['threshold']}")
    print(f"Test result: {'‚úì PASS' if kmo_results['is_adequate'] else '‚úó FAIL'}")
    
    # Show problematic variables
    individual_kmo = kmo_results['individual_kmo']
    low_kmo_vars = {var: kmo for var, kmo in individual_kmo.items() if kmo < 0.5}
    
    if low_kmo_vars:
        print(f"\nVariables with low KMO (< 0.5):")
        for var, kmo in sorted(low_kmo_vars.items(), key=lambda x: x[1])[:5]:
            print(f"  {var}: {kmo:.3f}")
    else:
        print(f"\n‚úì All variables have adequate individual KMO values")
    
    if kmo_results['recommendations']:
        print(f"\nRecommendations:")
        for rec in kmo_results['recommendations'][:3]:
            print(f"  ‚Ä¢ {rec}")
            
except Exception as e:
    print(f"KMO calculation failed: {e}")

print("\n" + "-" * 60)

KMO Measure of Sampling Adequacy:
Overall KMO: nan
Interpretation: Unacceptable - factor analysis not recommended
Threshold: ‚â• 0.6
Test result: ‚úó FAIL

Variables with low KMO (< 0.5):
  Life should have an end: 0.000
  You should do whatever you want regardless of consequences: 0.000
  It is good to be alive: 0.000
  Existing is good: 0.000
  You consider yourself happy: 0.000

Recommendations:
  ‚Ä¢ Consider removing variables with low individual KMO: Life should have an end, You should do whatever you want regardless of consequences, It is good to be alive, Existing is good, You consider yourself happy
  ‚Ä¢ Variables with very low KMO (< 0.3) should be removed: Life should have an end, You should do whatever you want regardless of consequences, It is good to be alive

------------------------------------------------------------


  partial_corr[i, j] = -corr_inv[i, j] / np.sqrt(corr_inv[i, i] * corr_inv[j, j])


In [6]:
# Bartlett's Test of Sphericity
print("Bartlett's Test of Sphericity:")
print("=" * 40)

try:
    bartlett_results = validator.calculate_bartlett_test(efa_data)
    
    print(f"Chi-square statistic: {bartlett_results['statistic']:.3f}")
    print(f"Degrees of freedom: {bartlett_results['degrees_of_freedom']}")
    print(f"P-value: {bartlett_results['p_value']:.6f}")
    print(f"Significance level: Œ± = {bartlett_results['alpha']}")
    print(f"Test result: {'‚úì PASS' if bartlett_results['is_significant'] else '‚úó FAIL'}")
    print(f"Interpretation: {bartlett_results['interpretation']}")
    
    if bartlett_results['recommendations']:
        print(f"\nRecommendations:")
        for rec in bartlett_results['recommendations'][:2]:
            print(f"  ‚Ä¢ {rec}")
            
except Exception as e:
    print(f"Bartlett's test failed: {e}")

print("\n" + "-" * 60)

Bartlett's Test of Sphericity:
Chi-square statistic: -inf
Degrees of freedom: 34980
P-value: 0.500000
Significance level: Œ± = 0.05
Test result: ‚úó FAIL
Interpretation: Not significant - correlation matrix may not differ from identity matrix (poor for factor analysis)

Recommendations:
  ‚Ä¢ Factor analysis may not be appropriate - variables appear uncorrelated
  ‚Ä¢ Consider checking data quality and variable selection

------------------------------------------------------------


  chi_square = -(n_obs - 1 - (2 * n_vars + 5) / 6) * np.log(det_corr)


In [7]:
# Enhanced Sample Size Assessment
print("Enhanced Sample Size Assessment:")
print("=" * 40)

try:
    adequacy_results = validator.check_enhanced_sample_adequacy(efa_data)
    
    print(f"Overall Assessment: {adequacy_results['overall_adequacy']}")
    print(f"Sample Size: {adequacy_results['sample_size']} observations")
    print(f"Variables: {adequacy_results['n_variables']}")
    print(f"Ratio: {adequacy_results['ratio']:.1f}:1 (obs:vars)")
    
    print(f"\nDetailed Assessment:")
    for assessment in adequacy_results['assessments']:
        print(f"  ‚Ä¢ {assessment}")
    
    if adequacy_results['recommendations']:
        print(f"\nRecommendations:")
        for rec in adequacy_results['recommendations'][:3]:
            print(f"  ‚Ä¢ {rec}")
            
except Exception as e:
    print(f"Sample adequacy check failed: {e}")

print("\n" + "-" * 60)

Enhanced Sample Size Assessment:
Overall Assessment: Inadequate
Sample Size: 15 observations
Variables: 265
Ratio: 0.1:1 (obs:vars)

Detailed Assessment:
  ‚Ä¢ Comrey & Lee (1992): Unacceptable
  ‚Ä¢ Obs:Var ratio (0.1:1): Inadequate
  ‚Ä¢ Factor stability: Very Low

Recommendations:
  ‚Ä¢ Critical: Sample size below minimum threshold - analysis not recommended
  ‚Ä¢ Critical: Too few observations per variable - increase sample size
  ‚Ä¢ Target sample size: 3975 observations for stable results

------------------------------------------------------------


In [8]:
# Correlation Matrix Quality Check
print("Correlation Matrix Quality Assessment:")
print("=" * 40)

try:
    correlation_matrix = efa_analyzer.calculate_correlation_matrix(efa_data, method='pearson')
    singularity_results = validator.check_correlation_singularity(correlation_matrix)
    
    print(f"Matrix Condition: {singularity_results['severity']}")
    print(f"Determinant: {singularity_results['determinant']:.2e}")
    print(f"Condition Number: {singularity_results['condition_number']:.2e}")
    
    print(f"Singular: {'Yes' if singularity_results['is_singular'] else 'No'}")
    print(f"Near-singular: {'Yes' if singularity_results['is_near_singular'] else 'No'}")
    
    if singularity_results['perfect_correlations']:
        print(f"\nPerfect correlations found ({len(singularity_results['perfect_correlations'])}):")
        for var1, var2, corr in singularity_results['perfect_correlations'][:3]:
            print(f"  ‚Ä¢ {var1} ‚Üî {var2}: r = {corr:.3f}")
    
    if singularity_results['high_correlations']:
        print(f"\nHigh correlations found ({len(singularity_results['high_correlations'])}):")
        for var1, var2, corr in singularity_results['high_correlations'][:3]:
            print(f"  ‚Ä¢ {var1} ‚Üî {var2}: r = {corr:.3f}")
    
    print(f"\nInterpretation: {singularity_results['interpretation']}")
    
    if singularity_results['recommendations']:
        print(f"Recommendations:")
        for rec in singularity_results['recommendations'][:3]:
            print(f"  ‚Ä¢ {rec}")
            
except Exception as e:
    print(f"Correlation matrix check failed: {e}")

print("\n" + "-" * 60)

Correlation Matrix Quality Assessment:
Correlation matrix check failed: Correlation matrix is singular (determinant=-0.00e+00). Remove perfectly correlated or constant variables.

------------------------------------------------------------


In [9]:
# Validation Summary
print("STATISTICAL VALIDATION SUMMARY:")
print("=" * 50)

validation_passed = 0
total_tests = 4

# Collect all validation results
try:
    kmo_pass = kmo_results['is_adequate']
    bartlett_pass = bartlett_results['is_significant'] 
    adequacy_pass = adequacy_results['overall_adequacy'] in ['Adequate', 'Good', 'Excellent']
    matrix_pass = not (singularity_results['is_singular'] or singularity_results['is_near_singular'])
    
    print(f"1. KMO Test: {'‚úì PASS' if kmo_pass else '‚úó FAIL'} (KMO = {kmo_results['overall_kmo']:.3f})")
    print(f"2. Bartlett's Test: {'‚úì PASS' if bartlett_pass else '‚úó FAIL'} (p = {bartlett_results['p_value']:.4f})")
    print(f"3. Sample Adequacy: {'‚úì PASS' if adequacy_pass else '‚úó FAIL'} ({adequacy_results['overall_adequacy']})")
    print(f"4. Matrix Quality: {'‚úì PASS' if matrix_pass else '‚úó FAIL'} ({singularity_results['severity']})")
    
    validation_passed = sum([kmo_pass, bartlett_pass, adequacy_pass, matrix_pass])
    
    print(f"\nOverall Validation: {validation_passed}/{total_tests} tests passed")
    
    if validation_passed >= 3:
        print("üéØ VALIDATION RESULT: GOOD - Factor analysis can proceed")
        proceed_recommendation = "Proceed with factor analysis"
    elif validation_passed >= 2:
        print("‚ö†Ô∏è  VALIDATION RESULT: MARGINAL - Proceed with caution")
        proceed_recommendation = "Factor analysis possible but interpret results carefully"
    else:
        print("‚ùå VALIDATION RESULT: POOR - Factor analysis not recommended")
        proceed_recommendation = "Improve data quality before proceeding"
    
    print(f"Recommendation: {proceed_recommendation}")
    
except:
    print("Could not generate validation summary - check individual test results above")

print("\n" + "=" * 50)

STATISTICAL VALIDATION SUMMARY:
Could not generate validation summary - check individual test results above



## 3. Correlation Matrix Calculation

In [10]:
# Calculate correlation matrix using Pearson correlations
print("Calculating Correlation Matrix:")
print("=" * 40)

correlation_matrix = efa_analyzer.calculate_correlation_matrix(efa_data, method="pearson")

print(f"‚úì Correlation matrix calculated: {correlation_matrix.shape} (variables √ó variables)")
print(f"  - Method: Pearson correlations with pairwise deletion")
print(f"  - Range: {correlation_matrix.min().min():.3f} to {correlation_matrix.max().max():.3f}")

# Display correlation matrix summary
print("\nCorrelation Matrix Summary:")
mask = ~np.eye(correlation_matrix.shape[0], dtype=bool)  # Exclude diagonal
off_diag_corrs = correlation_matrix.values[mask]

print(f"  - Mean absolute correlation: {np.abs(off_diag_corrs).mean():.3f}")
print(f"  - Max absolute correlation: {np.abs(off_diag_corrs).max():.3f}")
print(f"  - Correlations > 0.3: {(np.abs(off_diag_corrs) > 0.3).sum()} ({(np.abs(off_diag_corrs) > 0.3).mean()*100:.1f}%)")
print(f"  - Correlations > 0.7: {(np.abs(off_diag_corrs) > 0.7).sum()} ({(np.abs(off_diag_corrs) > 0.7).mean()*100:.1f}%)")

Calculating Correlation Matrix:


ValueError: Correlation matrix is singular (determinant=-0.00e+00). Remove perfectly correlated or constant variables.

## 4. Factor Analysis Execution

In [11]:
# Perform complete factor analysis
print("Performing Factor Analysis:")
print("=" * 40)

try:
    factor_solution = efa_analyzer.fit(efa_data)
    
    print(f"‚úì Factor analysis completed successfully")
    print(f"  - Extraction method: {factor_solution.extraction_method}")
    print(f"  - Rotation method: {factor_solution.rotation_method}")
    print(f"  - Number of factors extracted: {factor_solution.n_factors}")
    print(f"  - Total variance explained: {factor_solution.variance_explained['total_variance_explained']*100:.1f}%")
    
except Exception as e:
    print(f"‚ùå Factor analysis failed: {str(e)}")
    factor_solution = None

Performing Factor Analysis:
‚ùå Factor analysis failed: Correlation matrix is singular (determinant=-0.00e+00). Remove perfectly correlated or constant variables.


## 5. Results Analysis and Interpretation

In [12]:
if factor_solution is not None:
    print("Factor Analysis Results:")
    print("=" * 40)
    
    # Display eigenvalues and variance explained
    print("\nEigenvalues and Variance Explained:")
    for i, (eigenval, prop_var, cum_var) in enumerate(zip(
        factor_solution.variance_explained['eigenvalues'],
        factor_solution.variance_explained['proportion_variance'],
        factor_solution.variance_explained['cumulative_variance']
    )):
        print(f"  Factor {i+1}: Œª={eigenval:.3f}, Prop.Var={prop_var*100:.1f}%, Cum.Var={cum_var*100:.1f}%")
    
    # Display factor loadings
    print("\nFactor Loadings Matrix (first 10 variables):")
    display_loadings = factor_solution.loadings.head(10).round(3)
    print(display_loadings.to_string())
    
    # Highlight significant loadings (>0.4)
    print("\nSignificant Factor Loadings (|loading| > 0.4):")
    for factor in factor_solution.loadings.columns:
        significant = factor_solution.loadings[factor][np.abs(factor_solution.loadings[factor]) > 0.4]
        if len(significant) > 0:
            print(f"\n{factor}:")
            for var, loading in significant.sort_values(key=abs, ascending=False).items():
                print(f"  {var}: {loading:.3f}")
        else:
            print(f"\n{factor}: No significant loadings found")
    
    # Display communalities
    print("\nCommunalities (first 10 variables):")
    display_communalities = factor_solution.communalities.head(10).round(3)
    for var, comm in display_communalities.items():
        print(f"  {var}: {comm:.3f}")
    
    print(f"\nCommunalities Summary:")
    print(f"  - Mean: {factor_solution.communalities.mean():.3f}")
    print(f"  - Range: {factor_solution.communalities.min():.3f} to {factor_solution.communalities.max():.3f}")
    print(f"  - Variables with comm. > 0.5: {(factor_solution.communalities > 0.5).sum()}/{len(factor_solution.communalities)}")

## 6. Factor Interpretation

In [13]:
if factor_solution is not None:
    print("Factor Interpretation:")
    print("=" * 40)
    
    try:
        interpretation = efa_analyzer.get_factor_interpretation(factor_solution, loading_threshold=0.4)
        
        # Overall structure quality
        structure = interpretation['structure_summary']
        print(f"Simple Structure Quality: {structure['simple_structure_quality']}")
        print(f"Variables assigned to factors: {structure['assigned_variables']}/{structure['total_variables']}")
        print(f"Unassigned variables: {structure['unassigned_variables']}")
        
        # Factor-by-factor interpretation
        for factor_name, factor_info in interpretation['factor_interpretations'].items():
            print(f"\n{factor_name}:")
            print(f"  - Interpretation quality: {factor_info['interpretation_quality']}")
            print(f"  - Variables with significant loadings: {factor_info['n_significant']}")
            print(f"  - Maximum loading: {factor_info['max_loading']:.3f}")
            
            if factor_info['high_loading_variables']:
                print("  - High loading variables:")
                for var, loading in list(factor_info['high_loading_variables'].items())[:5]:  # Top 5
                    print(f"    ‚Ä¢ {var}: {loading:.3f}")
                    
    except Exception as e:
        print(f"Could not generate interpretation: {e}")

## 7. Factor Scores

In [14]:
if factor_solution is not None and factor_solution.factor_scores is not None:
    print("Factor Scores Summary:")
    print("=" * 40)
    
    scores = factor_solution.factor_scores
    
    print(f"Factor scores calculated for {len(scores)} observations")
    print(f"Method: Regression method (precise estimation)")
    
    print("\nFactor Score Statistics:")
    for factor in scores.columns:
        mean_score = scores[factor].mean()
        std_score = scores[factor].std()
        print(f"  {factor}: M={mean_score:.3f}, SD={std_score:.3f}")
    
    # Display first few factor scores
    print("\nFirst 5 Respondents' Factor Scores:")
    print(scores.head().round(3).to_string())
    
    # Factor score correlations (should be low for oblique rotation)
    print("\nFactor Score Correlations:")
    factor_corrs = scores.corr().round(3)
    print(factor_corrs.to_string())
else:
    print("Factor scores not available")

Factor scores not available


## 8. Summary and Next Steps

In [15]:
print("Basic EFA Analysis Summary:")
print("=" * 50)

if factor_solution is not None:
    success_criteria = {
        'factors_extracted': factor_solution.n_factors,
        'variance_explained': factor_solution.variance_explained['total_variance_explained'] * 100,
        'target_variance': 60.0,  # From success criteria SC-001
        'target_factors': (3, 8),  # From success criteria SC-001
        'factor_range_ok': 3 <= factor_solution.n_factors <= 8,
        'variance_ok': factor_solution.variance_explained['total_variance_explained'] >= 0.60
    }
    
    print(f"‚úì Number of factors: {success_criteria['factors_extracted']} (target: {success_criteria['target_factors'][0]}-{success_criteria['target_factors'][1]})")
    print(f"‚úì Variance explained: {success_criteria['variance_explained']:.1f}% (target: ‚â•{success_criteria['target_variance']}%)")
    print(f"‚úì Factor count within range: {'YES' if success_criteria['factor_range_ok'] else 'NO'}")
    print(f"‚úì Variance target met: {'YES' if success_criteria['variance_ok'] else 'NO'}")
    
    overall_success = success_criteria['factor_range_ok'] and success_criteria['variance_ok']
    print(f"\nüéØ MVP Success Criteria: {'PASSED ‚úì' if overall_success else 'PARTIAL ‚ö†Ô∏è'}")
    
    if overall_success:
        print("\nüéâ Basic Factor Discovery (US1) completed successfully!")
        print("Ready to proceed to User Story 2: Statistical Validation")
    else:
        print("\n‚ö†Ô∏è Some success criteria not met - review data quality and sample size")
else:
    print("‚ùå Factor analysis failed - check data quality and dependencies")

# Show any accumulated warnings
warning_mgr = get_warning_manager()
summary = warning_mgr.get_summary()
if summary['total'] > 0:
    print(f"\nWarnings generated: {summary['total']}")
    print(f"By level: {summary['by_level']}")

print("\nNext Steps:")
print("1. Install scipy and factor-analyzer for full functionality")
print("2. Run statistical validation (User Story 2)")
print("3. Create visualizations (User Story 3)")
print("4. Apply to real soc opros survey data")

Basic EFA Analysis Summary:
‚ùå Factor analysis failed - check data quality and dependencies

Next Steps:
1. Install scipy and factor-analyzer for full functionality
2. Run statistical validation (User Story 2)
3. Create visualizations (User Story 3)
4. Apply to real soc opros survey data
