# In [None]:
"""
Factor Analysis and Visualization Script

This script performs Generalized Linear Model (GLM) analyses on specified datasets to study the impact of various factors 
(e.g., Literary Quality, General Quality, Genre, Gender) on dependent variables (e.g., Staging_z, PlotProgression_z, 
and CognitiveTension_z). It saves the results and visualizations in a structured folder hierarchy based on the execution date.

Key Features:
- Processes multiple datasets and performs GLM analysis for each dependent variable.
- Saves visualizations (e.g., plots) in separate folders organized by dependent variable and dataset.
- Generates a summary CSV file containing statistical results (AIC, BIC, Pseudo R-squared, coefficients).

Folder Structure:
- Root folder named `Results Factor Analysis_{execution_date}`.
- Subfolders for each dependent variable (e.g., `Plot Progression`).
- Dataset-specific folders within dependent variable folders (e.g., `5 segments`).

Dependencies:
- Python libraries: pandas, matplotlib, seaborn, statsmodels, numpy.

Usage:
- Ensure datasets are formatted as CSV files and placed in the current working directory (the filenames in `datasets` are opened as given).
- Modify the `datasets` list with the correct filenames and descriptive names.
- Run the script to generate results and visualizations.

Author: Maaike de Jongh
Date: 2025-01-06
"""

import os
import pandas as pd
import ast
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.formula.api as smf
import statsmodels.api as sm
import numpy as np
from datetime import datetime

# Datasets to analyse: each entry pairs a CSV filename with the label used
# in plot titles, folder names and the summary table.
datasets = [
    dict(filename=csv_file, name=label)
    for csv_file, label in (
        ('5_segments.csv', '5 segments'),
        ('10_segments.csv', '10 segments'),
        ('1000_segments.csv', '1000 words'),
        ('500_segments.csv', '500 words'),
    )
]

# Accumulator that the GLM routine appends one record to per fitted model
results_summary = list()

# Output root, stamped with the date on which the script runs
execution_date = f'{datetime.now():%Y-%m-%d}'
root_directory = 'Results Factor Analysis_' + execution_date
os.makedirs(root_directory, exist_ok=True)

# Human-readable titles for the dependent-variable column names
dep_var_titles = dict(
    Staging_z='Staging',
    PlotProgression_z='Plot Progression',
    CognitiveTension_z='Cognitive Tension',
)

# Fixed line colours for the Low/Medium/High categories
category_colors = dict(
    Low='#1f77b4',     # blue
    Medium='#ff7f0e',  # orange
    High='#2ca02c',    # green
)

# Display order per categorical column (each column gets its own list)
category_order = {
    column: ['High', 'Medium', 'Low']
    for column in ('literary_category', 'general_category')
}

def _ordered_levels(values, fixed_order=None):
    """Sort factor levels, by `fixed_order` when given, else naturally."""
    if fixed_order is None:
        return sorted(values)
    # Labels missing from the fixed order are placed last instead of raising,
    # matching the colour lookup, which also tolerates unknown labels.
    return sorted(
        values,
        key=lambda level: fixed_order.index(level) if level in fixed_order else len(fixed_order),
    )


def _build_prediction_grid(df, formula, segment_values):
    """Build the predictor frame on which the fitted model is evaluated.

    Returns:
        A ``(pred_data, group_column)`` tuple. ``group_column`` names the
        column that separates the plotted lines; it is None for the
        genre-by-gender interaction. Both are None when the formula mentions
        none of the known factors.
    """
    def crossed(column, levels):
        # Every level is paired with the complete range of segment values.
        return pd.DataFrame({
            'Relative_Segment': np.tile(segment_values, len(levels)),
            column: np.repeat(levels, len(segment_values))
        })

    def observed_levels(column):
        # Missing values are left out: they cannot be sorted together with
        # the labels and there is nothing meaningful to predict for them.
        return df[column].dropna().unique()

    for column in ('literary_category', 'general_category'):
        if column in formula:
            levels = _ordered_levels(list(observed_levels(column)), category_order.get(column))
            return crossed(column, levels), column

    for column in ('Literary_Quality', 'General_Quality'):
        if column in formula:
            # Continuous score: evaluate at its minimum, midpoint and maximum.
            values = np.linspace(df[column].min(), df[column].max(), 3)
            return crossed(column, values), column

    if 'genre * Gender' in formula:
        genres = observed_levels('genre')
        genders = observed_levels('Gender')
        pred_data = pd.DataFrame({
            'Relative_Segment': np.tile(segment_values, len(genres) * len(genders)),
            'genre': np.repeat(genres, len(segment_values) * len(genders)),
            'Gender': np.tile(np.repeat(genders, len(segment_values)), len(genres))
        })
        return pred_data, None

    for column in ('Gender', 'genre'):
        if column in formula:
            return crossed(column, sorted(observed_levels(column))), column

    return None, None


def _plot_path(dependent_variable, name, metric_label, suffix=''):
    """Create `<root>/<dependent variable>/<dataset>` and return the PNG path inside it."""
    title = dep_var_titles.get(dependent_variable, dependent_variable)
    dataset_dir = os.path.join(root_directory, title, name)
    os.makedirs(dataset_dir, exist_ok=True)
    return os.path.join(dataset_dir, f'{title}_{name}_{metric_label.replace(" ", "")}{suffix}.png')


def _save_interaction_plot(pred_data, pred_column, dependent_variable, metric_label, name):
    """Draw one predicted curve per genre/gender cell in a facet grid and save it."""
    g = sns.FacetGrid(pred_data, row='genre', col='Gender', margin_titles=True, despine=False)
    # The figure is taken from one of the grid's axes so that exactly this
    # figure is closed below, whatever the current pyplot figure is.
    facet_figure = g.axes.flat[0].figure
    try:
        g.map_dataframe(sns.lineplot, x='Relative_Segment', y=pred_column, color="b")

        g.set_axis_labels("Relative Segment", dep_var_titles.get(dependent_variable, dependent_variable))
        g.set_titles(row_template="Genre: {row_name}", col_template="Gender: {col_name}")

        # Highlight grid lines and axis ticks for clarity
        for ax in g.axes.flat:
            ax.grid(True, which='both', linestyle='--', linewidth=0.5)

        png_filename = _plot_path(dependent_variable, name, metric_label, '_facetgrid')
        g.savefig(png_filename, dpi=300)
    finally:
        # Release the figure even when drawing or saving fails.
        plt.close(facet_figure)
    print(f"FacetGrid graph saved in {png_filename}")


def _save_line_plot(pred_data, pred_column, group_column, dependent_variable, metric_label, name):
    """Draw one predicted curve per level of `group_column` on shared axes and save it."""
    accent = "#0209ef"
    title = dep_var_titles.get(dependent_variable, dependent_variable)
    # Only the Low/Medium/High category columns have a fixed order and palette.
    fixed_order = category_order.get(group_column)
    levels = _ordered_levels(np.unique(pred_data[group_column]), fixed_order)

    fig, ax = plt.subplots(figsize=(10, 6))
    try:
        for idx, value in enumerate(levels):
            subset = pred_data[pred_data[group_column] == value]
            if fixed_order is not None:
                color = category_colors.get(value, '#000000')
            else:
                color = plt.cm.tab10(idx % 10)
            ax.plot(subset['Relative_Segment'], subset[pred_column], label=f'{metric_label}: {value}', color=color)

        ax.set_xlabel('Relative Segment', fontsize=14, color=accent)
        ax.set_ylabel(title, fontsize=14, color=accent)
        ax.set_title(f'{title} at different {metric_label} scores ({name})', fontsize=16, color=accent)

        # Customize ticks, spines, and legend
        ax.tick_params(axis='x', colors=accent)
        ax.tick_params(axis='y', colors=accent)
        for spine in ax.spines.values():
            spine.set_color(accent)

        legend = ax.legend()
        for text in legend.get_texts():
            text.set_color(accent)
        ax.grid(True)

        png_filename = _plot_path(dependent_variable, name, metric_label)
        fig.savefig(png_filename, dpi=300)
    finally:
        # Release the figure even when drawing or saving fails.
        plt.close(fig)
    print(f"Graph saved in {png_filename}")


def run_glm_and_save_results(df, formula, dependent_variable, metric_label, filename_suffix, name):
    """Fit a Gaussian GLM, record its statistics and plot its predictions.

    The fit statistics and coefficient table are appended to the module-level
    `results_summary` list. Predictions over 100 evenly spaced
    `Relative_Segment` values are then plotted per factor level and written
    as a PNG below `root_directory`.

    Args:
        df: Data containing `Relative_Segment`, the dependent variable and
            every factor named in `formula`.
        formula: Full model formula (``'<dependent> ~ <terms>'``). Its text
            also decides which factor the prediction grid is built for.
        dependent_variable: Name of the response column.
        metric_label: Readable factor name used in legends, titles, file
            names and messages.
        filename_suffix: Not used by this function; kept so existing calls
            remain valid.
        name: Dataset label used in titles, folder names and file names.

    Returns:
        None. Any exception is caught and reported with ``print`` so that one
        failing model does not abort the remaining analyses.
    """
    try:
        results = smf.glm(formula=formula, data=df, family=sm.families.Gaussian()).fit()

        # Save summary information; the coefficient table is stored as plain
        # lists so it can be rebuilt into a DataFrame later.
        coef_summary = results.summary2().tables[1]
        results_summary.append({
            'Dataset': name,
            'Dependent_Variable': dependent_variable,
            'Metric': metric_label,
            'AIC': results.aic,
            'BIC': results.bic,
            'Pseudo_R2': 1 - results.deviance / results.null_deviance,
            'Coefficients': {
                'index': coef_summary.index.tolist(),
                'columns': coef_summary.columns.tolist(),
                'data': coef_summary.values.tolist()
            }
        })

        # Prepare predictions
        segment_values = np.linspace(df['Relative_Segment'].min(), df['Relative_Segment'].max(), 100)
        pred_data, group_column = _build_prediction_grid(df, formula, segment_values)
        if pred_data is None:
            # No known factor in the formula: nothing to plot.
            return

        pred_column = f'{dependent_variable}_pred'
        pred_data[pred_column] = results.predict(pred_data)

        # Plot and save
        if 'genre * Gender' in formula:
            _save_interaction_plot(pred_data, pred_column, dependent_variable, metric_label, name)
        else:
            _save_line_plot(pred_data, pred_column, group_column, dependent_variable, metric_label, name)

    except Exception as e:
        # Deliberately best-effort: report and carry on with the next model.
        print(f"Error in GLM {metric_label} ({name}): {e}")

# Model specifications, built once because they do not depend on the dataset.
# Every model shares the linear and quadratic segment terms; only the factor
# that is combined with the quadratic term differs.
_SEGMENT_TERMS = 'np.power(Relative_Segment, 1) + np.power(Relative_Segment, 2)'
factors = [
    (factor_name, f'{_SEGMENT_TERMS} * {predictors}', label)
    for factor_name, predictors, label in (
        ('Literary_Quality', 'Literary_Quality', 'Literary Quality'),
        ('literary_category', 'literary_category', 'Literary Category'),
        ('General_Quality', 'General_Quality', 'General Quality'),
        ('general_category', 'general_category', 'General Category'),
        ('genre', 'genre', 'Genre'),
        ('Gender', 'Gender', 'Gender'),
        ('genre_gender_interaction', 'genre * Gender', 'Genre and Gender Interaction'),
    )
]

dependent_variables = ['Staging_z', 'PlotProgression_z', 'CognitiveTension_z']

# Iterate over each dataset
for data_info in datasets:
    filename = data_info['filename']
    name = data_info['name']

    print(f"Analyzing dataset: {filename} ({name}).")

    # Read data
    df = pd.read_csv(filename)

    # Add categorical columns if not already present: three quantile-based
    # bins of the corresponding quality score. This happens before any subset
    # is taken so that every frame used below carries the same columns.
    for derived_column, source_column in (
        ('literary_category', 'Literary_Quality'),
        ('general_category', 'General_Quality'),
    ):
        if derived_column not in df.columns:
            df[derived_column] = pd.qcut(df[source_column], q=3, labels=['Low', 'Medium', 'High'])

    # Rows restricted to the two gender labels used by the interaction model
    df_genderfilter = df[df['Gender'].isin(['male', 'female'])]

    for dependent_variable in dependent_variables:
        for factor_name, formula, label in factors:
            full_formula = f'{dependent_variable} ~ {formula}'
            # NOTE(review): only the genre-by-gender interaction uses the
            # gender-filtered rows; the Gender-only model runs on all rows,
            # including any other Gender labels. Confirm this is intended.
            model_df = df_genderfilter if 'genre * Gender' in formula else df
            run_glm_and_save_results(model_df, full_formula, dependent_variable, label, factor_name, name)


# Parse and expand coefficients: rebuild each stored coefficient table as a
# DataFrame and tag its rows with the model they belong to.
expanded_coeffs = []

for result in results_summary:
    coef_dict = result['Coefficients']
    coef_df = pd.DataFrame(coef_dict['data'], columns=coef_dict['columns'], index=coef_dict['index'])
    coef_df = coef_df.reset_index().rename(columns={'index': 'Parameter'})
    coef_df['Dataset'] = result['Dataset']
    coef_df['Dependent_Variable'] = result['Dependent_Variable']
    coef_df['Metric'] = result['Metric']
    expanded_coeffs.append(coef_df)

results_csv_path = os.path.join(root_directory, 'glm_results_summary.csv')

if expanded_coeffs:
    # Combine expanded coefficients
    expanded_coeffs_df = pd.concat(expanded_coeffs, ignore_index=True)

    # One row per coefficient, each repeating the fit statistics of its model
    results_df = pd.DataFrame(results_summary).drop(columns=['Coefficients'])
    merged_df = results_df.merge(
        expanded_coeffs_df,
        on=['Dataset', 'Dependent_Variable', 'Metric'],
        how='inner'
    )

    # Save results in a CSV file
    merged_df.to_csv(results_csv_path, index=False)
    print(f"Summary results saved in {results_csv_path}.")
else:
    # Every model failed (failures are only printed by the GLM routine), so
    # there is nothing to concatenate; report it rather than crash.
    print(f"No GLM results were collected; {results_csv_path} was not written.")
