In [13]:
import matplotlib
matplotlib.use('TkAgg')  # Or 'Qt5Agg', 'WXAgg', etc.
import matplotlib.pyplot as plt
import pandas as pd
import scikit_posthocs as sp
from imblearn.over_sampling import SMOTE
from collections import Counter
from sklearn.preprocessing import OrdinalEncoder
import seaborn as sns
import scipy.stats as stats
import warnings

# --- Variable Encoding Function ---
def encode_variables(df):
    """
    Encodes the DataFrame variables according to the specified plan.

    The input DataFrame is not modified; all work happens on a copy.

    Encoding plan:
        - anxiety_level, sex, education_level, age_range, seniority_range:
          integer codes that follow the label order defined in this function.
        - shift, marital_status, category: one-hot encoded with the prefixes
          "shift", "marital" and "category". Before encoding, "widowed" is
          merged into "single", and "spec nurse" / "head nurse" into "other".

    Args:
        df: pandas DataFrame with the original data.

    Returns:
        DataFrame: DataFrame with encoded variables.

    Raises:
        KeyError: If one of the expected columns is missing.
        ValueError: If an integer-coded column contains a label that is not
            part of its mapping, or if age_range / seniority_range contain
            missing values (they cannot be cast to int).
    """
    df_encoded = df.copy()

    # 1-3. Columns coded through an explicit label -> code mapping.
    #      Missing values stay missing; unknown labels raise (see _map_labels).
    label_maps = {
        "anxiety_level": {"minimal": 0, "mild": 1, "moderate": 2, "severe": 3},
        "sex": {"female": 0, "male": 1},
        "education_level": {"technical level": 0, "bachelor": 1, "graduate": 2},
    }
    for column, mapping in label_maps.items():
        df_encoded[column] = _map_labels(df_encoded[column], mapping)

    # 4. shift: One-Hot Encoding
    df_encoded = pd.get_dummies(
        df_encoded, columns=["shift"], prefix="shift", dummy_na=False
    )

    # 5. marital_status: One-Hot Encoding, with 'widowed' grouped into 'single'
    df_encoded["marital_status"] = df_encoded["marital_status"].replace(
        "widowed", "single"
    )
    df_encoded = pd.get_dummies(
        df_encoded, columns=["marital_status"], prefix="marital", dummy_na=False
    )

    # 6. category: One-Hot Encoding, with the two small roles grouped as 'other'
    df_encoded["category"] = df_encoded["category"].replace(
        ["spec nurse", "head nurse"], "other"
    )
    df_encoded = pd.get_dummies(
        df_encoded, columns=["category"], prefix="category", dummy_na=False
    )

    # 7-8. Range columns: the code is the position of the label in its ordered list.
    ordered_ranges = {
        "age_range": ["20 to 29", "30 to 39", "40 to 49", "50 and over"],
        "seniority_range": [
            "1 to 5",
            "6 to 10",
            "11 to 15",
            "16 to 20",
            "21 and over",
        ],
    }
    for column, ordered_labels in ordered_ranges.items():
        codes = {label: code for code, label in enumerate(ordered_labels)}
        # astype(int) raises ValueError if a missing value is still present.
        df_encoded[column] = _map_labels(df_encoded[column], codes).astype(int)

    return df_encoded


def _map_labels(series, mapping):
    """Map labels to codes, raising ValueError instead of yielding NaN for unknown labels."""
    observed = series.dropna().unique()
    unknown = [label for label in observed if label not in mapping]
    if unknown:
        raise ValueError(
            f"Column '{series.name}' contains unexpected value(s): {unknown}. "
            f"Expected one of {list(mapping)}."
        )
    return series.map(mapping)


# --- Data Augmentation Function with SMOTE ---
def augment_data_smote(
    df, target_variable, sampling_strategy="auto", random_state=None, k_neighbors=None
):
    """
    Applies SMOTE to oversample the target variable, with explicit input checks
    and automatic k_neighbors adjustment.

    All failures are reported with a printed message and a (None, None) return
    value; no exception from the checks or from SMOTE's ValueError escapes.

    Args:
        df: pandas DataFrame with the encoded data.
        target_variable: Name of the column containing the (categorical) target variable.
        sampling_strategy: Sampling strategy, passed through to SMOTE.
        random_state: Random seed, passed through to SMOTE.
        k_neighbors: Requested number of neighbors. Defaults to 5 when None. The
            value actually used is capped at (size of the smallest class - 1).

    Returns:
        DataFrame, Series: DataFrame with augmented predictor variables and Series with the
                         augmented target variable.  Or None, None if there's an error.
    """

    X = df.drop(target_variable, axis=1)
    y = df[target_variable]

    if not pd.api.types.is_numeric_dtype(y):
        print("Error: The target variable must be numeric (ordinally encoded).")
        return None, None

    if len(y.unique()) < 2:
        print("Error: The target variable must have at least two different classes.")
        return None, None

    class_counts = Counter(y)
    min_samples = min(class_counts.values())

    # Checked first: a class with a single sample has no neighbor to interpolate
    # with, whatever k_neighbors is. (This check used to sit after the
    # k_neighbors one, where it could never be reached.)
    if min_samples <= 1:
        print("Error: At least one class has only one sample. SMOTE cannot be applied.")
        print("Consider grouping minority classes or removing the solitary sample before applying SMOTE")
        return None, None

    requested_neighbors = 5 if k_neighbors is None else k_neighbors
    if requested_neighbors < 1:
        # A caller mistake, not a data problem, so it gets its own message.
        print(f"Error: k_neighbors must be at least 1 (got {requested_neighbors}).")
        return None, None

    # min_samples >= 2 here, so the cap never pushes k_neighbors below 1.
    k_neighbors = min(requested_neighbors, min_samples - 1)

    print(f"Using k_neighbors = {k_neighbors} for SMOTE.")

    try:
        smote = SMOTE(
            sampling_strategy=sampling_strategy,
            random_state=random_state,
            k_neighbors=k_neighbors,
        )
        X_resampled, y_resampled = smote.fit_resample(X, y)
    except ValueError as e:
        print(f"Error applying SMOTE: {e}")
        return None, None

    print("Class distribution before SMOTE:", class_counts)
    print("Class distribution after SMOTE:", Counter(y_resampled))

    return X_resampled, y_resampled


def analyze_spearman_correlation(df, target_variable):
    """
    Reports how the columns of ``df`` rank-correlate with the target variable.

    The full Spearman matrix is computed once. The target's column of that
    matrix (minus its self-correlation) is printed in descending order, and
    two figures are written to the current working directory:

        - spearman_correlation_heatmap.png: annotated heatmap of the full matrix.
        - spearman_correlation_barplot.png: horizontal bars of the correlations
          with the target, sorted ascending.

    Args:
        df: pandas DataFrame with the data (already augmented and encoded).
        target_variable: Name of the column containing the (ordinal) target variable.

    Returns:
        None. Results are printed and saved as PNG files; no window is shown.
    """
    corr_matrix = df.corr(method="spearman")

    # Keep only the target's column and drop its trivial self-correlation entry.
    target_corr = corr_matrix[target_variable].drop(target_variable)

    print("Spearman Correlations with", target_variable, ":\n")
    strongest_first = target_corr.sort_values(ascending=False)
    print(strongest_first)

    # Figure 1: heatmap over all variable pairs.
    plt.figure(figsize=(12, 10))
    sns.heatmap(corr_matrix, annot=True, cmap="coolwarm", center=0)
    plt.title("Spearman Correlation Heatmap")
    plt.tight_layout()
    plt.savefig("spearman_correlation_heatmap.png")
    plt.close()  # release the figure before starting the next one

    # Figure 2: correlations with the target only.
    plt.figure(figsize=(8, 6))
    weakest_first = target_corr.sort_values()
    weakest_first.plot(kind="barh", color="skyblue")
    plt.title(f"Spearman Correlation with {target_variable}")
    plt.xlabel("Spearman Correlation Coefficient")
    plt.ylabel("Variables")
    plt.tight_layout()
    plt.savefig("spearman_correlation_barplot.png")
    plt.close()


def inferential_analysis(df, target_variable):
    """
    Runs Kruskal-Wallis and Chi-square tests between the target variable and
    the other columns of the DataFrame, printing every result.

    Kruskal-Wallis is applied to each numeric column other than the target
    whose name does not start with a one-hot prefix ("shift_", "marital_",
    "category_"). The rows are split by target level and the column's values
    are compared across those groups. When p < 0.05, Dunn's post-hoc test with
    Bonferroni adjustment is printed as well.

    Chi-square is applied to a fixed list of column names (sex,
    education_level and the one-hot columns), restricted to those present in
    ``df``. When p < 0.05, Cramér's V is printed too.

    Args:
        df: pandas DataFrame with the data (already augmented and encoded).
        target_variable: Name of the column containing the (ordinal) target variable.

    Returns:
        None (prints the test results).
    """
    rule = "-" * 50
    one_hot_prefixes = ("shift_", "marital_", "category_")

    # --- 1. Kruskal-Wallis (for numerical/ordinal variables) ---
    print(rule)
    print("KRUSKAL-WALLIS TESTS")
    print(rule)

    ordinal_columns = []
    for name in df.columns:
        if not pd.api.types.is_numeric_dtype(df[name]):
            continue
        if name == target_variable:
            continue
        if name.startswith(one_hot_prefixes):
            continue
        ordinal_columns.append(name)

    for variable in ordinal_columns:
        # One sample of `variable` per observed level of the target.
        target = df[target_variable]
        samples = [df[variable][target == level] for level in target.unique()]

        if len(samples) < 2:
            print(
                f"Cannot perform Kruskal-Wallis on {variable}: fewer than two groups."
            )
            continue
        if 0 in [len(sample) for sample in samples]:
            print(
                f"Cannot perform Kruskal-Wallis on {variable}: at least one group is empty."
            )
            continue

        try:
            statistic, p_value = stats.kruskal(*samples)
        except ValueError as e:
            print(f"Error performing Kruskal-Wallis on {variable}: {e}")
            print("This may occur if all values in a group are equal.")
            continue

        print(f"\nKruskal-Wallis: {variable} vs. {target_variable}")
        print(f"  H Statistic: {statistic:.3f}")
        print(f"  p-value: {p_value:.3f}")

        # Written as "not <" so that a NaN p-value is reported as non-significant.
        if not p_value < 0.05:
            print(
                f"  Result: No significant differences between levels of {target_variable} on variable {variable}."
            )
            continue

        print(
            f"  Result: Significant differences between levels of {target_variable} on variable {variable}."
        )

        # Post-hoc: Dunn's pairwise comparisons, Bonferroni-adjusted.
        pairwise = sp.posthoc_dunn(
            a=df,
            val_col=variable,
            group_col=target_variable,
            p_adjust="bonferroni",
        )
        print("\n  Dunn's Test (Post-Hoc):")
        print(pairwise)

    # --- 2. Chi-square (for categorical variables) ---
    print("\n" + rule)
    print("CHI-SQUARE TESTS")
    print(rule)

    # Fixed list of categorical columns (binary, ordinal and one-hot encoded).
    chi_square_candidates = (
        "sex",
        "shift_afternoon",
        "shift_morning",
        "shift_night a",
        "shift_night b",
        "marital_domestic partnership",
        "marital_married",
        "marital_single",
        "category_gen nurse",
        "category_nurse aux",
        "category_other",
        "education_level",
    )
    categorical_columns = [
        name for name in df.columns if name in chi_square_candidates
    ]

    for variable in categorical_columns:
        table = pd.crosstab(df[target_variable], df[variable])
        print(f"\nContingency Table: {target_variable} vs. {variable}")
        print(table)

        try:
            chi2, p_value, dof, _expected = stats.chi2_contingency(table)
            print(f"\nChi-square: {variable} vs. {target_variable}")
            print(f"  Chi2 Statistic: {chi2:.3f}")
            print(f"  p-value: {p_value:.3f}")
            print(f"  Degrees of freedom: {dof}")

            if p_value < 0.05:
                print(
                    f"  Result: Significant association between {target_variable} and {variable}."
                )
                print(f"  Cramer's V: {_corrected_cramers_v(chi2, table):.3f}")
            else:
                print(
                    f"  Result: No significant association between {target_variable} and {variable}."
                )
        except ValueError as e:
            print(f"Error performing Chi-square on {variable}: {e}")
            print("This may occur if any expected frequencies are zero.")
            continue


def _corrected_cramers_v(chi2, table):
    """Return Cramér's V for a contingency table, with phi² and both dimensions corrected by n - 1 terms."""
    total = table.sum().sum()
    n_rows, n_cols = table.shape
    phi2 = chi2 / total
    phi2_corrected = max(0, phi2 - ((n_cols - 1) * (n_rows - 1)) / (total - 1))
    rows_corrected = n_rows - ((n_rows - 1) ** 2) / (total - 1)
    cols_corrected = n_cols - ((n_cols - 1) ** 2) / (total - 1)
    return (phi2_corrected / min((cols_corrected - 1), (rows_corrected - 1))) ** 0.5



In [16]:
# --- Main Flow ---

# 1. Load Data
# NOTE(review): absolute, machine-specific Windows path; this cell only runs
# where the CSV exists at exactly this location.
df = pd.read_csv(r"D:\ansiedad\AnxietyLevelByCovid.csv")

In [17]:
# 2. Encode Variables
# encode_variables works on a copy, so the raw `df` stays available unchanged.
df_encoded = encode_variables(df)


In [18]:
# 3. Augment Data
# Bring every anxiety class (0 = minimal, 1 = mild, 2 = moderate, 3 = severe)
# up to the same number of rows.
sampling_strategy = {level: 100 for level in range(4)}

X_resampled, y_resampled = augment_data_smote(
    df_encoded,
    target_variable="anxiety_level",
    sampling_strategy=sampling_strategy,
    random_state=42,
    # k_neighbors is left unset (optional); the function then derives it itself.
)

Using k_neighbors = 5 for SMOTE.
Class distribution before SMOTE: Counter({0: 62, 1: 49, 2: 22, 3: 7})
Class distribution after SMOTE: Counter({1: 100, 0: 100, 2: 100, 3: 100})


In [19]:
# 4. Create Augmented DataFrame
# augment_data_smote signals failure by returning None for both values.
if X_resampled is None or y_resampled is None:
    print("Data augmentation failed. Check for errors.")
else:
    # Re-attach the resampled target to the resampled predictors.
    df_augmented = pd.DataFrame(X_resampled, columns=X_resampled.columns)
    df_augmented["anxiety_level"] = y_resampled

    # Quick look at the combined frame.
    print("\nAugmented DataFrame:")
    print(df_augmented.head())
    print(df_augmented.shape)

    # Correlation analysis first, then the hypothesis tests.
    analyze_spearman_correlation(df_augmented, "anxiety_level")
    inferential_analysis(df_augmented, "anxiety_level")


Augmented DataFrame:
   sex  education_level  age_range  seniority_range  shift_afternoon  \
0    0                1          0                0            False   
1    0                2          2                3            False   
2    1                2          1                2            False   
3    0                1          2                3            False   
4    0                2          1                2            False   

   shift_morning  shift_night a  shift_night b  marital_domestic partnership  \
0          False           True          False                          True   
1           True          False          False                         False   
2           True          False          False                         False   
3           True          False          False                          True   
4           True          False          False                         False   

   marital_married  marital_single  category_gen nurse  category

In [20]:
# Notebook expression: displays the (rows, columns) tuple that step 4 above already prints.
df_augmented.shape

(400, 15)

In [21]:
# Repeats the call already made in step 4 above, so that the printed correlations
# appear in their own cell; the two PNG files are written again under the same names.
analyze_spearman_correlation(df_augmented, "anxiety_level")

Spearman Correlations with anxiety_level :

education_level                 0.085992
category_other                 -0.065167
marital_single                 -0.067109
shift_night a                  -0.090665
shift_night b                  -0.116411
category_gen nurse             -0.144150
shift_afternoon                -0.155972
shift_morning                  -0.160083
sex                            -0.170128
marital_domestic partnership   -0.182868
marital_married                -0.198373
age_range                      -0.208170
category_nurse aux             -0.214688
seniority_range                -0.267693
Name: anxiety_level, dtype: float64


## Spearman Correlation Analysis Summary

This section summarizes the findings from the Spearman rank correlation analysis, focusing on the relationships between the `anxiety_level` (our ordinal target variable) and other predictor variables in the dataset (after data augmentation with SMOTE).

**Key Findings:**

*   **Generally Weak Correlations:**  Overall, most predictor variables exhibit weak correlations with `anxiety_level`, with no Spearman correlation coefficients exceeding |0.3|. This suggests that anxiety, as measured in this study, is likely influenced by a combination of factors, rather than any single strong predictor.
*   **Negative Correlations:**
    *   `seniority_range` shows the strongest negative correlation (-0.27), followed by `category_nurse aux` (-0.21) and `age_range` (-0.21), indicating a mild tendency for anxiety levels to be lower among more senior, older, and auxiliary nurses.
    *   `sex` exhibits a weak negative correlation (-0.17); with the encoding female = 0 and male = 1, this points to slightly lower anxiety levels among male nurses.
    *   `category_other` (representing specialist and head nurses) shows only a very weak negative correlation (-0.07) with `anxiety_level`, so the coefficients give little indication that nurses in these roles report different anxiety levels from general and auxiliary nurses.
*   **Positive Correlations:**
    *   `education_level` is the only variable with a positive coefficient (0.09), a very weak tendency for nurses with higher education levels to report higher anxiety.
*   **Multicollinearity:**  A strong positive correlation (0.69) exists between `age_range` and `seniority_range`.  This indicates multicollinearity, which should be addressed before using these variables together in regression models.  Consider using only one of these variables or creating a composite variable.
* **Variables with very weak correlation**
    *  `shift_afternoon`, `shift_morning`, `marital_married`, `shift_night a`, `shift_night b`, `marital_domestic partnership`, `marital_single`, `category_gen nurse`.

**Next Steps:**

These correlation findings provide a valuable starting point for further investigation.  The next steps involve:

1.  **Hypothesis Testing:** Conducting non-parametric tests (Kruskal-Wallis for numerical/ordinal predictors, Chi-square for categorical predictors) to formally test the statistical significance of the observed associations.
2.  **Visualization:** Creating visualizations (box plots, stacked bar plots) to explore the relationships between `anxiety_level` and other variables in more detail.
3.  **Predictive Modeling:** Building classification models (e.g., ordinal logistic regression, decision trees) to predict `anxiety_level`, while carefully addressing the multicollinearity issue and considering potential interactions between variables.

**Important Note:** Correlation does not imply causation.  These findings highlight associations, but further research is needed to understand the causal mechanisms underlying anxiety levels in nurses.

In [22]:
# Repeats the call already made in step 4 above, so that the test results
# appear in their own cell.
inferential_analysis(df_augmented, "anxiety_level")

--------------------------------------------------
KRUSKAL-WALLIS TESTS
--------------------------------------------------

Kruskal-Wallis: sex vs. anxiety_level
  H Statistic: 31.240
  p-value: 0.000
  Result: Significant differences between levels of anxiety_level on variable sex.

  Dunn's Test (Post-Hoc):
          0         1         2         3
0  1.000000  0.026143  0.290413  0.290413
1  0.026143  1.000000  0.000008  0.000008
2  0.290413  0.000008  1.000000  1.000000
3  0.290413  0.000008  1.000000  1.000000

Kruskal-Wallis: education_level vs. anxiety_level
  H Statistic: 3.744
  p-value: 0.290
  Result: No significant differences between levels of anxiety_level on variable education_level.

Kruskal-Wallis: age_range vs. anxiety_level
  H Statistic: 25.195
  p-value: 0.000
  Result: Significant differences between levels of anxiety_level on variable age_range.

  Dunn's Test (Post-Hoc):
         0         1         2         3
0  1.00000  1.000000  1.000000  0.000140
1  1.00000

## Statistical Analysis: Key Conclusions

This section summarizes the key findings from the statistical analysis, building upon the exploratory data analysis and correlation analysis.  We used non-parametric hypothesis tests (Kruskal-Wallis and Chi-square) to assess the statistical significance of relationships between the ordinal `anxiety_level` variable and other predictors.

**Main Conclusions:**

*   **Significant Associations with Anxiety:** Several demographic and work-related factors show statistically significant associations with nurses' anxiety levels during the COVID-19 pandemic (p < 0.05 for the Kruskal-Wallis and Chi-square tests; a Bonferroni correction for multiple comparisons was applied only to the Dunn post-hoc tests).  These factors include:
    *   **Sex:**  There is a significant association between sex and anxiety levels, as confirmed by both the Kruskal-Wallis and Chi-square tests. Cramer's V suggests a moderate association strength.
    *   **Age Range (`age_range`):**  Kruskal-Wallis revealed significant differences in anxiety levels across age groups. Post-hoc Dunn tests indicated that the "severe" anxiety group had a significantly different age distribution compared to the other anxiety levels.
    *   **Seniority Range (`seniority_range`):**  Similar to age, seniority shows significant differences in anxiety levels across groups, confirmed by Kruskal-Wallis and Dunn's post-hoc tests (with the "severe" group differing significantly).
    *   **Shift (`shift_afternoon`, `shift_morning`, `shift_night a`, `shift_night b`):**  Chi-square tests showed significant associations between anxiety levels and *all* shift types.  This indicates that the distribution of anxiety levels differs across the various shifts.
    *   **Marital Status (`marital_domestic partnership`, `marital_married`, `marital_single`):** Chi-square tests revealed significant associations between anxiety levels and all marital status categories.
    *   **Nurse Category (`category_gen nurse`, `category_nurse aux`, `category_other`):**  Chi-square tests showed significant associations between anxiety and all nurse category types.  This reinforces the importance of the nurse's role in their anxiety experience.

*   **Non-Significant Association with Education Level:**  `education_level` did *not* show a statistically significant association with anxiety levels in either the Kruskal-Wallis or Chi-square tests.  This suggests that, in this sample, education level alone is not a strong predictor of anxiety.

* **Strength of Associations:**  While several associations were statistically significant, it's important to remember that the correlation analysis (Spearman) generally showed weak correlations.  The Chi-square tests, while showing significance, also had Cramer's V values that were mostly in the low to moderate range. This reinforces the idea that anxiety is likely influenced by multiple factors, each with a relatively small to moderate individual effect.  Note also that all tests were run on the SMOTE-augmented data (400 rows, up from 140 original observations); the synthetic rows inflate the sample size, so the p-values should be interpreted with caution.
