In [1]:
# !pip install fastai

In [2]:
# pip install statsmodels

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.gridspec import GridSpec
from IPython.display import display
import seaborn as sns
import statsmodels.api as sm
from statsmodels.formula.api import ols
from scipy.stats import linregress

In [4]:
#from google.colab import drive
#drive.mount('/content/drive')

In [5]:
# Colab alternative, kept for reference:
#input=pd.read_csv('/content/drive/MyDrive/DataSets/Medical Student Mental Health.csv')
# NOTE(review): the name `input` shadows Python's built-in input(); it is kept
# because the later cells refer to the frame by this name.
csv_path = r'C:\Users\User\Downloads\Medical Student Mental Health.csv'
input = pd.read_csv(csv_path)


In [None]:
input.head()

In [None]:
input.info()

In [8]:
# Labels of the ten trailing columns as they are currently named in the frame
last_10_columns = input.columns[-10:]

# Replacement names, listed in the same order as the ten trailing columns
new_names = [
    "total_empathy", "cognitive_empathy", "affective_empathy",
    "modify_presentation", "emotion_recognition",
    "depression", "anxiety", "exhaustion", "cynicism", "efficacy",
]

# Pair every existing label with its replacement
rename_dict = dict(zip(last_10_columns, new_names))

# Apply the mapping to the frame itself (this mutates `input`)
input.rename(columns=rename_dict, inplace=True)

In [None]:
input.info()

In [None]:
input.describe()

In [None]:
col = ['age', 'sex', 'year', 'part', 'job', 'psyt', 'health', 'glang']

# Denominator for the percentages: the total number of rows in the frame
n_rows = len(input)

for var in col:
    # Occurrences of each distinct value, then its share of all rows in percent
    value_counts = input[var].value_counts()
    percentage_counts = value_counts.div(n_rows).mul(100)
    print("Percentage of each unique value in " +var+ " column:")
    print(percentage_counts)

In [12]:
#plt.rcParams['text.usetex'] = False

In [13]:
#!apt-get install texlive-full

In [None]:
def create_pie_chart(ax, column_name, data):
    """Draw a pie chart of the value frequencies of one column.

    Parameters
    ----------
    ax : matplotlib axes object the pie is drawn on.
    column_name : label of the column in ``data`` whose values are counted.
    data : frame-like object supporting ``data[column_name].value_counts()``.

    The wedges are labelled with the distinct values, annotated with their
    percentage, and the axes are given an equal aspect ratio.
    """
    frequencies = data[column_name].value_counts()
    ax.pie(
        frequencies,
        labels=frequencies.index,
        autopct='%1.1f%%',
        startangle=140,
        colors=plt.cm.Paired.colors,
    )
    ax.set_title(f'Distribution of {column_name}')
    ax.axis('equal')

# Categorical columns to visualise, one pie chart each
columns=['sex','part','job','health','psyt']
# Number of columns for the subplot grid
num_columns = 3
num_plots = len(columns)

# Rows needed to fit every plot (ceiling division)
num_rows = (num_plots + num_columns - 1) // num_columns

# squeeze=False always returns a 2-D array of axes, so the flatten below is
# valid for every grid shape (including a single row or a single cell)
fig, axes = plt.subplots(num_rows, num_columns, figsize=(15, num_rows * 5), squeeze=False)
axes = axes.flatten()

# Create pie charts for each categorical variable
for i, column in enumerate(columns):
    create_pie_chart(axes[i], column, input)

# Remove any unused subplots at the end of the grid
for i in range(num_plots, len(axes)):
    fig.delaxes(axes[i])

plt.tight_layout()
plt.show()


# Table 1. Descriptive statistics of the final sample (N = 886)

| Sociodemographics       | M (SD)          | Percent |
|-------------------------|-----------------|---------|
| **Gender**              |                 |         |
| Female                  |                 | 68.40   |
| Male                    |                 | 31.04   |
| Non-binary              |                 | 0.56    |
| **Age**                 | 22.38 (3.30)    |         |
| **Curriculum year**     |                 |         |
| B1                      |                 | 27.65   |
| B2                      |                 | 15.24   |
| B3                      |                 | 16.14   |
| M1                      |                 | 13.88   |
| M2                      |                 | 14.33   |
| M3                      |                 | 12.75   |
| **Mother tongue**       |                 |         |
| French                  |                 | 80.93   |
| Italian                 |                 | 5.08    |
| German                  |                 | 3.50    |
| Portuguese              |                 | 3.05    |
| English                 |                 | 2.48    |
| Spanish                 |                 | 2.03    |
| Other<sup>a</sup>       |                 | 2.92    |
| **Having a partner**    |                 | 56.32   |
| **Having a paid job**   |                 | 34.88   |
| **Hours of study/week**<sup>c</sup> | 25.29 (15.93)   |         |
| **Satisfaction with health**<sup>b</sup> | 3.78 (1.06) |         |
| **Consulted a psy last year** |          | 22.46   |

In [None]:
# Report, for every column, how many distinct values it holds and list them in sorted order
for column in input.columns:
    unique_values = sorted(input[column].unique())
    n_unique = len(unique_values)
    print(f"Column '{column}' has {n_unique} unique values: {unique_values}")

In [None]:
input.hist(bins=50, figsize=(20,15))
plt.show()

In [17]:
# No hold-out split is made for the statistical analysis: the whole sample is used.
# Both names are bound to the same DataFrame object as `input` (no copy is taken here),
# and train_test_split is imported but not called in this cell.
from sklearn.model_selection import train_test_split
train_set, test_set = input,input

In [18]:
train_set_copy= train_set.copy()

In [None]:
train_set_copy.plot(kind="scatter",x="year",y="depression", alpha=0.1)
train_set_copy.plot(kind="scatter",x="sex",y="depression")

# **Correlation Study**

In [None]:
corr_matrix = train_set_copy.corr()
display(corr_matrix)

In [None]:
last_10_columns = train_set_copy.iloc[:, -10:]
corr_matrix_last_10 = last_10_columns.corr()
display(corr_matrix_last_10)

In [None]:
plt.figure(figsize=(8, 5))  # Set the size of the plot
sns.heatmap(corr_matrix_last_10, annot=True, cmap='coolwarm', center=0)
plt.title('Correlation Matrix Heatmap')
plt.show()

# **Model Training**

In [None]:
!pip install mord

In [None]:
# Fit a LogisticIT model (from the mord package) that predicts `health` from
# demographic, empathy and distress columns, then report accuracy and the
# confusion matrix on a 20% hold-out split.
# NOTE: RandomForestRegressor and mean_squared_error are imported but not used in this cell.
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score, confusion_matrix
from mord import LogisticIT
import numpy as np

# Use the frame loaded earlier (`data` is a second name for the same object, not a copy)
data = input

# Select features and target
features = ['age', 'year', 'sex', 'total_empathy', 'cognitive_empathy', 'affective_empathy',
            'emotion_recognition', 'depression', 'anxiety', 'exhaustion', 'cynicism', 'efficacy',
            'stud_h', 'job', 'part']
target = 'health'

X = data[features]
y = data[target]

# Preprocess the data
# One-hot encode the categorical variables (each listed column becomes indicator columns)
X = pd.get_dummies(X, columns=['sex', 'job', 'part'])

# Split the data: 80% train / 20% test, fixed seed for a repeatable split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale the features; the scaler is fitted on the training part only and
# then applied unchanged to the test part
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train the model
model = LogisticIT()
model.fit(X_train_scaled, y_train)

# Make predictions
y_pred = model.predict(X_test_scaled)

# Evaluate the model: share of exactly matching predicted labels
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

# Confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(conf_matrix)



In [None]:
# Fit a scikit-learn LogisticRegression that predicts `health` from every other
# column of the frame and print its confusion matrix and classification report.
# The X_train / X_test / y_train / y_test names defined here overwrite those of
# the previous modelling cell.
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

# `data` is a second name for the loaded frame, not a copy
data=input

# All columns except `health` are used as features
# NOTE(review): this keeps every remaining column, including any identifier column
# the file may contain — confirm that is intended.
X = data.drop('health', axis=1)  # Features
y = data['health']  # Target

# Split data into training and testing sets (70% / 30%, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Standardize the features; the scaler is fitted on the training part only
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Initialize and train logistic regression model (library default settings)
log_reg = LogisticRegression()
log_reg.fit(X_train_scaled, y_train)

# Predict on the test set
y_pred_log_reg = log_reg.predict(X_test_scaled)

# Evaluate the model
print(confusion_matrix(y_test, y_pred_log_reg))
print(classification_report(y_test, y_pred_log_reg))


In [None]:
# Fit a random forest classifier on the unscaled split.
# X_train, X_test, y_train and y_test are not defined in this cell; they are
# whatever the most recently executed modelling cell left in the kernel.
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Initialize and train random forest model (100 trees, fixed seed)
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# Predict on the test set
y_pred_rf = rf.predict(X_test)

# Evaluate the model
print(confusion_matrix(y_test, y_pred_rf))
print(classification_report(y_test, y_pred_rf))


# **Relation of mental distress and academic efficacy**

In [None]:
target_columns = ['depression', 'anxiety', 'exhaustion', 'cynicism']

# One row of panels, one panel per distress measure
fig, axs = plt.subplots(1, len(target_columns), figsize=(20, 5))

# Fill the panels left to right: efficacy on the y-axis against each measure
for panel_index in range(len(target_columns)):
    ax, column = axs[panel_index], target_columns[panel_index]
    train_set_copy.plot.scatter(x=column, y="efficacy", ax=ax)
    ax.set_title(f'Efficacy vs {column.capitalize()} ')

# Adjust layout
plt.tight_layout()
plt.show()

In [None]:
import os

import seaborn as sns
import matplotlib.pyplot as plt

# Set the font for the plots
plt.style.use('default')  # Reset style to default
plt.rc('font', family='serif', size=15)

target_columns = ['depression', 'anxiety', 'exhaustion', 'cynicism']

# Map numeric values to gender labels
gender_labels = {1: 'Male', 2: 'Female', 3: 'Non-binary'}

# The relabelled frame does not depend on the loop variable, so build it once
labelled_data = train_set_copy.replace({'sex': gender_labels})

# savefig does not create missing directories; make sure the target exists
os.makedirs('content', exist_ok=True)

# One figure (and one PDF file) per distress measure
for column in target_columns:
    plt.figure(figsize=(8, 6))  # Adjust the figsize as needed

    # Efficacy against the current measure, coloured by gender label
    sns.scatterplot(
        x=column,
        y="efficacy",
        hue="sex",
        palette={'Male': 'blue', 'Female': 'red', 'Non-binary': 'green'},
        data=labelled_data
    )

    # Axis labels with a capitalised first letter
    plt.xlabel(column.capitalize(), fontfamily='serif', fontsize=25)
    plt.ylabel('Efficacy', fontfamily='serif', fontsize=25)

    # Light dotted grid
    plt.grid(True, linestyle=':', linewidth=1, color='grey')

    # Legend for the gender colours
    plt.legend(fontsize=20, loc='upper right')

    # Save the plot as a separate image file
    plt.savefig(f'content/Efficacy_vs_{column.capitalize()}.pdf', dpi=600, bbox_inches='tight')

    # Show the plot
    plt.show()


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Set the font for the plots
plt.style.use('default')
plt.rc('font', family='serif', size=14)

target_columns = ['depression', 'anxiety', 'exhaustion', 'cynicism']

# Map numeric values to gender labels
gender_labels = {1: 'Male', 2: 'Female', 3: 'Non-binary'}

# The relabelled frame is loop-invariant, so it is built once instead of
# three times per figure
labelled_data = train_set_copy.replace({'sex': gender_labels})

# (numeric sex code, scatterplot styling) for each group, in drawing order:
# hollow blue circles, filled red circles, filled green triangles
marker_specs = [
    (1, dict(marker='o', facecolors='none', edgecolor='blue', label='Male')),
    (2, dict(marker='o', color='red', label='Female')),
    (3, dict(marker='v', color='green', label='Non-binary')),
]

# One figure per distress measure
for column in target_columns:
    plt.figure(figsize=(8, 6))  # Adjust the figsize as needed

    # Rows are selected with the numeric code of the original frame; the
    # boolean mask aligns with the relabelled frame through the shared index
    for sex_code, style_kwargs in marker_specs:
        sns.scatterplot(
            x=column,
            y="efficacy",
            data=labelled_data[train_set_copy['sex'] == sex_code],
            **style_kwargs
        )

    # Set the title for the plot
    plt.title(f'Efficacy vs {column.capitalize()}')

    # Axis labels with a capitalised first letter
    plt.xlabel(column.capitalize(), fontfamily='serif', fontsize=14)
    plt.ylabel('Efficacy', fontfamily='serif', fontsize=14)

    # Light dotted grid
    plt.grid(True, linestyle=':', linewidth=1, color='grey')

    # Add the legend
    plt.legend(title='Gender')

    # Saving is disabled in this variant:
    # plt.savefig(f'/content/Efficacy_vs_{column.capitalize()}.pdf', dpi=600, bbox_inches='tight')

    # Show the plot
    plt.show()


In [None]:
sns.set(style="whitegrid")

# Create a 2x2 grid of plots
fig = plt.figure(figsize=(12, 10))
gs = GridSpec(2, 2, figure=fig)

# Colour per numeric sex code, shared by the points and the regression lines
sex_palette = {1: "steelblue", 2: "violet", 3: "orange"}

# Predictor shown in each grid cell, in drawing order
panels = [
    ('depression', gs[0, 0]),
    ('anxiety', gs[0, 1]),
    ('cynicism', gs[1, 0]),
    ('exhaustion', gs[1, 1]),
]

panel_axes = []
for predictor, grid_cell in panels:
    ax = fig.add_subplot(grid_cell)

    # All observations, coloured by sex code
    sns.scatterplot(data=train_set_copy, x=predictor, y='efficacy', hue='sex',
                    palette=sex_palette, ax=ax)

    # Separate regression line (no points) for sex codes 1 and 2 only
    for sex_code in (1, 2):
        sns.regplot(data=train_set_copy[train_set_copy['sex'] == sex_code],
                    x=predictor, y='efficacy', scatter=False, ax=ax,
                    color=sex_palette[sex_code])

    ax.set_title(f'Efficacy vs {predictor.capitalize()}')
    # Rebuild the legend on every panel so all four look the same
    ax.legend()
    panel_axes.append(ax)

# Keep the individual axes available under their original names
ax1, ax2, ax3, ax4 = panel_axes

# Adjust layout
plt.tight_layout()
plt.show()


In [None]:
def linear_regression_slope(data, x_var, y_var):
    """Fit an ordinary least squares line y_var ~ x_var and return its slope.

    Parameters
    ----------
    data : frame holding both variables as columns.
    x_var : name of the predictor column.
    y_var : name of the response column.

    Returns
    -------
    (slope, p_value) : the fitted coefficient of ``x_var`` and its p-value.

    Side effect: prints the full statsmodels regression summary.
    """
    X = sm.add_constant(data[x_var])  # Add a constant term for the intercept
    y = data[y_var]
    model = sm.OLS(y, X).fit()
    print(model.summary())
    # Look the coefficient up by column label: integer keys on a label-indexed
    # Series are deprecated as positional access in recent pandas versions
    slope = model.params[x_var]
    p_value = model.pvalues[x_var]
    return slope, p_value

# Row subsets for the two sex codes that are compared
male_rows = train_set_copy[train_set_copy['sex'] == 1]
female_rows = train_set_copy[train_set_copy['sex'] == 2]

# Efficacy regressed on depression, then on anxiety, for each subset
slope_male_depression, p_value_male_depression = linear_regression_slope(male_rows, 'depression', 'efficacy')
slope_female_depression, p_value_female_depression = linear_regression_slope(female_rows, 'depression', 'efficacy')

slope_male_anxiety, p_value_male_anxiety = linear_regression_slope(male_rows, 'anxiety', 'efficacy')
slope_female_anxiety, p_value_female_anxiety = linear_regression_slope(female_rows, 'anxiety', 'efficacy')

# Print the results, one line per fitted model
slope_report = [
    ("Male Depression", slope_male_depression, p_value_male_depression),
    ("Female Depression", slope_female_depression, p_value_female_depression),
    ("Male Anxiety", slope_male_anxiety, p_value_male_anxiety),
    ("Female Anxiety", slope_female_anxiety, p_value_female_anxiety),
]
for group_name, slope_value, slope_p in slope_report:
    print(f"{group_name} Slope: {slope_value}, p-value: {slope_p}")

# **Relation of academic efficacy and empathy**

In [None]:
# Pairwise correlations among the columns at positions 10-14 and 19 of the frame.
# NOTE(review): presumably these are the five empathy scores plus efficacy, given the
# rename of the last ten columns — confirm against the column order of the frame.
empathy_columns = train_set_copy.iloc[:, np.r_[10:15, 19]]
corr_matrix_empathy_columns = empathy_columns.corr()
display(corr_matrix_empathy_columns)

In [None]:
plt.figure(figsize=(8, 4))  # Set the size of the plot
sns.heatmap(corr_matrix_empathy_columns, annot=True, cmap='coolwarm', center=0)
plt.title('Correlation Matrix Heatmap')
plt.show()

In [None]:
# Positional column window shared by both per-sex heatmaps.
# NOTE(review): this covers positions 10-18 and leaves out position 19, whereas the
# pooled correlation cell earlier in this section selects np.r_[10:15, 19] — confirm
# which selection is intended.
empathy_positions = np.r_[10:19]

# --- Males (sex == 1) ---
male_data = train_set_copy.loc[train_set_copy['sex'] == 1]
empathy_columns_male = male_data.iloc[:, empathy_positions]
corr_matrix_empathy_male = empathy_columns_male.corr()

fig_male, ax_male = plt.subplots(figsize=(8, 6))
sns.heatmap(corr_matrix_empathy_male, annot=True, cmap='coolwarm', vmin=-1, vmax=1, ax=ax_male)
ax_male.set_title('Correlation Heatmap for Empathy Columns (Males)')
plt.show()

# --- Females (sex == 2) ---
female_data = train_set_copy.loc[train_set_copy['sex'] == 2]
empathy_columns_female = female_data.iloc[:, empathy_positions]
corr_matrix_empathy_female = empathy_columns_female.corr()

fig_female, ax_female = plt.subplots(figsize=(8, 6))
sns.heatmap(corr_matrix_empathy_female, annot=True, cmap='coolwarm', vmin=-1, vmax=1, ax=ax_female)
ax_female.set_title('Correlation Heatmap for Empathy Columns (Females)')
plt.show()

# ***Correlation Network Analysis***

In [None]:
from statsmodels.stats.multitest import multipletests
from scipy.stats import pearsonr
import networkx as nx


# Variables of the network: columns at positions 10-12 and 15-19 of the working frame
data = train_set_copy.iloc[:, np.r_[10:13, 15:20]]
# Calculate the correlation matrix
correlation_matrix = data.corr()

# Significance level used for the Benjamini-Hochberg correction and the edge filter
alpha = 0.05

# Test every unordered pair exactly once. Correcting over the full square matrix
# would also count the self-correlations (p = 0) and each pair twice, which
# shifts the ranks and weakens the correction.
n_vars = data.shape[1]
pair_rows, pair_cols = np.triu_indices(n_vars, k=1)
pair_p_values = np.array([pearsonr(data.iloc[:, i], data.iloc[:, j])[1]
                          for i, j in zip(pair_rows, pair_cols)])
_, pair_p_corrected, _, _ = multipletests(pair_p_values, alpha=alpha, method='fdr_bh')

# Symmetric matrices of raw and corrected p-values (the diagonal stays 0)
p_values = np.zeros((n_vars, n_vars))
p_values[pair_rows, pair_cols] = pair_p_values
p_values[pair_cols, pair_rows] = pair_p_values
corrected_p_values = np.zeros((n_vars, n_vars))
corrected_p_values[pair_rows, pair_cols] = pair_p_corrected
corrected_p_values[pair_cols, pair_rows] = pair_p_corrected

# Create a graph with one node per variable
G = nx.Graph()
G.add_nodes_from(data.columns)

# Add an edge for each pair that is significant after correction and has |r| >= 0.25
for i, j in zip(pair_rows, pair_cols):
    weight = correlation_matrix.iloc[i, j]
    if corrected_p_values[i, j] < alpha and abs(weight) >= 0.25:
        G.add_edge(data.columns[i], data.columns[j], weight=weight)

# Plot the graph
plt.figure(figsize=(6, 6))
pos = nx.spring_layout(G, seed=42)  # fixed seed so the layout is the same on every run
edges = G.edges(data=True)

nx.draw(G, pos, with_labels=True, node_size=500, node_color='lightblue', font_size=10)
# Line width encodes the strength of the correlation; abs() keeps it non-negative
# for negative correlations
nx.draw_networkx_edges(G, pos, edgelist=edges, width=[abs(d['weight'])*5 for (u, v, d) in edges])

plt.title('Correlation Network')
plt.show()

In [90]:
# Calculate the correlation matrix and the p-value of every unordered variable pair
correlations = data.corr()

n_vars = data.shape[1]
pair_rows, pair_cols = np.triu_indices(n_vars, k=1)
pair_p_values = np.array([pearsonr(data.iloc[:, i], data.iloc[:, j])[1]
                          for i, j in zip(pair_rows, pair_cols)])

# Apply Benjamini-Hochberg correction over the unique pairs only. Feeding the whole
# square matrix to the correction would include the zero-filled diagonal and every
# pair twice, which shifts the ranks and weakens the correction.
_, pair_p_corrected, _, _ = multipletests(pair_p_values, method='fdr_bh')

# Symmetric matrices of raw and corrected p-values (the diagonal stays 0)
p_values = np.zeros((n_vars, n_vars))
p_values[pair_rows, pair_cols] = pair_p_values
p_values[pair_cols, pair_rows] = pair_p_values
corrected_p_values = np.zeros((n_vars, n_vars))
corrected_p_values[pair_rows, pair_cols] = pair_p_corrected
corrected_p_values[pair_cols, pair_rows] = pair_p_corrected

# Flattened views kept under their original names
p_values_flat = p_values.flatten()
corrected_p_values_flat = corrected_p_values.flatten()

# Apply thresholds (|r| >= 0.25 and corrected p < 0.05); all other cells become 0
significant_edges = np.where((np.abs(correlations) >= 0.25) & (corrected_p_values < 0.05), correlations, 0)

In [None]:
# Undirected graph with one node per column of `data`
G = nx.Graph()
G.add_nodes_from(data.columns)

# Walk the upper triangle once so that every pair is visited a single time;
# a zero entry in `significant_edges` means "no edge"
upper_rows, upper_cols = np.triu_indices(len(data.columns), k=1)
for i, j in zip(upper_rows, upper_cols):
    edge_value = significant_edges[i, j]
    if edge_value == 0:
        continue
    # Positive correlations are drawn green, negative ones red
    color = 'green' if edge_value > 0 else 'red'
    G.add_edge(data.columns[i], data.columns[j], weight=edge_value, color=color)

# Per-edge attributes, in the iteration order of the graph
edges = G.edges(data=True)
weights = [attrs['weight'] for _, _, attrs in edges]
colors = [attrs['color'] for _, _, attrs in edges]

# Edge widths are based on the magnitude of the correlation
norm_weights = list(map(abs, weights))

# Circular layout for easy visualization
pos = nx.circular_layout(G)

# Plot the graph: nodes, then edges, then labels
plt.figure(figsize=(8, 6))
nx.draw_networkx_nodes(G, pos, node_color='lightblue', node_size=1600)
nx.draw_networkx_edges(G, pos, edgelist=edges, edge_color=colors, width=[w*7 for w in norm_weights])
nx.draw_networkx_labels(G, pos, font_size=10, font_color="black")

plt.style.use('default')  # Reset style to default
plt.rc('font', family='serif', size=14)

plt.title('Correlation Network with Weighted and Color-Coded Edges')

# Show the plot
plt.show()

In [None]:
import os

# Display label for each node (column name -> label, with line breaks for long names)
node_names = {
    'total_empathy': 'Total\nEmpathy',
    'cognitive_empathy': 'Cognitive\nEmpathy',
    'affective_empathy': 'Affective\nEmpathy',
    'exhaustion': 'Exhaustion',
    'depression': 'Depression',
    'anxiety': 'Anxiety',
    'efficacy': 'Efficacy',
    'cynicism': 'Cynicism'

}

# Rename nodes in the graph
G = nx.relabel_nodes(G, node_names)

# Extract edges and colors again after renaming nodes
edges = G.edges(data=True)
weights = [d['weight'] for (u, v, d) in edges]
colors = [d['color'] for (u, v, d) in edges]

# Edge widths are based on the magnitude of the correlation
norm_weights = [abs(w) for w in weights]

# Circular layout for easy visualization
pos = nx.circular_layout(G,scale=1)
# pos = nx.spring_layout(G, k=30, seed=16) 

# Plot the graph
plt.figure(figsize=(7, 5.7))

# Draw nodes as white discs with a black outline
nx.draw_networkx_nodes(G, pos, node_color='white', node_size=3100,edgecolors='black')

# Draw edges with color coding based on weights
nx.draw_networkx_edges(G, pos, edgelist=edges, edge_color=colors, width=[w*7 for w in norm_weights])

# Draw labels
nx.draw_networkx_labels(G, pos, font_size=10, font_color="black", font_weight='regular')

plt.axis('off')

# savefig does not create missing directories; make sure the target exists
os.makedirs('content', exist_ok=True)

# Save the figure as PNG and PDF
plt.savefig('content/Correlation Network with Weighted and Color-Coded Edges-1.png', dpi=500, bbox_inches='tight')
plt.savefig('content/Correlation Network with Weighted and Color-Coded Edges-1.pdf', dpi=500, bbox_inches='tight')
# Show the plot
plt.show()


In [None]:
!pip install networkx

In [None]:
# List the cliques of the correlation graph G and pick out the largest one.
# NOTE: the `clique` approximation module is imported but not used in this cell.
import networkx as nx
from networkx.algorithms.clique import find_cliques
from networkx.algorithms.approximation import clique

# Identify all maximal cliques (each one is a list of node names)
cliques = list(find_cliques(G))

# Find the largest maximal clique (i.e., the maximum clique); on ties max() keeps the first
maximum_clique = max(cliques, key=len)

print("Maximal Cliques: ", cliques)
print("Maximum Clique: ", maximum_clique)

In [None]:
import os
from itertools import combinations

### Identifying Cliques

# Identify all maximal cliques
cliques = list(find_cliques(G))

# Find the largest maximal clique (i.e., the maximum clique)
maximum_clique = max(cliques, key=len)

print("Maximal Cliques: ", cliques)
print("Maximum Clique: ", maximum_clique)

### Visualize Cliques

plt.figure(figsize=(7, 5.7))

# Draw the base network again (positions, edges and colours come from the
# previously drawn network)
nx.draw_networkx_nodes(G, pos, node_color='white', node_size=3100,edgecolors='black')
nx.draw_networkx_edges(G, pos, edgelist=edges, edge_color=colors, width=[w*5 for w in norm_weights])
nx.draw_networkx_labels(G, pos, font_size=10, font_color="black")

# Overlay every maximal clique with semi-transparent violet edges.
# The loop variable is not called `clique`, so it does not overwrite the
# imported module of that name.
for clique_members in cliques:
    clique_edges = list(combinations(clique_members, 2))
    nx.draw_networkx_edges(G, pos, edgelist=clique_edges, edge_color='violet', width=3, alpha=0.6)

# Overlay the maximum clique with thicker, opaque blue edges
max_clique_edges = list(combinations(maximum_clique, 2))
nx.draw_networkx_edges(G, pos, edgelist=max_clique_edges, edge_color='blue', width=5, alpha=1.0)

#plt.title('Correlation Network with Maximal and Maximum Cliques')
plt.axis('off')

# savefig does not create missing directories; make sure the target exists
os.makedirs('content', exist_ok=True)

# Save the figure as PDF and PNG
plt.savefig('content/Correlation Network with Maximal and Maximum Cliques-1.pdf', dpi=500, bbox_inches='tight')
plt.savefig('content/Correlation Network with Maximal and Maximum Cliques-1.png', dpi=500, bbox_inches='tight')
# Show the plot
plt.show()

In [None]:
# Calculate correlation matrix
correlations = data.corr()

# p-value of every unordered variable pair (each pair is tested once)
n_vars = data.shape[1]
pair_rows, pair_cols = np.triu_indices(n_vars, k=1)
pair_p_values = np.array([pearsonr(data.iloc[:, i], data.iloc[:, j])[1]
                          for i, j in zip(pair_rows, pair_cols)])

# Apply Benjamini-Hochberg correction over the unique pairs only. Feeding the whole
# square matrix to the correction would include the zero-filled diagonal and every
# pair twice, which shifts the ranks and weakens the correction.
_, pair_p_corrected, _, _ = multipletests(pair_p_values, method='fdr_bh')

# Symmetric matrices of raw and corrected p-values (the diagonal stays 0)
p_values = np.zeros((n_vars, n_vars))
p_values[pair_rows, pair_cols] = pair_p_values
p_values[pair_cols, pair_rows] = pair_p_values
corrected_p_values = np.zeros((n_vars, n_vars))
corrected_p_values[pair_rows, pair_cols] = pair_p_corrected
corrected_p_values[pair_cols, pair_rows] = pair_p_corrected

# Flattened views kept under their original names
p_values_flat = p_values.flatten()
corrected_p_values_flat = corrected_p_values.flatten()

# Table of strings: the lower triangle holds "r" plus significance stars, while the
# diagonal and the upper triangle are left blank
formatted_matrix = pd.DataFrame(index=correlations.index, columns=correlations.columns)

for i in range(n_vars):
    for j in range(n_vars):
        if j < i:
            p_corrected = corrected_p_values[i, j]
            # Stars by corrected p-value: *** < 0.001, ** < 0.01, * < 0.05
            if p_corrected < 0.001:
                star = '***'
            elif p_corrected < 0.01:
                star = '**'
            elif p_corrected < 0.05:
                star = '*'
            else:
                star = ''
            formatted_matrix.iloc[i, j] = f"{correlations.iloc[i, j]:.2f}{star}"
        else:
            formatted_matrix.iloc[i, j] = ''

# Convert the formatted matrix to a LaTeX table format
latex_table = formatted_matrix.to_latex()

# Display the formatted correlation matrix
print("Formatted Correlation Matrix:\n")
print(formatted_matrix)

# Display the LaTeX table
print("\nLaTeX Table:\n")
print(latex_table)

In [None]:
# Estimate a sparse precision matrix for the network variables with
# GraphicalLassoCV, convert it to partial correlations and draw the pairs whose
# partial correlation is at least 0.25 in magnitude.
from sklearn.covariance import GraphicalLassoCV

# Fit a Graphical Lasso model to estimate partial correlations
# (note: `model` replaces any earlier object bound to that name)
model = GraphicalLassoCV()
model.fit(data)

# Partial correlation of i and j: -P[i, j] / sqrt(P[i, i] * P[j, j]), with P the
# estimated precision matrix; diagonal entries come out as -1 and are skipped below
partial_corr_matrix = -model.precision_ / np.sqrt(np.outer(np.diag(model.precision_), np.diag(model.precision_)))

# Create a graph for partial correlations
G_partial = nx.Graph()

# Add nodes (variables)
for col in data.columns:
    G_partial.add_node(col)

# Add an edge where the magnitude of the partial correlation reaches 0.25
# (a magnitude threshold; no significance test is applied here)
for i, col1 in enumerate(data.columns):
    for j, col2 in enumerate(data.columns):
        if i != j and abs(partial_corr_matrix[i, j]) >= 0.25:
            G_partial.add_edge(col1, col2, weight=partial_corr_matrix[i, j])

# Plot the partial correlation network
# (note: `pos` and `edges` replace the values left by earlier network cells)
plt.figure(figsize=(8, 8))
pos = nx.circular_layout(G_partial)
edges = G_partial.edges(data=True)

nx.draw(G_partial, pos, with_labels=True, node_size=500, node_color='lightgreen', font_size=10)
# Redraw the edges with a width based on the magnitude of the partial correlation
nx.draw_networkx_edges(G_partial, pos, edgelist=edges, width=[abs(d['weight'])*5 for (u, v, d) in edges])

plt.title('Partial Correlation Network')
plt.show()


# **Relation between depression, anxiety, exhaustion, and academic efficacy.**

In [None]:
# Pairwise correlations among the columns at positions 15-19 of the frame.
# NOTE(review): despite the `empathy_` prefix in the variable names, these positions
# presumably hold depression, anxiety, exhaustion, cynicism and efficacy (the last
# five renamed columns) — confirm against the column order of the frame.
empathy_columns = train_set_copy.iloc[:, np.r_[15:20]]
corr_matrix_empathy_columns = empathy_columns.corr()
display(corr_matrix_empathy_columns)

In [None]:
import os

# Capitalize the column and index names for display
corr_matrix_empathy_columns = corr_matrix_empathy_columns.rename(
    columns=lambda x: x.capitalize(),
    index=lambda x: x.capitalize()
)

plt.style.use('default')  # Reset style to default
# Set font and plot the heatmap
plt.rc('font', family='serif', size=12)
plt.figure(figsize=(8, 2))  # Set the size of the plot
sns.heatmap(corr_matrix_empathy_columns, annot=True, cmap='coolwarm', center=0)

# savefig does not create missing directories; make sure the target exists
os.makedirs('content', exist_ok=True)

# Save as PDF and PNG, then show the plot
#plt.title('Correlation Matrix Heatmap')
plt.savefig('content/cmh between dep,anx,exh,cy,efficacy.pdf', dpi=500, bbox_inches='tight')
plt.savefig('content/cmh between dep,anx,exh,cy,efficacy.png', dpi=500, bbox_inches='tight')
plt.show()

In [None]:
# Per-sex correlation heatmaps for the columns at positions 15-19 (the
# depression / anxiety / exhaustion / cynicism / efficacy block of this section).
# The variable names keep their historical `empathy_` prefix; the plot titles
# name the columns that are actually shown.

# Filter the dataset for males (sex == 1)
male_data = train_set_copy[train_set_copy['sex'] == 1]
empathy_columns_male = male_data.iloc[:, np.r_[15:20]]
corr_matrix_empathy_male = empathy_columns_male.corr()

# Plot correlation heatmap for males
plt.figure(figsize=(8, 6))
sns.heatmap(corr_matrix_empathy_male, annot=True, cmap='coolwarm', vmin=-1, vmax=1)
plt.title('Correlation Heatmap for Distress and Efficacy Columns (Males)')
plt.show()

# Filter the dataset for females (sex == 2)
female_data = train_set_copy[train_set_copy['sex'] == 2]
empathy_columns_female = female_data.iloc[:, np.r_[15:20]]
corr_matrix_empathy_female = empathy_columns_female.corr()

# Plot correlation heatmap for females
plt.figure(figsize=(8, 6))
sns.heatmap(corr_matrix_empathy_female, annot=True, cmap='coolwarm', vmin=-1, vmax=1)
plt.title('Correlation Heatmap for Distress and Efficacy Columns (Females)')
plt.show()

No meaningful correlation between efficacy and empathy factors.

In [None]:
# Work on a copy so the loaded frame keeps its numeric sex codes
filtered_data = input.copy()
# Map the 'sex' column to 'Male', 'Female', and 'Other' strings
filtered_data['sex'] = filtered_data['sex'].map({1: 'Male', 2: 'Female', 3: 'Other'})

# Keep only the rows labelled Male or Female
filtered_data = filtered_data[filtered_data['sex'].isin(['Male', 'Female'])]
# Mean anxiety, cynicism and efficacy score per gender
mean_impacts = filtered_data.groupby('sex').agg({
    'anxiety': 'mean',
    'cynicism': 'mean',
    'efficacy': 'mean'
}).reset_index()

# Melt the DataFrame for easier plotting (one row per gender and measure)
mean_impacts_melted = mean_impacts.melt(id_vars='sex', value_vars=['anxiety', 'cynicism', 'efficacy'],
                                        var_name='Impact_Type', value_name='Mean_Impact')

# Create bar plots; the hue order is fixed explicitly so bar colours and legend
# entries cannot drift apart
plt.figure(figsize=(12, 6))
sns.barplot(data=mean_impacts_melted, x='Impact_Type', y='Mean_Impact', hue='sex',
            hue_order=['Female', 'Male'])
plt.title('Gender Differences in Mental Health Impact on Academic Efficacy')
plt.xlabel('Impact Type')
plt.ylabel('Mean Impact')
# Legend entries are taken from the plotted hue levels instead of a hand-written list
plt.legend(title='Gender')
plt.show()

In [None]:
import os

df=input.copy()
# Map the 'sex' column to 'Male' and 'Female' strings; any other code becomes NaN
df['sex'] = df['sex'].map({1: 'Male', 2: 'Female'})
# Group by curriculum year and gender, then calculate the mean values
grouped_data = df.groupby(['year', 'sex']).agg({
    'total_empathy': 'mean',
    'depression': 'mean',
    'anxiety': 'mean'
}).reset_index()

# Melt the DataFrame for easier plotting (one row per year, gender and measure)
melted_data = grouped_data.melt(id_vars=['year', 'sex'],
                                value_vars=['total_empathy', 'depression', 'anxiety'],
                                var_name='Measure', value_name='Mean_Value')


plt.style.use('default')  # Reset style to default
plt.rc('font', family='serif', size=16)
# Create line plots: colour encodes gender, line style and marker encode the measure
plt.figure(figsize=(8, 8))
# NOTE(review): `size` is a seaborn semantic (data) parameter, not a line width;
# confirm that size=19 produces the intended rendering and legend entries.
sns.lineplot(data=melted_data, x='year', y='Mean_Value', hue='sex', style='Measure', markers=True, size= 19)
# Add a grid with light dotted lines
plt.grid(True, linestyle=':', linewidth=0.5, color='grey')
#plt.title('Trends in Empathy, Cynicism, and Academic Efficacy Across Curriculum Years, Separated by Gender')
plt.xlabel('Curriculum Year')
plt.ylabel('Mean Value')
legend = plt.legend(title='Gender and Measure', bbox_to_anchor=(1,1), loc='upper left',fontsize=16)
# Overwrite the first seven legend texts by position
legend.get_texts()[0].set_text('Gender')
legend.get_texts()[1].set_text('Female')
legend.get_texts()[2].set_text('Male')
legend.get_texts()[3].set_text('Measure')
legend.get_texts()[4].set_text('Total Empathy')
legend.get_texts()[5].set_text('Depression')
legend.get_texts()[6].set_text('Anxiety')

# savefig does not create missing directories; make sure the target exists
os.makedirs('content', exist_ok=True)
plt.savefig('content/Trends in Empathy, Cynicism, and Academic Efficacy Across Curriculum Years, Separated by Gender.pdf', dpi=500, bbox_inches='tight')
plt.show()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Styled version of the gender/measure trend plot: open markers, explicit dash
# patterns, and the figure saved as both PDF and PNG.
from pathlib import Path

# Work on a copy so the label mapping below does not alter the loaded data
df = input.copy()

# Map the numeric 'sex' codes to 'Male' and 'Female' strings. Codes other
# than 1 and 2 become NaN and drop out of the groupby below.
df['sex'] = df['sex'].map({1: 'Male', 2: 'Female'})

# Group by curriculum year and gender, then calculate the mean values
grouped_data = df.groupby(['year', 'sex']).agg({
    'total_empathy': 'mean',
    'depression': 'mean',
    'anxiety': 'mean'
}).reset_index()

# Melt the DataFrame to long form: one row per (year, sex, measure)
melted_data = grouped_data.melt(id_vars=['year', 'sex'],
                                value_vars=['total_empathy', 'depression', 'anxiety'],
                                var_name='Measure', value_name='Mean_Value')

plt.style.use('default')  # Reset style to default
plt.rc('font', family='serif', size=12)

# Create line plots with distinguishable line styles and markers
plt.figure(figsize=(8, 4))
sns.lineplot(data=melted_data, x='year', y='Mean_Value', hue='sex', style='Measure',
             markers=['o', 's', '^'], dashes=[(2, 0), (2, 2), (6, 6)], markersize=10, linewidth=2.5)

plt.xlabel('Curriculum Year')
plt.ylabel('Mean Value')

# Manually set markers to have no fill
for line in plt.gca().get_lines():
    line.set_markerfacecolor('none')  # Remove marker fill (no fill)
    line.set_markeredgewidth(2)        # Set the marker edge width
    line.set_markeredgecolor(line.get_color())  # Set marker edge color to line color

# Add a grid with light dotted lines
plt.grid(True, linestyle=':', linewidth=0.75, color='black')

# Build the legend once, then relabel entries by their current text rather
# than by position. The two group headings are blanked out.
legend = plt.legend(bbox_to_anchor=(1, 0.8), fontsize=15)
legend_labels = {
    'sex': '',
    'Measure': '',
    'total_empathy': 'Total Empathy',
    'depression': 'Depression',
    'anxiety': 'Anxiety',
}
for text in legend.get_texts():
    text.set_text(legend_labels.get(text.get_text(), text.get_text()))

# savefig does not create missing directories, so make sure the target exists.
Path('content').mkdir(parents=True, exist_ok=True)
# Save the plot as a PDF
plt.savefig('content/Trends in Empathy, Cynicism, and Academic Efficacy Across Curriculum Years, Separated by Gender.pdf', dpi=500, bbox_inches='tight')
# Save the plot as a PNG
plt.savefig('content/Trends in Empathy, Cynicism, and Academic Efficacy Across Curriculum Years, Separated by Gender.png', dpi=500, bbox_inches='tight')
plt.show()


In [None]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import matplotlib.pyplot as plt

# For every outcome, regress the per-year mean on curriculum year separately
# for sex codes 1 and 2, print each OLS summary and plot data plus fitted line.
df = input.copy()
# List of dependent variables
dependent_vars = [
    'total_empathy', 'cognitive_empathy', 'affective_empathy',
    'modify_presentation', 'emotion_recognition', 'depression',
    'anxiety', 'exhaustion', 'cynicism', 'efficacy'
]

# Plot colour and legend label for each sex code
colors = {1: 'blue', 2: 'pink'}
labels = {1: 'Male', 2: 'Female'}

for var in dependent_vars:
    # One figure per outcome; both sexes are drawn on it.
    plt.figure(figsize=(10, 6))

    for sex in [1, 2]:  # Loop over male and female
        # Filter data by sex
        sex_df = df[df['sex'] == sex]

        # Aggregate by year: the regression below is fitted on the yearly
        # means, not on the individual rows.
        aggregated_df = sex_df.groupby('year')[var].mean().reset_index()

        X = aggregated_df[['year']]
        y = aggregated_df[var]
        X = sm.add_constant(X)  # Adds a constant term to the predictor

        model = sm.OLS(y, X).fit()
        print(f"Regression results for {var} (Sex={sex}):\n", model.summary())

        # Plot the yearly means and the fitted regression line
        plt.scatter(aggregated_df['year'], aggregated_df[var], color=colors[sex], label=f'{labels[sex]} (data)')
        plt.plot(aggregated_df['year'], model.predict(X), color=colors[sex], label=f'{labels[sex]} (regression)')

    plt.xlabel('Year')
    plt.ylabel(f'Mean {var.replace("_", " ").title()}')
    plt.title(f'Mean {var.replace("_", " ").title()} vs Year by Sex')
    plt.legend()
    plt.show()


In [None]:
import statsmodels.formula.api as smf

# Fit a mixed-effects model (grouped by `id`) of each outcome on curriculum
# year, separately for sex codes 1 and 2, and print every summary.
for var in dependent_vars:
    formula = f"{var} ~ year"
    for sex in [1, 2]:
        # Rows belonging to the current sex code
        sex_df = df.loc[df['sex'] == sex]

        model = smf.mixedlm(formula, data=sex_df, groups=sex_df["id"])
        result = model.fit()
        print(f"Mixed-Effects Model results for {var} (Sex={sex}):\n", result.summary())


In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Outcomes to plot
dependent_vars = [
    'total_empathy',
    'cognitive_empathy',
    'affective_empathy',
    'modify_presentation',
    'emotion_recognition',
    'depression',
    'anxiety',
    'exhaustion',
    'cynicism',
    'efficacy',
]

# One figure per outcome: raw observations and the fitted fixed-effect line
# (intercept + slope * year) for each sex code.
for var in dependent_vars:
    plt.figure(figsize=(10, 6))

    for sex in [1, 2]:
        sex_df = df[df['sex'] == sex]

        # Mixed-effects fit of the outcome on year, grouped by `id`
        result = smf.mixedlm(f"{var} ~ year", data=sex_df, groups=sex_df["id"]).fit()
        intercept, slope = result.params['Intercept'], result.params['year']

        # 100 evenly spaced points spanning the observed years
        years = np.linspace(sex_df['year'].min(), sex_df['year'].max(), 100)
        predicted_values = intercept + slope * years

        # Observations first, then the fitted line
        plt.scatter(sex_df['year'], sex_df[var], alpha=0.5, label=f'Sex={sex} Data')
        plt.plot(years, predicted_values, label=f'Sex={sex} Fit', linewidth=2)

    plt.xlabel('Year')
    plt.ylabel(var)
    plt.title(f'Regression Line for {var} by Sex')
    plt.legend()
    plt.show()


In [None]:
import statsmodels.genmod.generalized_estimating_equations as gee
from statsmodels.genmod.families import Gaussian

# Gaussian GEE of each outcome on curriculum year, grouped by `id`, fitted
# separately for sex codes 1 and 2; every summary is printed.
for var in dependent_vars:
    formula = f"{var} ~ year"
    for sex in [1, 2]:
        # Rows belonging to the current sex code
        sex_df = df.loc[df['sex'] == sex]

        model = gee.GEE.from_formula(formula, groups="id", data=sex_df, family=Gaussian())
        result = model.fit()
        print(f"GEE results for {var} (Sex={sex}):\n", result.summary())


In [None]:
!pip install pygam

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from pygam import LinearGAM, s

# For every outcome and each sex code, fit a single-spline-term LinearGAM of
# the outcome on curriculum year and plot the fitted curve over the raw data.
# Relies on `df` and `dependent_vars` being defined by earlier cells.
for var in dependent_vars:
    for sex in [1, 2]:
        # Rows belonging to the current sex code
        subset = df[df['sex'] == sex]

        # pyGAM is given a 2-D feature matrix with 'year' as its only column
        X = subset['year'].to_numpy().reshape(-1, 1)
        y = subset[var].to_numpy()

        # s(0) is a spline term on feature 0, i.e. 'year'
        gam = LinearGAM(s(0)).fit(X, y)

        # Evaluate the fit on 100 evenly spaced years across the observed range
        XX = np.linspace(X.min(), X.max(), 100)
        preds = gam.predict(XX)

        plt.figure(figsize=(10, 6))
        plt.scatter(X, y, color='gray', alpha=0.5)
        plt.plot(XX, preds, color='red')
        plt.title(f"GAM fit for {var} (Sex={sex})")
        plt.xlabel('Year')
        plt.ylabel(var)
        plt.show()


In [None]:
pip install xgboost

In [57]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Work on a private copy of the loaded survey data
df = input.copy()

# Outcomes to predict
dependent_vars = [
    'total_empathy',
    'cognitive_empathy',
    'affective_empathy',
    'modify_presentation',
    'emotion_recognition',
    'depression',
    'anxiety',
    'exhaustion',
    'cynicism',
    'efficacy',
]

# Predictors: curriculum year plus dummy-coded sex (first level dropped)
X = pd.get_dummies(df[['year', 'sex']], columns=['sex'], drop_first=True)

# Hold out 20% of the rows for testing; the seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    df[dependent_vars],
    test_size=0.2,
    random_state=42,
)


In [None]:
# Random Forest: one regressor per outcome, scored on the held-out rows
rf_results = {}
for var in dependent_vars:
    rf = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train[var])
    y_pred = rf.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test[var], y_pred))
    rf_results[var] = {"RMSE": rmse, "R2": r2_score(y_test[var], y_pred)}

# Report RMSE and R2 for each outcome
print("Random Forest Results:")
for var, metrics in rf_results.items():
    rmse, r2 = metrics['RMSE'], metrics['R2']
    print(f"{var}: RMSE={rmse}, R2={r2}")


In [None]:
# Gradient Boosting: one regressor per outcome, scored on the held-out rows
gb_results = {}
for var in dependent_vars:
    gb = GradientBoostingRegressor(n_estimators=100, random_state=42)
    y_pred = gb.fit(X_train, y_train[var]).predict(X_test)
    y_true = y_test[var]
    gb_results[var] = {
        "RMSE": np.sqrt(mean_squared_error(y_true, y_pred)),
        "R2": r2_score(y_true, y_pred),
    }

# Report RMSE and R2 for each outcome
print("Gradient Boosting Results:")
for var, metrics in gb_results.items():
    print(f"{var}: RMSE={metrics['RMSE']}, R2={metrics['R2']}")


In [None]:
# XGBoost: one regressor per outcome, scored on the held-out rows
xgb_results = {}
for var in dependent_vars:
    target_train, target_test = y_train[var], y_test[var]
    xgb = XGBRegressor(n_estimators=100, random_state=42)
    xgb.fit(X_train, target_train)
    y_pred = xgb.predict(X_test)
    xgb_results[var] = {
        "RMSE": np.sqrt(mean_squared_error(target_test, y_pred)),
        "R2": r2_score(target_test, y_pred),
    }

# Report RMSE and R2 for each outcome
print("XGBoost Results:")
for var, metrics in xgb_results.items():
    print(f"{var}: RMSE={metrics['RMSE']}, R2={metrics['R2']}")


# **Regression of curriculum years and gender as independent variables and each of the empathy, mental health, and burnout indicators separately as dependent variables**

In [None]:
# Define a function to run and print linear regression results
def run_regression(df, dependent_var):
    """Fit ``dependent_var ~ year + sex`` by OLS on ``df`` and print the summary.

    Args:
        df: DataFrame containing ``year``, ``sex`` and the outcome column.
        dependent_var: Name of the outcome column (left-hand side of the formula).
    """
    fitted = ols(f"{dependent_var} ~ year + sex", data=df).fit()
    print(f"\nResults for {dependent_var}:")
    print(fitted.summary())

# Outcomes to regress on year and sex
dependent_vars = [
    'total_empathy',
    'cognitive_empathy',
    'affective_empathy',
    'modify_presentation',
    'emotion_recognition',
    'depression',
    'anxiety',
    'exhaustion',
    'cynicism',
    'efficacy',
]

# Print one OLS summary per outcome, fitted on the full data set
for var in dependent_vars:
    run_regression(df=input, dependent_var=var)

In [None]:
import pandas as pd
from statsmodels.formula.api import ols

# Define a function to run and print linear regression results
def run_regression(df, dependent_var, sex_value):
    """Fit ``dependent_var ~ year`` by OLS on the rows of one sex and print the summary.

    Args:
        df: DataFrame containing ``year``, ``sex`` and the outcome column.
        dependent_var: Name of the outcome column.
        sex_value: Value of ``sex`` to keep. The printed heading reads 'Male'
            for 1 and 'Female' for any other value.
    """
    sex_label = 'Male' if sex_value == 1 else 'Female'
    subset = df.loc[df['sex'] == sex_value]
    model = ols(f"{dependent_var} ~ year", data=subset).fit()
    print(f"\nResults for {dependent_var} ({sex_label}):")
    print(model.summary())

# Outcomes to regress on year
dependent_vars = [
    'total_empathy',
    'cognitive_empathy',
    'affective_empathy',
    'modify_presentation',
    'emotion_recognition',
    'depression',
    'anxiety',
    'exhaustion',
    'cynicism',
    'efficacy',
]

# For each outcome print the male fit (sex code 1) and then the female fit (sex code 2)
for var in dependent_vars:
    for sex_code in (1, 2):
        run_regression(input, var, sex_value=sex_code)


In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from statsmodels.formula.api import ols

# Define a function to run and print linear regression results
def run_regression(df, dependent_var, sex_value):
    """Fit a standardized ``dependent_var ~ year`` OLS for one sex and print the summary.

    Both ``year`` and the outcome are z-scored with ``StandardScaler`` on a
    copy of the filtered rows, so the caller's DataFrame is left untouched.

    Args:
        df: DataFrame containing ``year``, ``sex`` and the outcome column.
        dependent_var: Name of the outcome column.
        sex_value: Value of ``sex`` to keep. The printed heading reads 'Male'
            for 1 and 'Female' for any other value.
    """
    subset = df.loc[df['sex'] == sex_value].copy()

    # Standardize the predictor first, then the outcome, reusing one scaler
    scaler = StandardScaler()
    for column in ('year', dependent_var):
        subset[column] = scaler.fit_transform(subset[[column]])

    model = ols(f"{dependent_var} ~ year", data=subset).fit()

    sex_label = 'Male' if sex_value == 1 else 'Female'
    print(f"\nResults for {dependent_var} ({sex_label}):")
    print(model.summary())

# Outcomes to regress on (standardized) year
dependent_vars = [
    'total_empathy',
    'cognitive_empathy',
    'affective_empathy',
    'modify_presentation',
    'emotion_recognition',
    'depression',
    'anxiety',
    'exhaustion',
    'cynicism',
    'efficacy',
]
df = input
# For each outcome print the male fit (sex code 1) and then the female fit (sex code 2)
for var in dependent_vars:
    for sex_code in (1, 2):
        run_regression(df, var, sex_value=sex_code)


In [None]:
def plot_regression(ax, df, dependent_var):
    """Draw ``dependent_var`` against ``year`` on ``ax``, coloured by sex.

    All rows are scattered with a per-sex colour. Regression lines are then
    overlaid for sex codes 1 and 2 only.

    Args:
        ax: Matplotlib axes to draw on.
        df: DataFrame containing ``year``, ``sex`` and the outcome column.
        dependent_var: Name of the outcome column plotted on the y-axis.
    """
    sex_colors = {1: "steelblue", 2: "violet", 3: "orange"}
    sns.scatterplot(ax=ax, x='year', y=dependent_var, hue='sex', data=df, palette=sex_colors)
    for sex_code in (1, 2):
        sns.regplot(ax=ax, x='year', y=dependent_var, data=df[df['sex'] == sex_code],
                    scatter=False, color=sex_colors[sex_code])
    label = dependent_var.capitalize()
    ax.set_title(f'{label} vs Year')
    ax.set_xlabel('Year')
    ax.set_ylabel(label)

# 2x5 grid of panels, one per outcome
fig, axes = plt.subplots(nrows=2, ncols=5, figsize=(20, 10))

# Work with a flat sequence of axes so it can be paired with the outcomes
axes = axes.ravel()

# Draw the year-vs-outcome regression plot in each panel
for ax, var in zip(axes, dependent_vars):
    plot_regression(ax, input, var)

# Avoid overlapping titles and labels
plt.tight_layout()
plt.show()

In [None]:
df = input
# Split the data by sex code (1 is printed as Male, 2 as Female)
df_sex1 = df[df['sex'] == 1]
df_sex2 = df[df['sex'] == 2]

# Print the simple-regression slope of each outcome on year, all male slopes
# first and then all female slopes.
for sex_label, subset in (('Male', df_sex1), ('Female', df_sex2)):
    for var in dependent_vars:
        slope, intercept, r_value, p_value, std_err = linregress(subset['year'], subset[var])
        print(f"Slope for {var} [{sex_label}] : {slope}")



In [None]:
import pandas as pd
from statsmodels.formula.api import ols
from sklearn.preprocessing import StandardScaler

# Standardized OLS of every outcome on dummy-coded curriculum year and sex.
# 'year' is dummy-coded with its first level dropped (the reference category).
# 'sex' is dummy-coded with all levels kept, but only sex_2 and sex_3 enter the
# model, which leaves the remaining level as the reference.
data = pd.get_dummies(input, columns=['year'], drop_first=True)
data = pd.get_dummies(data, columns=['sex'])

# List of dependent variables
dependent_vars = [
    'total_empathy', 'cognitive_empathy', 'affective_empathy',
    'modify_presentation', 'emotion_recognition', 'depression',
    'anxiety', 'exhaustion', 'cynicism', 'efficacy'
]

# List of independent variables
independent_vars = ['year_2', 'year_3', 'year_4', 'year_5', 'year_6',  'sex_2', 'sex_3']

# Standardize the independent variables
scaler = StandardScaler()
data[independent_vars] = scaler.fit_transform(data[independent_vars])

# Right-hand side shared by every model
predictor_terms = ' + '.join(independent_vars)

# Keep every fitted model so a specific one can be used after the loop
fitted_models = {}

# Loop through each dependent variable and fit a regression model
for dependent_var in dependent_vars:
    # Standardize the dependent variable (in place, in `data`)
    data[dependent_var] = scaler.fit_transform(data[[dependent_var]])
    formula = f'{dependent_var} ~ {predictor_terms}'

    # Fit the regression model
    model = ols(formula, data=data).fit()
    fitted_models[dependent_var] = model

    # Print the summary of the regression model
    print(f'Results for {dependent_var}:\n')
    print(model.summary())
    print('\n' + '-'*80 + '\n')

# Predicted empathy must come from the total_empathy model. After the loop,
# `model` holds the fit for the last outcome in the list instead.
# The values are in standardized units because the outcome was z-scored above.
data['predicted_empathy'] = fitted_models['total_empathy'].fittedvalues


In [None]:
# Standardized OLS of every outcome on dummy-coded curriculum year and the
# numeric 'sex' code. 'year' is dummy-coded with its first level dropped
# (the reference category).
data = pd.get_dummies(input, columns=['year'], drop_first=True)
# List of dependent variables
dependent_vars = [
    'total_empathy', 'cognitive_empathy', 'affective_empathy',
    'modify_presentation', 'emotion_recognition', 'depression',
    'anxiety', 'exhaustion', 'cynicism', 'efficacy'
]

# List of independent variables
independent_vars = ['year_2', 'year_3', 'year_4', 'year_5', 'year_6', 'sex']

# Standardize the independent variables
scaler = StandardScaler()
data[independent_vars] = scaler.fit_transform(data[independent_vars])

# Right-hand side shared by every model
predictor_terms = ' + '.join(independent_vars)

# Keep every fitted model so a specific one can be used after the loop
fitted_models = {}

# Loop through each dependent variable and fit a regression model
for dependent_var in dependent_vars:
    # Standardize the dependent variable (in place, in `data`)
    data[dependent_var] = scaler.fit_transform(data[[dependent_var]])
    formula = f'{dependent_var} ~ {predictor_terms}'

    # Fit the regression model
    model = ols(formula, data=data).fit()
    fitted_models[dependent_var] = model

    # Print the summary of the regression model
    print(f'Results for {dependent_var}:\n')
    print(model.summary())
    print('\n' + '-'*80 + '\n')

# Predicted empathy must come from the total_empathy model. After the loop,
# `model` holds the fit for the last outcome in the list instead.
# The values are in standardized units because the outcome was z-scored above.
data['predicted_empathy'] = fitted_models['total_empathy'].fittedvalues


In [None]:
from statsmodels.regression.mixed_linear_model import MixedLM
# Outcome columns of interest. NOTE(review): this list is not used below,
# only total_empathy is modelled.
Outcome = [
    'total_empathy',
    'cognitive_empathy',
    'affective_empathy',
    'modify_presentation',
    'emotion_recognition',
    'depression',
    'anxiety',
    'exhaustion',
    'cynicism',
    'efficacy',
]
df = input

# Mixed linear model of total empathy on year and sex, grouped by `id`
empathy_formula = "total_empathy ~ year + sex"
model = MixedLM.from_formula(empathy_formula, groups="id", data=df)
result = model.fit()
print(result.summary())


Attempt to emulate Table 2 of the original paper. (Note: the results below are not yet understood.)

In [None]:
import pandas as pd
import statsmodels.api as sm
from statsmodels.formula.api import ols
from sklearn.preprocessing import StandardScaler
from statsmodels.stats.anova import anova_lm

# Build the modelling frame from the loaded survey data: 'year' is dummy-coded
# with its first level dropped.
data = pd.get_dummies(input, columns=['year'], drop_first=True)

# Predictors entering every model
independent_vars = ['year_2', 'year_3', 'year_4', 'year_5', 'year_6', 'sex']

# z-score the predictors
scaler = StandardScaler()
data[independent_vars] = scaler.fit_transform(data[independent_vars])

# Outcomes, each modelled separately
dependent_vars = [
    'total_empathy',
    'cognitive_empathy',
    'affective_empathy',
    'modify_presentation',
    'emotion_recognition',
    'depression',
    'anxiety',
    'exhaustion',
    'cynicism',
    'efficacy',
]

# One record per outcome: coefficients, standard errors and eta squared
results = []

for dependent_var in dependent_vars:
    # z-score the outcome too, so the coefficients are standardized betas
    data[dependent_var] = scaler.fit_transform(data[[dependent_var]])

    formula = f'{dependent_var} ~ year_2 + year_3 + year_4 + year_5 + year_6 + sex '
    model = ols(formula, data=data).fit()
    print(model.summary())

    # Eta squared: each term's sum of squares over the total sum of squares.
    # The last row of the ANOVA table is left out of the numerator.
    sum_sq = anova_lm(model)['sum_sq']
    results.append({
        'Dependent Variable': dependent_var,
        'Beta Coefficients': model.params,
        'Standard Errors': model.bse,
        'Eta Squared': sum_sq.iloc[:-1] / sum_sq.sum(),
    })

# Print the collected statistics, one section per outcome
separator = '\n' + '-'*80 + '\n'
for entry in results:
    print(f"Results for {entry['Dependent Variable']}:")
    for heading in ('Beta Coefficients', 'Standard Errors', 'Eta Squared'):
        print(f"{heading}:\n{entry[heading]}")
    print(separator)


In [None]:
# Outcomes, each regressed on year and sex
dependent_vars = [
    'total_empathy',
    'cognitive_empathy',
    'affective_empathy',
    'modify_presentation',
    'emotion_recognition',
    'depression',
    'anxiety',
    'exhaustion',
    'cynicism',
    'efficacy',
]

# Multiple linear regression y = k0 + k1*x1 + ... + kn*xn + error, with
# x = (year, sex). The design matrix (with intercept column) is the same for
# every outcome, so it is built once.
design = sm.add_constant(input[['year', 'sex']])
for var in dependent_vars:
    model = sm.OLS(input[var], design).fit()
    print(f"\nResults for {var}:")
    print(model.summary())

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

# Fit a scikit-learn linear regression of every outcome on year and sex and
# print its coefficients and intercept.
df = input
X1 = df[['year', 'sex']]

for var in dependent_vars:
    y = df[var]

    # Fit the model for this outcome
    model1 = LinearRegression().fit(X1, y)
    # Name the outcome in the output so the lines can be told apart;
    # coefficients are in the column order of X1 (year, sex).
    print(f'{var} - Model 1 Coefficients: {model1.coef_}, Intercept: {model1.intercept_}')

# **Linear regressions in which each empathy indicator is considered separately as the independent variable, with each of the mental health and burnout indicators as the dependent variable**

In [None]:
import statsmodels.api as sm

# Predictor side: the empathy scores
empathy_indicators = [
    'total_empathy',
    'cognitive_empathy',
    'affective_empathy',
    'modify_presentation',
    'emotion_recognition',
]
# Outcome side: the two indicator groups are concatenated into one list
mental_health_indicators = ['depression', 'anxiety', 'exhaustion', 'cynicism']
burnout_indicators = ['efficacy']
dependent_vars = [*mental_health_indicators, *burnout_indicators]

# Split the data by sex code (1 is used for males, 2 for females)
df_males = input.loc[input['sex'] == 1]
df_females = input.loc[input['sex'] == 2]

# Function to fit and summarize OLS models
def fit_and_summarize_ols(df, empathy_indicators, dependent_vars, sex_label):
    """Fit a simple OLS for every (empathy indicator, outcome) pair and print each summary.

    Args:
        df: DataFrame containing the indicator and outcome columns.
        empathy_indicators: Column names used one at a time as the predictor.
        dependent_vars: Column names used one at a time as the outcome.
        sex_label: Text shown in the printed heading; not used in the fit.

    Returns:
        dict mapping ``(indicator, outcome)`` to the fitted model's ``params``.
    """
    results = {}
    for ind in empathy_indicators:
        # The design matrix depends only on the predictor, so build it once
        design = sm.add_constant(df[ind])
        for var in dependent_vars:
            model = sm.OLS(df[var], design).fit()
            results[(ind, var)] = model.params
            print(f"\nResults for {var} [Sex: {sex_label}, Empathy Indicator: {ind}]:")
            print(model.summary())
    return results

# Fit models for males and females
print("Fitting models for males...")
results_males = fit_and_summarize_ols(df_males, empathy_indicators, dependent_vars, 'Male')

print("\nFitting models for females...")
results_females = fit_and_summarize_ols(df_females, empathy_indicators, dependent_vars, 'Female')

# Compare the slope of each model between the sexes. Each stored `params`
# Series is indexed by term name, so the slope (the second entry, after the
# constant) is taken with .iloc. An integer key in plain [] on a
# label-indexed Series is a deprecated positional fallback.
comparison_results = []
for (ind, var), male_coeff in results_males.items():
    female_coeff = results_females[(ind, var)]
    comparison_results.append((ind, var, male_coeff.iloc[1], female_coeff.iloc[1]))

# Display comparison results
comparison_df = pd.DataFrame(comparison_results, columns=['Empathy Indicator', 'Dependent Variable', 'Male Coefficient', 'Female Coefficient'])
print("\nComparison of Coefficients:")
print(comparison_df)




In [None]:
def plot_regression(ax, df, dependent_var,ind):
    """Draw ``dependent_var`` against ``ind`` on ``ax``, coloured by sex.

    All rows are scattered with a per-sex colour. Regression lines are then
    overlaid for sex codes 1 and 2 only.

    Args:
        ax: Matplotlib axes to draw on.
        df: DataFrame containing ``sex``, ``ind`` and ``dependent_var``.
        dependent_var: Column plotted on the y-axis.
        ind: Column plotted on the x-axis.
    """
    sex_colors = {1: "steelblue", 2: "violet", 3: "orange"}
    sns.scatterplot(ax=ax, x=ind, y=dependent_var, hue='sex', data=df, palette=sex_colors)
    for sex_code in (1, 2):
        sns.regplot(ax=ax, x=ind, y=dependent_var, data=df[df['sex'] == sex_code],
                    scatter=False, color=sex_colors[sex_code])
    x_label = ind.capitalize()
    y_label = dependent_var.capitalize()
    ax.set_title(f'{y_label} vs {x_label}')
    ax.set_xlabel(x_label)
    ax.set_ylabel(y_label)

# 5x5 grid of panels (up to 25 indicator/outcome pairs)
fig, axes = plt.subplots(nrows=5, ncols=5, figsize=(20, 20))

# Work with a flat sequence of axes so it can be paired with the combinations
axes = axes.ravel()

# Every (empathy indicator, outcome) combination, indicator-major order
pairs = [(ind, var) for ind in empathy_indicators for var in dependent_vars]

# Draw one regression plot per combination
for ax, (ind, var) in zip(axes, pairs):
    plot_regression(ax, input, var, ind)

# Avoid overlapping titles and labels
plt.tight_layout()
plt.show()

In [None]:
# Split the data by sex code: 1 is used for males, 2 for females
is_male = input['sex'] == 1
is_female = input['sex'] == 2
df_males = input[is_male]
df_females = input[is_female]

# Function to calculate slopes for each combination
def calculate_slopes(df, empathy_indicators, dependent_vars, sex_label):
    """Compute and print the simple-regression slope for every indicator/outcome pair.

    Args:
        df: DataFrame containing the indicator and outcome columns.
        empathy_indicators: Column names used one at a time as the predictor (x).
        dependent_vars: Column names used one at a time as the outcome (y).
        sex_label: Text shown in the printed line; not used in the computation.

    Returns:
        list of ``(indicator, outcome, slope)`` tuples, indicator-major order.
    """
    slopes = []
    pairs = [(ind, var) for ind in empathy_indicators for var in dependent_vars]
    for ind, var in pairs:
        # linregress returns (slope, intercept, r, p, stderr); only the slope is kept
        slope = linregress(df[ind], df[var])[0]
        slopes.append((ind, var, slope))
        print(f"Slope for {var} [Sex: {sex_label}, Empathy Indicator: {ind}] : {slope}")
    return slopes

# Slopes for males
print("Calculating slopes for males...")
slopes_males = calculate_slopes(df_males, empathy_indicators, dependent_vars, 'Male')

# Slopes for females
print("\nCalculating slopes for females...")
slopes_females = calculate_slopes(df_females, empathy_indicators, dependent_vars, 'Female')

# Side-by-side table: the male rows define the pairs, and the female slopes are
# added as a column in the same order.
female_slopes = [entry[2] for entry in slopes_females]
comparison_df = pd.DataFrame(
    slopes_males,
    columns=['Empathy Indicator', 'Dependent Variable', 'Male Slope'],
).assign(**{'Female Slope': female_slopes})

print("\nComparison of Slopes:")
print(comparison_df)

In [None]:
import pandas as pd
from scipy import stats  # For the t-test
from statsmodels.formula.api import ols  # For the OLS model

# Compare total empathy between the sexes, then model year, sex and their
# interaction.
data = input
total_missing = data.isnull().sum().sum()
print(f'Total missing values: {total_missing}')

# 'sex' is coded numerically in this notebook (1 = male, 2 = female), so the
# groups are selected by code rather than by string label.
male_empathy = data.loc[data['sex'] == 1, 'total_empathy']
female_empathy = data.loc[data['sex'] == 2, 'total_empathy']

# The two groups are different people, not paired measurements, so an
# independent-samples test is used. equal_var=False selects Welch's t-test,
# which does not assume equal variances. Missing values are ignored.
t_stat, p_value = stats.ttest_ind(male_empathy, female_empathy, equal_var=False, nan_policy='omit')
print(f'Total Empathy T-test: t-stat={t_stat}, p-value={p_value}')

# Alternatively, for multiple years: categorical year and sex plus their interaction
model = ols('total_empathy ~ C(year) + C(sex) + C(year):C(sex)', data=data).fit()
print(model.summary())
