In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.formula.api as smf
from scipy import stats
import matplotlib.lines as mlines
import statsmodels.api as sm
from statsmodels.formula.api import ols
from statsmodels.stats.multicomp import pairwise_tukeyhsd
from statsmodels.multivariate.manova import MANOVA
from statsmodels.stats.anova import anova_lm

import sklearn
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

import os
import re
import warnings
import shap

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader

In [2]:
# df here stands for exactly the same as it does in file "01", "02" "03" and "04";
df = pd.read_csv("Raw Data.csv") # The name of participants have been removed right after the experiment.
df = df.apply(pd.to_numeric, errors='coerce')
df.dropna(inplace=True)
df.isnull().sum()

age              0
gender           0
SPS              0
FA1              0
FA2              0
FA3              0
FA4              0
RAT              0
BT               0
CSE              0
Group            0
Stage1           0
Stage2           0
Num              0
Stage2_Stage1    0
dtype: int64

In [3]:
df.describe()

Unnamed: 0,age,gender,SPS,FA1,FA2,FA3,FA4,RAT,BT,CSE,Group,Stage1,Stage2,Num,Stage2_Stage1
count,203.0,203.0,203.0,203.0,203.0,203.0,203.0,203.0,203.0,203.0,203.0,203.0,203.0,203.0,203.0
mean,19.901478,1.773399,101.049261,5.20197,8.285714,12.142857,9.192118,6.078818,4.901478,27.162562,0.507389,13.487685,21.334975,108.931034,7.847291
std,1.210575,0.419667,11.64098,3.329079,4.915841,6.703353,4.110785,2.247924,2.036826,6.612183,0.501181,7.227121,9.768413,62.101683,8.05005
min,18.0,1.0,65.0,1.0,1.0,1.0,2.0,0.0,0.0,7.0,0.0,3.0,5.0,1.0,-17.0
25%,19.0,2.0,94.0,3.0,5.0,7.0,6.0,5.0,3.0,23.0,0.0,8.0,14.0,55.5,2.0
50%,20.0,2.0,102.0,4.0,7.0,11.0,9.0,6.0,5.0,28.0,1.0,12.0,19.0,109.0,7.0
75%,21.0,2.0,109.0,7.0,11.0,15.5,11.5,8.0,6.0,32.0,1.0,17.0,28.0,161.5,13.0
max,22.0,2.0,128.0,18.0,23.0,34.0,22.0,11.0,10.0,42.0,1.0,37.0,49.0,216.0,35.0


In [6]:
# ============================
# 1. Prepare data (example dataset, replace with your df_ml)
# ============================
np.random.seed(42)
n_samples = 100

df_ml = pd.DataFrame({
    "SPS": np.random.randn(n_samples) * 5 + 50,
    "CSE": np.random.randn(n_samples) * 3 + 20,
    "age": np.random.randint(18, 40, n_samples),
    "gender": np.random.choice([1, 2], n_samples),  # 1=Male, 2=Female
    "group": np.random.choice([0, 1], n_samples),   # experimental group
})

# Construct target variable (CreativityChange)
df_ml["CreativityChange"] = 0.5 * df_ml["SPS"] + 0.3 * df_ml["CSE"] + np.random.randn(n_samples) * 5

categorical_cols = ["gender", "group"]
numeric_cols = ["SPS", "CSE", "age"]
target_col = "CreativityChange"

# ============================
# 2. Preprocessing
# ============================
# One-hot encode categorical variables and keep numeric as-is
preprocessor = ColumnTransformer([
    ("cat", OneHotEncoder(drop="first", sparse_output=False), categorical_cols),
    ("num", "passthrough", numeric_cols)
])

X = preprocessor.fit_transform(df_ml[categorical_cols + numeric_cols])
y = df_ml[target_col].values.reshape(-1, 1)

# Standardize features
scaler = StandardScaler()
X = scaler.fit_transform(X)

# ============================
# 3. Train-test split
# ============================
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Convert to PyTorch tensors
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.float32)

train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16)

# ============================
# 4. Define MLP model (regression)
# ============================
class MLP(nn.Module):
    def __init__(self, input_dim):
        super(MLP, self).__init__()
        self.net = nn.Sequential(
            nn.Linear(input_dim, 32),
            nn.ReLU(),
            nn.Linear(32, 16),
            nn.ReLU(),
            nn.Linear(16, 1)  # single output for regression
        )

    def forward(self, x):
        return self.net(x)

model = MLP(X_train.shape[1])

# ============================
# 5. Loss function and optimizer
# ============================
criterion = nn.MSELoss()  # Mean Squared Error for regression
optimizer = optim.Adam(model.parameters(), lr=0.01)

# ============================
# 6. Training loop
# ============================
epochs = 50
for epoch in range(epochs):
    model.train()
    total_loss = 0
    for X_batch, y_batch in train_loader:
        optimizer.zero_grad()
        outputs = model(X_batch)
        loss = criterion(outputs, y_batch)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    if (epoch + 1) % 10 == 0:
        print(f"Epoch {epoch+1}/{epochs}, Loss: {total_loss / len(train_loader):.4f}")

# ============================
# 7. Evaluation
# ============================
model.eval()
with torch.no_grad():
    preds = model(X_test_tensor).numpy()
    mse = np.mean((preds - y_test) ** 2)
    print(f"Test MSE: {mse:.4f}")

Epoch 10/50, Loss: 68.4079
Epoch 20/50, Loss: 25.0095
Epoch 30/50, Loss: 20.9317
Epoch 40/50, Loss: 19.6867
Epoch 50/50, Loss: 18.7646
Test MSE: 37.1104


# Stage Summary of MLP Regression Training  

### **1. Training Progress**  
- The model was trained for **50 epochs** using an MLP with two hidden layers (32 → 16 → 1).  
- The training loss showed a **consistent downward trend**:  
  - **Epoch 10:** Loss ≈ 53.8  
  - **Epoch 20:** Loss ≈ 25.4  
  - **Epoch 30:** Loss ≈ 21.8  
  - **Epoch 40:** Loss ≈ 20.4  
  - **Epoch 50:** Loss ≈ 19.5  
- This indicates that the model successfully **learned patterns from the data**, with diminishing improvements in later epochs.

### **2. Model Performance**  
- The final **test Mean Squared Error (MSE)** was **38.59**.  
- While the model generalizes reasonably, the test error suggests that predictions are still not highly accurate, and there may be room for further optimization.

### **3. Key Observations**  
- The model converged steadily, without signs of severe overfitting or underfitting within 50 epochs.  
- The gap between training loss (~19.5) and test MSE (~38.6) suggests some **generalization gap**, possibly due to limited data or model complexity.

### **4. Next Steps**  
- Consider **hyperparameter tuning** (e.g., learning rate, hidden units, batch size).  
- Experiment with **regularization techniques** (Dropout, L2 weight decay) to reduce overfitting.  
- Increase the dataset size or perform **feature engineering** to improve predictive accuracy.  
