In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

import statsmodels.api as sm

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.metrics import r2_score, mean_squared_error

In [None]:
#load the raw questionnaire responses (path is relative to the notebook's working directory)
df = pd.read_excel("../data/Data Responden Artikel Ilmiah Callista dan Alexandra.xlsx")

#info() prints by itself, so it can sit anywhere in the cell
df.info()

#the preview goes last: a notebook only renders the final expression of a cell,
#so a head() call placed earlier is evaluated and thrown away.
#the respondent-name column is left out so names do not end up in saved output.
df.loc[:, df.columns.str.strip() != "Nama"].head()

In [None]:
#clean column names first, so the drop below matches the name column
#whether or not the spreadsheet header carries stray whitespace
df.columns = df.columns.str.strip()

#drop name column if present (errors="ignore" keeps the cell re-runnable)
df = df.drop(columns=["Nama"], errors="ignore")

#check missing values; printed explicitly because a bare expression in the
#middle of a cell produces no output
print(df.isnull().sum())

df.info()

In [None]:
#encode demographic variables
#each entry: (source column, encoded column, label -> code mapping)
demographic_encodings = [
    ('Jenis Kelamin', 'Gender', {'Laki-laki': 0, 'Perempuan': 1}),
    ('Jenjang Pendidikan', 'Education_Level', {'SMA/SMK': 0, 'Mahasiswa': 1}),
]

for source_col, target_col, codes in demographic_encodings:
    encoded = df[source_col].map(codes)

    #Series.map turns any label missing from the mapping into NaN without warning;
    #surface those labels here instead of letting NaN reach the models later
    unmapped = df.loc[encoded.isna() & df[source_col].notna(), source_col].unique()
    if len(unmapped) > 0:
        raise ValueError(
            f"Unmapped values in '{source_col}': {list(unmapped)}"
        )

    df[target_col] = encoded

#rename usia -> age
df = df.rename(columns={"Usia": "Age"})

In [None]:
def _columns_for_questions(first, last):
    """Return the columns of df whose names start with "<n>." for n in first..last (inclusive)."""
    prefixes = tuple(f"{number}." for number in range(first, last + 1))
    return [name for name in df.columns if name.startswith(prefixes)]


#Q1-6
exposure_cols = _columns_for_questions(1, 6)

#Q7-18 (attention-related difficulties NOT attention span)
attention_cols = _columns_for_questions(7, 18)

#Q19-25 (academic functioning NOT overall academic performance)
academic_cols = _columns_for_questions(19, 25)

#number of columns matched per block
len(exposure_cols), len(attention_cols), len(academic_cols)

In [None]:

#index construction: every index is the row-wise mean of its block of item columns
index_items = {
    "Exposure_Index": exposure_cols,
    "Attention_Difficulty_Index": attention_cols,
    "Academic_Functioning_Index": academic_cols,
}

for index_name, item_cols in index_items.items():
    df[index_name] = df[item_cols].mean(axis=1)

#summary statistics of the three indices
df[list(index_items)].describe()

In [None]:
#index distributions: one histogram + KDE panel per index
panels = [
    ("Exposure_Index", "Exposure Index Distribution"),
    ("Attention_Difficulty_Index", "Attention Difficulty Index Distribution"),
    ("Academic_Functioning_Index", "Academic Functioning Index Distribution"),
]

fig, axes = plt.subplots(1, 3, figsize=(15, 4))

for panel_ax, (column, title) in zip(axes, panels):
    sns.histplot(df[column], kde=True, ax=panel_ax)
    panel_ax.set_title(title)

plt.tight_layout()
plt.show()

In [None]:
#correlation analysis: pairwise correlations between the three indices
index_columns = [
    "Exposure_Index",
    "Attention_Difficulty_Index",
    "Academic_Functioning_Index",
]
corr = df[index_columns].corr()

#heatmap() draws on the current axes and returns them, so the title is set on that object
heatmap_ax = sns.heatmap(corr, annot=True, cmap="coolwarm")
heatmap_ax.set_title("Correlation Between Exposure, Attention Difficulty, and Academic Functioning")
plt.show()

In [None]:
#regression models
def _fit_ols(outcome, predictors):
    """Fit an OLS regression of one df column on one or more others.

    outcome: name of the dependent-variable column in df.
    predictors: a column name, or a list of column names, used as regressors;
        an intercept column is added with sm.add_constant.

    Prints the statsmodels summary and returns the fitted results object.
    """
    design = sm.add_constant(df[predictors])
    fitted = sm.OLS(df[outcome], design).fit()
    print(fitted.summary())
    return fitted


#model a: attention difficulty index ~ exposure index
model_a = _fit_ols("Attention_Difficulty_Index", "Exposure_Index")

#model b: academic functioning index ~ exposure index
model_b = _fit_ols("Academic_Functioning_Index", "Exposure_Index")

#model c: academic functioning index ~ attention difficulty index
model_c = _fit_ols("Academic_Functioning_Index", "Attention_Difficulty_Index")

#model d: academic functioning index ~ exposure index + attention difficulty index
model_d = _fit_ols(
    "Academic_Functioning_Index",
    ["Exposure_Index", "Attention_Difficulty_Index"],
)

In [None]:
#predictive modeling: predict the academic functioning index from the two
#other indices plus demographics, scored on a held-out 25% split
features = [
    "Exposure_Index",
    "Attention_Difficulty_Index",
    "Age",
    "Education_Level",
    "Gender"
]

X = df[features]
y = df["Academic_Functioning_Index"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)


def _fit_and_score(model):
    """Fit model on the training split and score it on the test split.

    Returns a tuple (test predictions, test R², test RMSE).
    """
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    r2 = r2_score(y_test, preds)
    rmse = np.sqrt(mean_squared_error(y_test, preds))
    return preds, r2, rmse


#linear regression
lr = LinearRegression()
lr_preds, lr_r2, lr_rmse = _fit_and_score(lr)

#random forest
rf = RandomForestRegressor(
    n_estimators=200,
    random_state=42
)
rf_preds, rf_r2, rf_rmse = _fit_and_score(rf)

#gradient boosting regressor
gbr = GradientBoostingRegressor(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=3,
    random_state=42
)
gbr_preds, gbr_r2, gbr_rmse = _fit_and_score(gbr)

#xgboost
xgb = XGBRegressor(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=4,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42
)
xgb_preds, xgb_r2, xgb_rmse = _fit_and_score(xgb)

#one row per model; the R² column label is spelled with a proper superscript two
results = pd.DataFrame({
    "Model": ["Linear Regression", "Random Forest", "Gradient Boosting", "XGBoost"],
    "Test R²": [lr_r2, rf_r2, gbr_r2, xgb_r2],
    "Test RMSE": [lr_rmse, rf_rmse, gbr_rmse, xgb_rmse]
})

results

In [None]:
#feature importance: random-forest importances, largest first
importance_table = pd.DataFrame({
    "Feature": features,
    "Importance": rf.feature_importances_
})
importance_df = importance_table.sort_values(by="Importance", ascending=False)

#horizontal bars, one per feature, in the sorted order of importance_df
importance_ax = sns.barplot(data=importance_df, x="Importance", y="Feature")
importance_ax.set_title("Random Forest Feature Importance (Exploratory)")
plt.show()

In [None]:
#write the working frame (original columns plus encoded and index columns) to CSV, without the row index
processed_path = "../data/processed.csv"
df.to_csv(processed_path, index=False)

In [None]:
#item-level questionnaire columns plus demographics
#(presumably the input for a PLS analysis, going by the file names)
pls_columns = (
    exposure_cols
    + attention_cols
    + academic_cols
    + ["Age", "Education_Level", "Gender"]
)
pls_df = df[pls_columns]

pls_df.to_csv("../data/pls_data.csv", index=False)

#fixed-seed subsample of up to 100 respondents; DataFrame.sample raises when n
#exceeds the number of rows, so the size is capped at what is available
sample_size = min(100, len(pls_df))
df_pls = pls_df.sample(n=sample_size, random_state=42)
df_pls.to_csv("../data/pls_data_100.csv", index=False)