In [None]:
# day 1 - data handling & cleaning

In [None]:
#imports
import os

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

import statsmodels.api as sm
from sklearn.preprocessing import StandardScaler

In [None]:
#to check directory
# Show the kernel's current working directory; every relative path used in this
# notebook ("../data/...", "../figures/...") is resolved against it.
os.getcwd()

In [None]:
# Load the raw survey responses from the Excel workbook and preview the first rows
raw_excel_path = "../data/Data Responden Artikel Ilmiah Callista dan Alexandra.xlsx"
df = pd.read_excel(raw_excel_path)
df.head()

In [None]:
# Print pandas' structural summary of the freshly loaded DataFrame
df.info()

In [None]:
# Full list of column headers exactly as they were read from the workbook
df.columns.tolist()

In [None]:
# Freeze an untouched copy of the loaded data as CSV (backup), index column omitted
backup_path = "../data/raw_data_backup.csv"
df.to_csv(backup_path, index=False)

In [None]:
# Remove the respondent-name column. The header is matched after trimming
# whitespace, so both "Nama " (trailing space) and "Nama" are dropped; when no
# such column exists the list is empty and the drop is a no-op, which keeps the
# cell safe to re-run.
name_columns = [col for col in df.columns if str(col).strip() == "Nama"]
df = df.drop(columns=name_columns)

# Normalise gender labels: trim whitespace and capitalise the first letter.
# astype(str) alone would turn a missing answer into the literal text "Nan" and
# hide it from the missing-value check that follows, so the original NaN
# positions are restored with .where(...).
if "Jenis Kelamin" in df.columns:
    gender_raw = df["Jenis Kelamin"]
    gender_clean = gender_raw.astype(str).str.strip().str.capitalize()
    df["Jenis Kelamin"] = gender_clean.where(gender_raw.notna())

In [None]:
# Missing values per column, columns with the most gaps listed first
missing_counts = df.isna().sum()
missing_counts.sort_values(ascending=False)

In [None]:
# Persist the cleaned day-1 dataset as CSV (index column omitted)
day1_path = "../data/day1.csv"
df.to_csv(day1_path, index=False)

In [None]:
# Quick sanity EDA: histogram + KDE of the 'Usia' column [NOT FOR PAPER]
fig, ax = plt.subplots()
sns.histplot(df['Usia'], kde=True, ax=ax)
ax.set_title("Age Distribution (Sanity Check)")
plt.show()

In [None]:
# day 2 - index construction & core analysis

In [None]:
# Reload the day-1 output and trim stray whitespace from every column header
day1_path = "../data/day1.csv"
df = pd.read_csv(day1_path).rename(columns=str.strip)

df.head()

In [None]:
# Questionnaire items are picked by the question number that prefixes each
# column header ("<number>."), grouped into the three constructs.
question_ranges = {
    "exposure": range(1, 7),     # Q1–Q6
    "attention": range(7, 19),   # Q7–Q18
    "academic": range(19, 26),   # Q19–Q25
}

item_columns = {}
for construct, question_numbers in question_ranges.items():
    prefixes = tuple(f"{number}." for number in question_numbers)
    item_columns[construct] = [col for col in df.columns if col.startswith(prefixes)]

exposure_cols = item_columns["exposure"]
attention_cols = item_columns["attention"]
academic_cols = item_columns["academic"]

#sanity check: number of items found per construct
len(exposure_cols), len(attention_cols), len(academic_cols)

In [None]:
# Each construct index is the row-wise mean of that construct's item columns
index_sources = {
    'Exposure_Index': exposure_cols,
    'Attention_Index': attention_cols,
    'Academic_Index': academic_cols,
}
for index_name, item_cols in index_sources.items():
    df[index_name] = df[item_cols].mean(axis=1)

df[list(index_sources)].describe()

In [None]:
#index distributions
# One histogram (with KDE overlay) per index, drawn side by side
panels = [
    ('Exposure_Index', "Exposure Index"),
    ('Attention_Index', "Attention Index"),
    ('Academic_Index', "Academic Performance Index"),
]

fig, axes = plt.subplots(1, 3, figsize=(15, 4))
for ax, (column, panel_title) in zip(axes, panels):
    sns.histplot(df[column], kde=True, ax=ax)
    ax.set_title(panel_title)

plt.tight_layout()
plt.show()

In [None]:
#correlation analysis
# Pairwise correlation matrix of the three indices, shown as an annotated heatmap
index_columns = ['Exposure_Index', 'Attention_Index', 'Academic_Index']
corr = df[index_columns].corr()

ax = sns.heatmap(corr, annot=True, cmap="coolwarm")
ax.set_title("Correlation Between Exposure, Attention, and Academic Performance")
plt.show()

corr

In [None]:
#mediation regression
# (result name, predictor columns, outcome column) for each OLS fit
mediation_specs = [
    ("model_a", ['Exposure_Index'], 'Attention_Index'),                    # exposure -> attention
    ("model_b", ['Exposure_Index'], 'Academic_Index'),                     # exposure -> academic
    ("model_c", ['Attention_Index'], 'Academic_Index'),                    # attention -> academic
    ("model_d", ['Exposure_Index', 'Attention_Index'], 'Academic_Index'),  # exposure + attention -> academic
]

fitted_models = {}
for result_name, predictors, outcome in mediation_specs:
    # Intercept column is added explicitly; sm.OLS does not add one itself
    X = sm.add_constant(df[predictors])
    y = df[outcome]
    fitted_models[result_name] = sm.OLS(y, X).fit()
    print(fitted_models[result_name].summary())

# Expose the fits under the names used by the summary table later on
model_a = fitted_models["model_a"]
model_b = fitted_models["model_b"]
model_c = fitted_models["model_c"]
model_d = fitted_models["model_d"]

In [None]:
#adding control variables (NEW)

# Encode education level numerically. Labels are whitespace-trimmed before the
# lookup so a stray space in a raw answer does not silently become NaN;
# genuinely missing answers are left as NaN.
education_codes = {
    'SMA/SMK': 0,
    'Mahasiswa': 1
}
education_raw = df['Jenjang Pendidikan']
education_clean = education_raw.where(
    education_raw.isna(),
    education_raw.astype(str).str.strip()
)
df['Education_Num'] = education_clean.map(education_codes)

# A label outside the mapping would turn into NaN and propagate into the
# regression below and into every later model that uses Education_Num, so stop
# here and name the offending values instead.
unmapped_labels = sorted(set(education_clean.dropna()) - set(education_codes))
if unmapped_labels:
    raise ValueError(f"Unmapped 'Jenjang Pendidikan' values: {unmapped_labels}")

# Mediation model with age ('Usia') and education level as additional regressors
X = sm.add_constant(
    df[['Exposure_Index', 'Attention_Index', 'Usia', 'Education_Num']]
)
y = df['Academic_Index']

model_control = sm.OLS(y, X).fit()
print(model_control.summary())

In [None]:
#standardize (z-score) robustness check (NEW)
# StandardScaler comes from the imports cell at the top of the notebook, so a
# fresh "Restart & Run All" does not depend on an import buried mid-notebook.
index_columns = ['Exposure_Index', 'Attention_Index', 'Academic_Index']
z_columns = ['Exposure_z', 'Attention_z', 'Academic_z']

# z-score the three indices so the regression coefficients are on a common scale
scaler = StandardScaler()
df[z_columns] = scaler.fit_transform(df[index_columns])

# Same specification as the uncontrolled mediation model, on standardized variables
X = sm.add_constant(df[['Exposure_z', 'Attention_z']])
y = df['Academic_z']

model_standardized = sm.OLS(y, X).fit()
print(model_standardized.summary())

In [None]:
#regression visuals (NEW)
# Partial-regression plots for the standardized model fitted in the previous
# cell (model_standardized): one panel per predictor, each showing that
# predictor against Academic_z with the other regressors partialled out.
# This cell used to repeat the scaler + OLS fit of the previous cell verbatim
# and drew nothing; the redundant refit is gone and the figure is produced.
fig = plt.figure(figsize=(10, 4))
sm.graphics.plot_partregress_grid(
    model_standardized,
    exog_idx=['Exposure_z', 'Attention_z'],  # skip the constant term
    grid=(1, 2),                             # two panels side by side
    fig=fig
)
plt.show()

In [None]:
#cronbach’s alpha (reliability analysis)
def cronbach_alpha(df_items):
    """Return Cronbach's alpha for a set of questionnaire items.

    Parameters
    ----------
    df_items : pandas.DataFrame
        One column per item, one row per respondent.

    Returns
    -------
    float
        ``k / (k - 1) * (1 - sum(item variances) / variance(total score))``
        with sample variances (``ddof=1``), computed on the respondents who
        answered every item. ``nan`` when fewer than two complete responses
        remain or when the total score has zero variance.

    Raises
    ------
    ValueError
        If fewer than two item columns are supplied (alpha is undefined).
    """
    # Listwise deletion: a single missing answer would otherwise turn every
    # variance, and therefore alpha itself, into NaN.
    items = df_items.dropna().to_numpy(dtype=float)

    n_items = items.shape[1]
    if n_items < 2:
        raise ValueError("Cronbach's alpha requires at least two items")
    if items.shape[0] < 2:
        # A sample variance needs at least two observations
        return float("nan")

    item_variances = items.var(axis=0, ddof=1)
    total_variance = items.sum(axis=1).var(ddof=1)
    if total_variance == 0:
        # Every respondent has the same total score: the ratio is undefined
        return float("nan")

    return (n_items / (n_items - 1)) * (1 - item_variances.sum() / total_variance)

# Report the reliability coefficient of each construct's item set
alpha_inputs = (
    ("Exposure", exposure_cols),
    ("Attention", attention_cols),
    ("Academic", academic_cols),
)
for construct_label, item_cols in alpha_inputs:
    print(f"{construct_label} α:", cronbach_alpha(df[item_cols]))

In [None]:
# Persist the day-2 dataset (items, indices and derived columns) as CSV, index omitted
day2_path = "../data/day2.csv"
df.to_csv(day2_path, index=False)

In [None]:
#final outputs
# R² of every explanatory model fitted above, keyed by its display label
labelled_models = {
    'Exposure → Attention': model_a,
    'Exposure → Academic': model_b,
    'Attention → Academic': model_c,
    'Mediation (no controls)': model_d,
    'Mediation (with controls)': model_control,
}

summary_table = pd.DataFrame({
    'Model': list(labelled_models),
    'R_squared': [fit.rsquared for fit in labelled_models.values()],
})

summary_table

In [None]:
# Write the model comparison table (summary_table) to CSV without the index column
summary_table.to_csv("../data/model_summary_table.csv", index=False)

In [None]:
# day 3 - predictive modeling & evaluation

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import (
    r2_score,
    mean_squared_error,
    accuracy_score,
    classification_report,
    confusion_matrix
)

In [None]:
# Predictor columns used by the predictive models in this section
features = ["Exposure_Index", "Attention_Index", "Usia", "Education_Num"]

#continuous target (regression)
target_reg = "Academic_Index"

X, y = df[features], df[target_reg]

In [None]:
# Hold out 25% of the rows as a test set; the fixed seed makes the split repeatable
split_options = {"test_size": 0.25, "random_state": 42}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [None]:
#regression model
# Candidate regressors, keyed by the label shown in the results table
reg_models = {
    "Linear Regression": LinearRegression(),
    "Random Forest Regressor": RandomForestRegressor(n_estimators=200, random_state=42),
}

reg_results = []

for name, model in reg_models.items():
    # Fit on the training split, score on the held-out split
    model.fit(X_train, y_train)
    preds = model.predict(X_test)

    test_r2 = r2_score(y_test, preds)
    test_rmse = np.sqrt(mean_squared_error(y_test, preds))
    reg_results.append({"Model": name, "Test_R2": test_r2, "RMSE": test_rmse})

reg_results_df = pd.DataFrame(reg_results)
print("Regression Model Performance:")
display(reg_results_df)

In [None]:
#classification target

#academic performance risk levels (low / medium / high)
# Academic_Index is cut into three quantile-based bins, labelled in ascending order
level_labels = ["Low", "Medium", "High"]
df["Academic_Level"] = pd.qcut(df["Academic_Index"], q=3, labels=level_labels)

X_cls, y_cls = df[features], df["Academic_Level"]

# Stratified on the level label, with the same test share and seed as the regression split
Xc_train, Xc_test, yc_train, yc_test = train_test_split(
    X_cls, y_cls, test_size=0.25, random_state=42, stratify=y_cls
)

In [None]:
#classification model
# `multi_class` is deprecated in recent scikit-learn releases and triggers a
# FutureWarning; with the default lbfgs solver a multi-class target is already
# fitted as a multinomial model, so the argument is simply dropped.
cls_models = {
    "Logistic Regression": LogisticRegression(
        max_iter=1000
    ),
    "Random Forest Classifier": RandomForestClassifier(
        n_estimators=300,
        random_state=42
    )
}

# Report classes in their natural order; without an explicit `labels` argument
# the report and the confusion matrix are sorted alphabetically (High, Low, Medium).
level_order = ["Low", "Medium", "High"]

cls_results = []

for name, model in cls_models.items():
    # Fit on the stratified training split, evaluate on the held-out split
    model.fit(Xc_train, yc_train)
    preds = model.predict(Xc_test)

    cls_results.append({
        "Model": name,
        "Accuracy": accuracy_score(yc_test, preds)
    })

    print(f"\n{name} — Classification Report")
    print(classification_report(yc_test, preds, labels=level_order))
    # Rows = true level, columns = predicted level, both in `level_order`
    print("Confusion Matrix:")
    print(confusion_matrix(yc_test, preds, labels=level_order))

cls_results_df = pd.DataFrame(cls_results)
print("\nClassification Model Performance:")
display(cls_results_df)

In [None]:
#feature importance
rf_classifier = cls_models["Random Forest Classifier"]

# Pair every predictor with the fitted forest's importance score, highest first
importance_records = list(zip(features, rf_classifier.feature_importances_))
feature_importance_df = pd.DataFrame(importance_records, columns=["Feature", "Importance"])
feature_importance_df = feature_importance_df.sort_values(by="Importance", ascending=False)

print("Random Forest Feature Importance:")
display(feature_importance_df)

In [None]:
# Write each day-3 result table to its CSV file (index column omitted)
day3_exports = {
    "../data/day3_regression_results.csv": reg_results_df,
    "../data/day3_classification_results.csv": cls_results_df,
    "../data/day3_feature_importance.csv": feature_importance_df,
}
for csv_path, table in day3_exports.items():
    table.to_csv(csv_path, index=False)

In [None]:
# day 4 - graphs

In [None]:
# Reload the saved day-2 dataset and report its dimensions and headers
day2_path = "../data/day2.csv"
day2 = pd.read_csv(day2_path)
print("Shape:", day2.shape)
print("Columns:", day2.columns.tolist())

In [None]:
def find_col(df, keywords):
    """Return the first column of ``df`` whose lower-cased name contains every keyword.

    Keywords are compared exactly as given (only the column name is lower-cased).
    Returns ``None`` when no column matches.
    """
    matching = (
        col for col in df.columns
        if all(keyword in col.lower() for keyword in keywords)
    )
    return next(matching, None)

# Locate the three index columns by keyword (first matching column wins)
exposure_col = find_col(day2, ["exposure"])
attention_col = find_col(day2, ["attention"])
academic_col = find_col(day2, ["academic"])

print("Detected index columns:")
print("Exposure:", exposure_col)
print("Attention:", attention_col)
print("Academic:", academic_col)

# find_col returns None when nothing matches, and rename() silently ignores a
# None key, so a failed detection would go unnoticed at this point. Fail here
# instead, in the same way the feature-importance cell does for its columns.
detected_sources = {
    "Exposure_Index": exposure_col,
    "Attention_Index": attention_col,
    "Academic_Index": academic_col,
}
undetected = [target for target, source in detected_sources.items() if source is None]
if undetected:
    raise ValueError(f"could not detect index columns for: {undetected}")

# Normalise the detected headers to the canonical index names used by the plots
day2 = day2.rename(columns={
    exposure_col: "Exposure_Index",
    attention_col: "Attention_Index",
    academic_col: "Academic_Index"
})

In [None]:
# Make sure the output folder exists before the first figure is written;
# savefig raises FileNotFoundError when the target directory is missing.
os.makedirs("../figures", exist_ok=True)

# Histogram of the exposure index, saved at print resolution
plt.figure()
plt.hist(
    day2["Exposure_Index"],
    bins=10,
    edgecolor="black",
    linewidth=0.8
)
plt.xlabel("Exposure Index")
plt.ylabel("Frequency")
plt.title("Distribution of Short-Form Video Exposure")
plt.tight_layout()
plt.savefig("../figures/fig_exposure_distribution.png", dpi=300)
plt.show()

In [None]:
# Histogram of the attention index (axis label notes that higher values mean lower attention)
fig, ax = plt.subplots()
ax.hist(day2["Attention_Index"], bins=10, edgecolor="black", linewidth=0.8)
ax.set_xlabel("Attention Span Index (higher = lower attention)")
ax.set_ylabel("Frequency")
ax.set_title("Distribution of Attention Span (Inverse Index)")
fig.tight_layout()
fig.savefig("../figures/fig_attention_distribution.png", dpi=300)
plt.show()

In [None]:
# Histogram of the academic performance index
fig, ax = plt.subplots()
ax.hist(day2["Academic_Index"], bins=10, edgecolor="black", linewidth=0.8)
ax.set_xlabel("Academic Performance Index")
ax.set_ylabel("Frequency")
ax.set_title("Distribution of Academic Performance")
fig.tight_layout()
fig.savefig("../figures/fig_academic_distribution.png", dpi=300)
plt.show()

In [None]:
# Scatter of exposure vs. attention index with a fitted regression line
point_style = {"alpha": 0.6}
fit_line_style = {"color": "black", "linewidth": 1}

fig, ax = plt.subplots()
sns.regplot(
    x=day2["Exposure_Index"],
    y=day2["Attention_Index"],
    scatter_kws=point_style,
    line_kws=fit_line_style,
    ax=ax
)
ax.set_xlabel("Exposure Index")
ax.set_ylabel("Attention Span Index (higher = lower attention)")
ax.set_title("Relationship Between Short-Form Video Exposure and Attention Span")
fig.tight_layout()
fig.savefig("../figures/fig_exposure_attention.png", dpi=300)
plt.show()

In [None]:
# Scatter of attention vs. academic index with a fitted regression line
point_style = {"alpha": 0.6}
fit_line_style = {"color": "black", "linewidth": 1}

fig, ax = plt.subplots()
sns.regplot(
    x=day2["Attention_Index"],
    y=day2["Academic_Index"],
    scatter_kws=point_style,
    line_kws=fit_line_style,
    ax=ax
)
ax.set_xlabel("Attention Span Index (higher = lower attention)")
ax.set_ylabel("Academic Performance Index")
ax.set_title("Relationship Between Attention Span and Academic Performance")
fig.tight_layout()
fig.savefig("../figures/fig_attention_academic.png", dpi=300)
plt.show()

In [None]:
# Reload the feature-importance table written by the day-3 section
feature_importance_path = "../data/day3_feature_importance.csv"
feat_imp = pd.read_csv(feature_importance_path)

print("Feature importance columns:", feat_imp.columns.tolist())

def find_feat_col(df):
    """Return the first column whose lower-cased name contains "feature" or "variable".

    Returns ``None`` when no column qualifies.
    """
    markers = ("feature", "variable")
    for col in df.columns:
        lowered = col.lower()
        if any(marker in lowered for marker in markers):
            return col
    return None

def find_imp_col(df):
    """Return the first column whose lower-cased name contains "importance", "score" or "gain".

    Returns ``None`` when no column qualifies.
    """
    markers = ("importance", "score", "gain")
    candidates = (
        col for col in df.columns
        if any(marker in col.lower() for marker in markers)
    )
    return next(candidates, None)

# Detect which columns hold the feature names and their scores
feature_col, importance_col = find_feat_col(feat_imp), find_imp_col(feat_imp)

if not (feature_col and importance_col):
    raise ValueError("could not detect feature/importance columns")

# Normalise the headers, then order ascending so the largest bar ends up on top
column_aliases = {feature_col: "Feature", importance_col: "Importance"}
feat_imp = feat_imp.rename(columns=column_aliases)
feat_imp = feat_imp.sort_values(by="Importance", ascending=True)

fig, ax = plt.subplots()
ax.barh(feat_imp["Feature"], feat_imp["Importance"], edgecolor="black")
ax.set_xlabel("Feature Importance Score")
ax.set_title("Feature Importance for Academic Performance Prediction")
fig.tight_layout()
fig.savefig("../figures/fig_feature_importance.png", dpi=300)
plt.show()


In [None]:
#optional
# Scatter of exposure vs. academic index with a fitted regression line
point_style = {"alpha": 0.6}
fit_line_style = {"color": "black", "linewidth": 1}

fig, ax = plt.subplots()
sns.regplot(
    x=day2["Exposure_Index"],
    y=day2["Academic_Index"],
    scatter_kws=point_style,
    line_kws=fit_line_style,
    ax=ax
)
ax.set_xlabel("Exposure Index")
ax.set_ylabel("Academic Performance Index")
ax.set_title("Direct Relationship Between Exposure and Academic Performance")
fig.tight_layout()
fig.savefig("../figures/fig_exposure_academic.png", dpi=300)
plt.show()