In [None]:
import os
import numpy as np
import pandas as pd
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Load and merge data
# Each CSV under <base_dir>/cc and <base_dir>/cd contributes its first data row as
# one sample; the file stem is kept as the join key against the MMSE table.
base_dir = "output_lexical_metrics"
mmse_file = "mmse_input.csv"
lexical_data = []
filenames = []

for folder in ['cc', 'cd']:
    path = os.path.join(base_dir, folder)
    # os.listdir returns entries in arbitrary, filesystem-dependent order. Sort so
    # the row order -- and anything downstream that depends on it, such as a seeded
    # random split -- is reproducible from one machine/run to the next.
    for fname in sorted(os.listdir(path)):
        if not fname.endswith(".csv"):
            continue
        csv_path = os.path.join(path, fname)
        df = pd.read_csv(csv_path)
        if df.empty:
            # A header-only file has no first row; skip it visibly instead of
            # failing with a bare IndexError from df.iloc[0].
            print(f"Skipping {csv_path}: no data rows")
            continue
        lexical_data.append(df.iloc[0])
        filenames.append(os.path.splitext(fname)[0])

lex_df = pd.DataFrame(lexical_data)
lex_df['filename'] = filenames

# Strip the extension from the MMSE table's filenames so they match the stems above.
mmse_df = pd.read_csv(mmse_file)
mmse_df['filename'] = mmse_df['filename'].apply(lambda x: os.path.splitext(x)[0])

# Left join: keep every lexical sample, even when no MMSE score is available for it.
merged_df = pd.merge(lex_df, mmse_df, on='filename', how='left')
print("After merge:", merged_df.shape, ", missing MMSE:", merged_df['mmse_score'].isna().sum())

# Preprocessing
# Rows without an MMSE score have no ground truth. Drop them instead of filling the
# target with the global mean: a fabricated label distorts the fit and, if the row
# lands in the test set, the reported metrics as well.
labelled_df = merged_df.dropna(subset=['mmse_score'])
X = labelled_df.drop(columns=['filename', 'mmse_score'])
y = labelled_df['mmse_score']

# Train/test split
# Split before estimating any statistics so nothing computed from the held-out rows
# leaks into preprocessing.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Feature means used for imputation are learned from the training rows only and
# then applied to every row.
imputer = SimpleImputer(strategy='mean')
imputer.fit(X_train_raw)
X_imputed = pd.DataFrame(imputer.transform(X), columns=X.columns, index=X.index)
X_train = X_imputed.loc[X_train_raw.index]
X_test = X_imputed.loc[X_test_raw.index]

# Standardise with training-set mean/variance; the test set reuses those parameters.
scaler = StandardScaler()
X_train_s = scaler.fit_transform(X_train)
X_test_s = scaler.transform(X_test)

# Ridge regression with cross-validated alpha
# Candidate regularisation strengths: 25 log-spaced values from 1e-3 to 1e3.
alphas = np.logspace(start=-3, stop=3, num=25)
ridge_model = Ridge()

# 5-fold grid search over alpha, scored by negative MSE, using all available cores.
search_settings = {
    'estimator': ridge_model,
    'param_grid': {'alpha': alphas},
    'scoring': 'neg_mean_squared_error',
    'cv': 5,
    'n_jobs': -1,
}
ridge_cv = GridSearchCV(**search_settings)
ridge_cv.fit(X_train_s, y_train)

# Predict on the held-out set with the best estimator found by the search.
best_ridge = ridge_cv.best_estimator_
y_pred = best_ridge.predict(X_test_s)

# Evaluation
# Collect the selected alpha and the held-out metrics, then print one per line.
test_mse = mean_squared_error(y_test, y_pred)
report = [
    ("Best alpha:", ridge_cv.best_params_['alpha']),
    ("R²:", r2_score(y_test, y_pred)),
    ("MAE:", mean_absolute_error(y_test, y_pred)),
    ("RMSE:", np.sqrt(test_mse)),  # RMSE is the square root of the test MSE
]
for label, value in report:
    print(label, value)

After merge: (108, 58) , missing MMSE: 1
Best alpha: 177.82794100389228
R²: 0.12271679681453007
MAE: 4.893224367985052
RMSE: 5.660297041877386
