In [None]:
import os
import pandas as pd
import numpy as np

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LassoCV, Lasso
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

# — Paths —
base_dir = 'output_lexical_metrics'
mmse_path = 'mmse_input.csv'

# Load lexical features: every *.csv under base_dir/<group> contributes its
# first data row, keyed by the file name without the extension.
print("Loading lexical metrics CSVs...")
data_rows, filenames = [], []
for group in ['cc', 'cd']:
    folder = os.path.join(base_dir, group)
    # os.listdir() returns entries in arbitrary, filesystem-dependent order.
    # Sorting pins the row order, and with it the seeded train/test split
    # performed later, so the results are reproducible across machines.
    for fname in sorted(os.listdir(folder)):
        if not fname.endswith('.csv'):
            continue
        df = pd.read_csv(os.path.join(folder, fname))
        if df.empty:
            # A header-only file has no first row; df.iloc[0] would raise
            # IndexError and abort the whole load.
            print(f"Skipping {fname}: no data rows")
            continue
        data_rows.append(df.iloc[0])
        filenames.append(os.path.splitext(fname)[0])  # remove .csv

# Each collected row carries the index label 0 from its own file; reset the
# index so that every sample gets a unique positional label.
lex_df = pd.DataFrame(data_rows).reset_index(drop=True)
lex_df['filename'] = filenames
print(f"Lexical samples: {len(lex_df)}")

# Load MMSE data
mmse_df = pd.read_csv(mmse_path)
# Strip the extension so the keys have the same form as lex_df['filename'].
mmse_df['filename'] = mmse_df['filename'].map(lambda name: os.path.splitext(name)[0])

# Merge on filename; the left join keeps every lexical sample, including
# those without a matching MMSE row.
merged = lex_df.merge(mmse_df, how='left', on='filename')
missing_mmse = merged['mmse_score'].isna().sum()
print(f"\nAfter merge: {merged.shape}, missing MMSE: {missing_mmse}")

# Keep only the rows that actually have an MMSE score.
merged = merged[merged['mmse_score'].notna()]

# Prepare data: every column except the merge key and the target is a feature.
X = merged.drop(columns=['filename', 'mmse_score'])
y = merged['mmse_score']

# Train/test split.
# This happens BEFORE imputation so that the statistics used to fill missing
# values are learned from the training rows only. Fitting the imputer on the
# full data would leak information from the test rows into the training
# features. The split depends only on the number of rows and the seed, so the
# same rows end up in each partition as when splitting an imputed copy.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# A feature with no observed value in the training rows has no mean to impute
# from (SimpleImputer may drop such a column, which would break the mapping
# between columns and coefficients). Remove these features everywhere,
# including from X, so that X.columns keeps matching the model's inputs.
empty_cols = X_train_raw.columns[X_train_raw.isna().all()].tolist()
if empty_cols:
    print(f"Dropping features with no training values: {empty_cols}")
    X = X.drop(columns=empty_cols)
    X_train_raw = X_train_raw.drop(columns=empty_cols)
    X_test_raw = X_test_raw.drop(columns=empty_cols)

# Impute missing values (fit on the training rows, apply to both partitions).
# The original index is preserved so the features stay aligned with y.
imputer = SimpleImputer(strategy='mean')
X_train = pd.DataFrame(
    imputer.fit_transform(X_train_raw), columns=X.columns, index=X_train_raw.index
)
X_test = pd.DataFrame(
    imputer.transform(X_test_raw), columns=X.columns, index=X_test_raw.index
)
# Full imputed matrix, kept under its original name; it is filled with the
# training-set means as well.
X_imp = pd.DataFrame(imputer.transform(X), columns=X.columns, index=X.index)

# Scale (statistics learned from the training partition only)
scaler = StandardScaler()
X_train_s = scaler.fit_transform(X_train)
X_test_s  = scaler.transform(X_test)

# LassoCV: choose the regularisation strength by 5-fold cross-validation over
# a log-spaced grid of 50 alphas between 1e-3 and 1e1.
print("\nTraining LassoCV...")
# One iteration budget shared by model selection and the final fit. Previously
# the final model ran with half the budget used during cross-validation, so it
# could stop earlier than the fits the alpha was selected with. The budget is
# also raised because the solver appears to emit convergence warnings for
# this grid; the data set is small, so the extra iterations are cheap.
lasso_max_iter = 100_000
lasso_cv = LassoCV(
    cv=5,
    alphas=np.logspace(-3, 1, 50),
    random_state=42,
    max_iter=lasso_max_iter,
)
lasso_cv.fit(X_train_s, y_train)
alpha = lasso_cv.alpha_
print(f"Best alpha: {alpha:.5f}")

# Final Lasso model, refit on the whole training partition at the chosen alpha.
lasso = Lasso(alpha=alpha, max_iter=lasso_max_iter)
lasso.fit(X_train_s, y_train)

# Get selected features: the ones whose coefficient was not shrunk to zero
# (a small tolerance guards against numerically tiny values).
nonzero_mask = np.abs(lasso.coef_) > 1e-6
selected_features = X.columns[nonzero_mask].tolist()
print(f"Selected {len(selected_features)} features: {selected_features}")

# Predict on the held-out rows and score the predictions.
y_pred = lasso.predict(X_test_s)
r2, mae = r2_score(y_test, y_pred), mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))  # root of the MSE

# Assemble the report first, then emit it in one go.
report_lines = [
    "\nEvaluation with Lasso:",
    f" R²   = {r2:.4f}",
    f" MAE  = {mae:.2f}",
    f" RMSE = {rmse:.2f}",
]
print("\n".join(report_lines))

Loading lexical metrics CSVs...
Lexical samples: 108

After merge: (108, 58), missing MMSE: 1

Training LassoCV...


  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(


Best alpha: 0.40949
Selected 19 features: ['all_freq_log_mean', 'all_cd_log_mean', 'all_mean_cos_mean', 'all_BNC_wordcount_mean', 'all_BNC_freq_mean', 'all_phonemes_mean', 'content_cd_log_mean', 'noun_freq_raw_mean', 'noun_cd_raw_mean', 'noun_concreteness_sd_mean', 'noun_mean_cos_mean', 'noun_BNC_wordcount_mean', 'noun_phonemes_mean', 'POS_CCONJ_per100', 'POS_AUX_per100', 'POS_ADJ_per100', 'POS_SCONJ_per100', 'POS_ADV_per100', 'POS_X_per100']

Evaluation with Lasso:
 R²   = 0.1462
 MAE  = 5.61
 RMSE = 6.59


  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
