In [None]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

# 1. Configure your data directory
# Root folder expected to contain one subfolder per class ('cc' and 'cd').
base_dir = 'output_lexical_metrics'

# 2. Verify directory structure
# Fail fast, before any loading, if the root or a class subfolder is absent.
print(f"Checking data directory: {base_dir}")
if not os.path.isdir(base_dir):
    raise FileNotFoundError(f"Data directory '{base_dir}' does not exist.")
for subgroup in ('cc', 'cd'):
    subgroup_path = os.path.join(base_dir, subgroup)
    is_present = os.path.isdir(subgroup_path)
    status = 'FOUND' if is_present else 'MISSING'
    # The status line is printed first so the log shows which folder failed.
    print(f" - Subfolder '{subgroup_path}': {status}")
    if not is_present:
        raise FileNotFoundError(f"Expected subfolder '{subgroup_path}' not found.")

# 3. Load CSV metrics and labels
# Each CSV contributes its first data row as one sample; the sample's class
# is the name of the subfolder ('cc' or 'cd') the file was found in.
print("\nLoading CSV files...")
data_rows, labels = [], []
for label in ['cc', 'cd']:
    folder = os.path.join(base_dir, label)
    # os.listdir() returns entries in arbitrary, filesystem-dependent order.
    # Sorting pins the row order so the seeded train/test split further down
    # picks the same samples on every machine and every run.
    files = sorted(f for f in os.listdir(folder) if f.lower().endswith('.csv'))
    print(f" * {len(files)} files in '{label}'")
    for fname in files:
        csv_path = os.path.join(folder, fname)
        file_df = pd.read_csv(csv_path)
        if file_df.empty:
            # Name the offending file instead of letting .iloc[0] raise an
            # opaque "positional indexer is out-of-bounds" IndexError.
            raise ValueError(f"CSV file '{csv_path}' contains no data rows.")
        data_rows.append(file_df.iloc[0])
        labels.append(label)

# 4. Build DataFrame
# Every row was taken with .iloc[0], so each Series is named 0 and the stacked
# frame would carry the same index label on every row. Reset to a unique
# 0..n-1 index so any label-based alignment downstream stays well-defined.
df = pd.DataFrame(data_rows).reset_index(drop=True)
df['label'] = labels
# Count once; .get(..., 0) reports an empty class as 0 instead of raising
# KeyError when one of the folders contained no CSV files.
class_counts = df['label'].value_counts()
print(f"\nTotal samples: {len(df)} (cc={class_counts.get('cc', 0)}, cd={class_counts.get('cd', 0)})")

# 5. Separate features/target and check for NaNs
X = df.drop(columns=['label'])
# Binary target: 'cc' -> 0, 'cd' -> 1.
y = df['label'].map({'cc': 0, 'cd': 1})

print("\nMissing values per feature:")
missing = X.isnull().sum()
print(missing[missing > 0] if missing.any() else "No missing values!")

# 6. Train/test split
# The split happens BEFORE imputation so that the imputation statistics are
# learned from training rows only; fitting the imputer on the full data would
# leak test-set information into the training features.
print("\nSplitting data (80% train, 20% test)...")
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)
print(f" Training samples: {X_train.shape[0]}")
print(f" Test samples:     {X_test.shape[0]}")

# 7. Impute missing values with training-set means
# X itself is left un-imputed; only the train/test partitions are filled.
if missing.any():
    print("\nImputing missing values using column means...")
    imputer = SimpleImputer(strategy='mean')
    # Keep each partition's original index so rows stay aligned with y_train
    # and y_test after the round-trip through a NumPy array.
    X_train = pd.DataFrame(
        imputer.fit_transform(X_train), columns=X_train.columns, index=X_train.index
    )
    X_test = pd.DataFrame(
        imputer.transform(X_test), columns=X_test.columns, index=X_test.index
    )
    remaining = int(X_train.isnull().sum().sum() + X_test.isnull().sum().sum())
    # Report the count directly; the previous "Any missing now: True" wording
    # printed True when nothing was missing, which read as the opposite.
    print("Imputation complete. Missing values remaining:", remaining)

# 8. Initialize and train Random Forest
print("\nInitializing RandomForestClassifier...")
# Hyper-parameters gathered in one place so they are easy to inspect or tweak.
rf_params = {
    'n_estimators': 100,
    'max_depth': None,
    'min_samples_split': 2,
    'random_state': 42,  # fixed seed for repeatable forests
    'n_jobs': -1,
}
rf = RandomForestClassifier(**rf_params)

print("Starting model training...")
rf.fit(X_train, y_train)
print("Training finished.")

# 9. Evaluate
# Score the fitted forest on the held-out test partition.
print("\nEvaluating on test set...")
y_pred = rf.predict(X_test)

report_text = classification_report(y_test, y_pred)
print("\nClassification Report:\n", report_text)

conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", conf_matrix)

# 10. Feature importances (top 10)
# Pair each importance score with its feature name, rank from highest to
# lowest, and keep the first ten entries.
feature_names = X.columns
importances = pd.Series(rf.feature_importances_, index=feature_names)
ranked_importances = importances.sort_values(ascending=False)
top_features = ranked_importances.iloc[:10]
print("\nTop 10 feature importances:")
print(top_features)

Checking data directory: output_lexical_metrics
 - Subfolder 'output_lexical_metrics/cc': FOUND
 - Subfolder 'output_lexical_metrics/cd': FOUND

Loading CSV files...
 * 54 files in 'cc'
 * 54 files in 'cd'

Total samples: 108 (cc=54, cd=54)

Missing values per feature:
all_freq_raw_mean                 1
all_freq_log_mean                 1
all_cd_raw_mean                   1
all_cd_log_mean                   1
all_concreteness_m_mean           1
all_concreteness_sd_mean          1
all_mean_cos_mean                 1
all_SemD_mean                     1
all_BNC_wordcount_mean            1
all_BNC_contexts_mean             1
all_BNC_freq_mean                 1
all_lg_BNC_freq_mean              1
all_phonemes_mean                 1
content_freq_raw_mean             1
content_freq_log_mean             1
content_cd_raw_mean               1
content_cd_log_mean               1
content_concreteness_m_mean       1
content_concreteness_sd_mean      1
content_mean_cos_mean             1
content_Se