### Imports

In [None]:
import sys
import os
sys.path.append(os.path.abspath('../../src'))

from helper_functions.path_resolver import DynamicPathResolver
from model_training.meta import *

### Paths

In [None]:
# Build the project path resolver. `marker` presumably names the file that
# identifies the project root — confirm against DynamicPathResolver.
dpr = DynamicPathResolver(marker="README.md")

# Preprocessed train/test inputs (attribute names suggest CSV files). The
# trailing comments list the alternative datasets that can be swapped in by hand.
train_preprocessed = dpr.path.data.preprocessed.data_meta.english_curated_train_meta_csv #own_train_base_meta_csv # german_curated_train_meta_csv
test_preprocessed  = dpr.path.data.preprocessed.data_meta.english_curated_test_meta_csv #own_test_base_meta_csv # german_curated_test_meta_csv

# models_folder: where the vectorizer and models are loaded from / saved to below.
# output_dir: passed to evaluate_model as output_folder and used as the parent
# of the feature-distribution directory.
models_folder = dpr.path.models.meta._path
output_dir = dpr.path.models.meta.results._path

### Prepare data

In [None]:
# Seed with 42 before any splitting or training below. set_seed is defined
# outside this notebook; which RNGs it seeds is not visible here.
set_seed(42)

##### Load data

In [None]:
# Load the train and test inputs selected in the paths cell. The cells below
# treat both results as DataFrames with at least 'label' and 'body' columns.
df_train, df_test = load_preprocessed_data(train_preprocessed, test_preprocessed)

In [None]:
# Report how many rows each label has, first for the training frame and then
# for the test frame.
for split_name, frame in (("Train", df_train), ("Test", df_test)):
    print(f"\n=== Updated Class Distribution ({split_name}) ===")
    print(frame['label'].value_counts(), "\n")


##### Remove extra columns

In [None]:
# Remove the two extra feature columns from the training frame.
# errors='ignore' keeps this from raising when a column is already absent.
df_train = df_train.drop(columns=['readability_score', 'word_count'], errors='ignore')

In [None]:
# Remove the same two extra feature columns from the test frame so both
# frames expose the same columns; absent columns are ignored.
df_test = df_test.drop(columns=['readability_score', 'word_count'], errors='ignore')

In [None]:
# Summary statistics of the training frame after the column drop.
# NOTE(review): the banner says "own dataset", but the paths cell currently
# selects the english_curated input — confirm the label is still accurate.
print("=== df_train (own dataset) describe ===")
print(df_train.describe())

##### Train, Val Split

In [None]:
# Separate the features (every column except 'label') from the target column.
X = df_train.drop(columns=['label'])
y = df_train['label']

In [None]:
# Hold out 20% of the training data for validation, stratified on the label
# and with a fixed random_state so the split is reproducible.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
# Test labels taken straight from the test frame; `y_test` is rebound later by
# the preprocess_dataframe call on df_test.
y_test = df_test['label']

##### Load vectorizer

In [None]:
# Load the vectorizer previously saved in the models folder.
# NOTE(review): `fitted_vectorizer` is rebound by the training-split
# preprocessing cell below (fit_vectorizer=True), so the object loaded here is
# replaced before the validation/test cells use it — confirm that is intended.
vectorizer_filename = "tfidf_vectorizer.pkl"
vectorizer_path = os.path.join(models_folder, vectorizer_filename)
fitted_vectorizer = joblib.load(vectorizer_path)

##### Unscaled preprocessed

In [None]:
# Preprocess the training split without scaling. Features and labels are
# re-joined into one frame because preprocess_dataframe takes a single frame.
# With vectorizer=None and fit_vectorizer=True the call returns a third value,
# which rebinds `fitted_vectorizer` (replacing the one loaded from disk);
# `y_train` is likewise rebound to the returned labels.
# preprocess_dataframe is defined outside this notebook — presumably it fits a
# new vectorizer on this split; confirm.
X_train_combined, y_train, fitted_vectorizer = preprocess_dataframe(
    pd.concat([X_train, y_train], axis=1), 
    vectorizer=None, 
    fit_vectorizer=True, 
    scale=False
    )

In [None]:
# Preprocess the validation split without scaling, reusing the vectorizer
# returned for the training split (fit_vectorizer=False). `y_val` is rebound
# to the labels returned by the call.
X_val_combined, y_val = preprocess_dataframe(
    pd.concat([X_val, y_val], axis=1), 
    vectorizer=fitted_vectorizer, 
    fit_vectorizer=False, 
    scale=False
    )

In [None]:
# Preprocess the test frame without scaling, reusing the training-split
# vectorizer. df_test still contains 'label', so it is passed as-is; `y_test`
# is rebound to the labels returned by the call.
X_test_combined, y_test = preprocess_dataframe(
    df_test, 
    vectorizer=fitted_vectorizer, 
    fit_vectorizer=False, 
    scale=False
    )

In [None]:
# Shapes of the unscaled feature matrices for the three splits.
print(f"Final Train Shape: {X_train_combined.shape}")
print(f"Final Val Shape: {X_val_combined.shape}")
print(f"Final Test Shape: {X_test_combined.shape}")

##### Scaled preprocessed

In [None]:
# Preprocess the training split again, this time with scale=True.
# NOTE(review): the third return value is discarded (`_`), while the scaled
# validation/test cells pass the `fitted_vectorizer` returned by the unscaled
# call — confirm both calls produce an equivalent vectorizer.
# How scaling is fitted and applied is internal to preprocess_dataframe and
# not visible here.
X_train_combined_scaled, y_train_scaled, _ = preprocess_dataframe(
    pd.concat([X_train, y_train], axis=1), 
    vectorizer=None, 
    fit_vectorizer=True, 
    scale=True
    )

In [None]:
# Scaled counterpart of the validation preprocessing; reuses `fitted_vectorizer`.
# NOTE(review): no scaler object is passed in, so whether validation data is
# scaled with training statistics cannot be told from this cell — confirm.
X_val_combined_scaled, y_val_scaled = preprocess_dataframe(
    pd.concat([X_val, y_val], axis=1), 
    vectorizer=fitted_vectorizer, 
    fit_vectorizer=False, 
    scale=True
    )

In [None]:
# Scaled counterpart of the test preprocessing; reuses `fitted_vectorizer`.
# NOTE(review): as with validation, no scaler object is passed in — confirm
# how preprocess_dataframe scales data it was not fitted on.
X_test_combined_scaled, y_test_scaled = preprocess_dataframe(
    df_test, 
    vectorizer=fitted_vectorizer, 
    fit_vectorizer=False, 
    scale=True
    )

In [None]:
# Shapes of the scaled feature matrices. The printed labels are the same as
# for the unscaled matrices, so the two outputs are told apart only by cell.
print(f"Final Train Shape: {X_train_combined_scaled.shape}")
print(f"Final Val Shape: {X_val_combined_scaled.shape}")
print(f"Final Test Shape: {X_test_combined_scaled.shape}")

##### Save vectorizer

In [None]:
#joblib.dump(fitted_vectorizer, os.path.join(models_folder, "tfidf_vectorizer.pkl"))

### Train

##### Define models for Grid Search

In [None]:
# Names of the individual models; these match the keys of the `models` dict
# and are passed to create_and_train_ensemble when the ensemble is built.
model_names = ["naive_bayes", "log_regression", "decision_tree", "random_forest"]

In [None]:
# Hyper-parameter grids, one per candidate model. Each grid is handed to
# train_and_save_model together with its estimator.
naive_bayes_grid = {
    'alpha': [0.01, 0.1, 0.5, 1, 2, 5],
}

# 'max_iter' is also part of the grid, in addition to the constructor value
# given to LogisticRegression below.
log_regression_grid = {
    'C': [0.01, 0.1, 1, 10, 100],
    'solver': ['liblinear', 'saga'],
    'max_iter': [200, 500, 1000],
}

decision_tree_grid = {
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'criterion': ['gini', 'entropy'],
}

random_forest_grid = {
    'n_estimators': [100, 200, 500],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}

# Model name -> (unfitted estimator, parameter grid). Insertion order is the
# order in which the training loop visits the models.
models = {
    "naive_bayes": (MultinomialNB(), naive_bayes_grid),
    "log_regression": (LogisticRegression(max_iter=500), log_regression_grid),
    "decision_tree": (DecisionTreeClassifier(), decision_tree_grid),
    "random_forest": (RandomForestClassifier(), random_forest_grid),
}

##### Train models

In [None]:
# Switch for the training cell below: when False, training is skipped and the
# evaluation cells rely on models already saved in models_folder.
train_models = False

In [None]:
# Train every candidate model and keep the results by name.
# train_and_save_model is defined outside this notebook; from its arguments it
# presumably searches `param_grid` and stores the result under models_folder —
# confirm against its definition.
if train_models:
    trained_models = {}

    # Models fit on the scaled feature matrix; every other model is fit on
    # the unscaled one.
    scaled_model_names = {"log_regression"}

    for model_name, (model, param_grid) in models.items():
        scale = model_name in scaled_model_names

        # Pair each feature matrix with the labels returned by the same
        # preprocess_dataframe call, mirroring how the evaluation cell pairs
        # X_test_combined_scaled with y_test_scaled.
        if scale:
            X_fit, y_fit = X_train_combined_scaled, y_train_scaled
        else:
            X_fit, y_fit = X_train_combined, y_train

        # NOTE(review): scale_features=scale is passed although X_fit is
        # already scaled when scale is True — confirm that
        # train_and_save_model does not scale a second time.
        trained_models[model_name] = train_and_save_model(
            model,
            param_grid,
            X_fit,
            y_fit,
            model_name,
            models_folder,
            scale_features=scale,
        )

##### Train Ensemble

In [None]:
# Switch for the ensemble-training cell below: when False, the ensemble is not
# rebuilt and the evaluation cell loads "ensemble_model" from models_folder.
train_ensemble = False

In [None]:
# Build the ensemble from the models named in `model_names`, fit it on the
# unscaled training features and save it as "ensemble_model".
# create_and_train_ensemble is defined outside this notebook; presumably it
# loads the saved base models from models_folder — confirm.
if train_ensemble:
    ensemble_model = create_and_train_ensemble(models_folder, model_names)
    # NOTE(review): the training loop feeds "log_regression" scaled features,
    # yet the ensemble is fit on the unscaled matrix — confirm this is intended.
    ensemble_model.fit(X_train_combined, y_train)
    save_model(ensemble_model, models_folder, "ensemble_model")

### Evaluate

In [None]:
# Choose which split the evaluation cells use. The trailing comments list the
# validation/test alternatives; switch x and y together so features and labels
# come from the same split.
eval_scaled_x = X_test_combined_scaled   # X_val_combined_scaled # X_test_combined_scaled 
eval_scaled_y = y_test_scaled            # y_val_scaled          # y_test_scaled
 
# Unscaled counterpart, used for the remaining models and the ensemble.
eval_unscaled_x = X_test_combined        # X_val_combined # X_test_combined
eval_unscaled_y = y_test                 # y_val          # y_test

##### Evaluate model (scaled)

In [None]:
# Evaluate the saved logistic-regression model on the scaled evaluation split.
# evaluate_model returns the predictions, the accuracy, a report frame and the
# confusion matrix (per the names they are unpacked into).
scaled_model_name = 'log_regression'
model = load_model(models_folder, scaled_model_name)
y_pred, acc, report_df, conf_matrix = evaluate_model(
    model,
    eval_scaled_x,
    eval_scaled_y,
    model_name=scaled_model_name,
    output_folder=output_dir,
)

##### Evaluate models (unscaled)

In [None]:
model_names = ["naive_bayes", "decision_tree", "random_forest"]

In [None]:
for model_name in model_names:
    model = load_model(models_folder, model_name)
    evaluate_model(model, eval_unscaled_x, eval_unscaled_y, model_name=model_name, output_folder=output_dir)

##### Evaluate ensemble

In [None]:
# Evaluate the saved ensemble on the unscaled evaluation split. This rebinds
# y_pred, acc, report_df and conf_matrix from the scaled evaluation cell.
ensemble_name = "ensemble_model"
ensemble_model = load_model(models_folder, ensemble_name)
y_pred, acc, report_df, conf_matrix = evaluate_model(
    ensemble_model,
    eval_unscaled_x,
    eval_unscaled_y,
    model_name=ensemble_name,
    output_folder=output_dir,
)

### Feature Distribution

In [None]:
# Test-frame columns other than the target ('label') and the text ('body');
# errors='ignore' tolerates either column being absent.
X_test_numerical_full = df_test.drop(columns=['label', 'body'], errors='ignore')

In [None]:
# Inputs for the validation-vs-test comparison: the validation features
# without the text column, and the vectorized 'body' text of both splits.
val_numerical = X_val.drop(columns=['body'])
val_text_matrix = fitted_vectorizer.transform(X_val['body'])
test_text_matrix = fitted_vectorizer.transform(df_test['body'])

analyze_feature_distributions(
    val_numerical,
    X_test_numerical_full,
    val_text_matrix,
    test_text_matrix,
)

# Save the distributions of the non-text features into a dedicated sub-folder
# of the results directory, creating it on first use.
feature_distribution_dir = os.path.join(output_dir, 'feature_distribution')
os.makedirs(feature_distribution_dir, exist_ok=True)
save_feature_distributions(
    X_val.drop(columns=['body'], errors='ignore'),
    X_test_numerical_full,
    feature_distribution_dir,
)