# Baseline Model Development\nThis notebook develops baseline models and establishes performance benchmarks.

In [None]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

from scripts.config import RAW_DATA_DIR, MODEL_CONFIG
from scripts.generate_sample_data import generate_sample_dataset
from scripts.data_loader import load_csv, split_data, build_preprocessor

In [None]:
# Load the credit-scoring CSV, generating a sample dataset first when the
# expected file is not present under RAW_DATA_DIR.
data_file = 'credit_scoring_data.csv'
data_path = os.path.join(RAW_DATA_DIR, data_file)

dataset_missing = not os.path.exists(data_path)
if dataset_missing:
    # NOTE(review): presumably writes `data_file` into RAW_DATA_DIR — confirm
    # against scripts.generate_sample_data.
    generate_sample_dataset(n_samples=1000)

# load_csv receives the bare file name rather than data_path; presumably it
# resolves the name against RAW_DATA_DIR itself — confirm in scripts.data_loader.
df = load_csv(data_file)
print(f'Dataset shape: {df.shape}')

In [None]:
# Partition the loaded frame into train/test features and targets using the
# project helper, then report the shape of each feature set.
X_train, X_test, y_train, y_test = split_data(df)
print(
    f'Training set: {X_train.shape}',
    f'Test set: {X_test.shape}',
    sep='\n',
)

In [None]:
# Build preprocessor\npreprocessor = build_preprocessor(X_train)\npreprocessor

In [None]:
# Baseline estimators, keyed by the display name that the training loop uses
# in its printed report and plot titles. Both use random_state=42.
models = {}
models['Logistic Regression'] = LogisticRegression(max_iter=1000, random_state=42)
models['Random Forest'] = RandomForestClassifier(n_estimators=100, random_state=42)

In [None]:
# Train and evaluate every baseline model on the shared train/test split.
# For each entry in `models` this fits a preprocessing + model Pipeline,
# prints ROC AUC and a classification report, plots a confusion matrix, and
# stores the fitted pipeline, AUC, and test predictions in `baseline_results`
# under the model's display name.
baseline_results = {}
for name, model in models.items():
    # Give each pipeline its own copy of the preprocessor. Pipeline.fit fits
    # its steps in place, so reusing one instance would make every pipeline
    # stored in baseline_results share (and overwrite) the same fitted
    # transformer.
    pipeline = Pipeline([
        ('preprocessor', clone(preprocessor)),
        ('model', model)
    ])

    # Train
    pipeline.fit(X_train, y_train)

    # Predict: hard labels for the report/confusion matrix, and column 1 of
    # predict_proba as the score passed to ROC AUC.
    y_pred = pipeline.predict(X_test)
    y_prob = pipeline.predict_proba(X_test)[:, 1]

    # Evaluate
    auc = roc_auc_score(y_test, y_prob)

    baseline_results[name] = {
        'pipeline': pipeline,
        'auc': auc,
        'predictions': y_pred
    }

    print(f'{name} - AUC: {auc:.4f}')
    print(classification_report(y_test, y_pred))

    # Confusion Matrix: drawn on an explicit figure/axes so successive models
    # never share pyplot's implicit current axes.
    cm = confusion_matrix(y_test, y_pred)
    fig, ax = plt.subplots()
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
    ax.set_title(f'Confusion Matrix - {name}')
    ax.set_ylabel('Actual')
    ax.set_xlabel('Predicted')
    plt.show()
    # Release the figure so figures do not accumulate across iterations.
    plt.close(fig)

In [None]:
# Compare models: one bar per baseline, with height equal to its test ROC AUC
# as recorded in baseline_results.
model_names = list(baseline_results)
auc_scores = [baseline_results[model_name]['auc'] for model_name in model_names]

fig, ax = plt.subplots(figsize=(10, 6))
bars = ax.bar(model_names, auc_scores, color=['blue', 'green'])
ax.set_xlabel('Model')
ax.set_ylabel('AUC Score')
ax.set_title('Baseline Model Comparison')
# Fix the y-axis to 0-1 so bar heights are read against the same scale.
ax.set_ylim(0, 1)
plt.show()