## IMPORT LIBRARIES

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import json
import joblib
import warnings
from datetime import datetime

# Sklearn utilities
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Models
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

## LOAD ENGINEERED FEATURES

In [2]:
# Read the scaled, engineered feature table from CSV.
df = pd.read_csv('../dataset/processed/engineered_features_scaled.csv')

# Read the feature metadata. JSON text is UTF-8, so the encoding is pinned
# rather than left to the platform's locale default (which may differ,
# e.g. on Windows, and mis-decode non-ASCII content).
with open('../dataset/processed/feature_metadata.json', 'r', encoding='utf-8') as f:
    metadata = json.load(f)

# Fail fast with a readable message if the table does not contain every
# feature listed in the metadata or the 'score' target used below.
missing_cols = [
    col for col in [*metadata['feature_names'], 'score'] if col not in df.columns
]
if missing_cols:
    raise KeyError(f"Columns missing from engineered features: {missing_cols}")

# Summary banner: row count, feature list and target statistics.
print("=" * 60)
print("DATA OVERVIEW")
print("=" * 60)
print(f"\nTotal samples: {len(df)}")
print(f"Number of features: {len(metadata['feature_names'])}")
print(f"\nFeatures: {metadata['feature_names']}")
print(f"\nTarget distribution:")
print(df['score'].describe())

DATA OVERVIEW

Total samples: 42650
Number of features: 16

Features: ['skill_jaccard', 'skill_coverage', 'skill_precision', 'skill_overlap_count', 'job_skills_count', 'cv_skills_count', 'experience_gap', 'experience_ratio', 'experience_match', 'job_experience_required', 'cv_experience_years', 'education_gap', 'education_match', 'job_education_level', 'cv_education_level', 'seniority_match_score']

Target distribution:
count    42650.000000
mean        33.398676
std         14.320120
min          2.010000
25%         23.450000
50%         30.360000
75%         41.250000
max        100.000000
Name: score, dtype: float64


## PREPARE DATA SPLITS

In [5]:
# Feature matrix and regression target; the column list comes from the metadata.
feature_cols = metadata["feature_names"]
X, y = df[feature_cols], df["score"]

# Target ratio for the partitions: 70/15/15.
print("Splitting data into train/validation/test sets...")
print("Strategy: 70% train, 15% validation, 15% test\n")

# train_test_split only cuts one set into two, so it is applied twice.

# Pass 1: keep 70% for training, hold the remaining 30% in a temporary pool.
X_train, X_temp, y_train, y_temp = train_test_split(
    X,
    y,
    test_size=0.3,
    shuffle=True,
    random_state=42,
)

# Pass 2: halve the temporary pool into validation and test (50/50).
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.5,
    shuffle=True,
    random_state=42,
)

# Size of each partition, absolute and as a share of the full frame.
print("\n".join(
    f"{label}{len(part)} samples ({len(part)/len(df)*100:.1f}%)"
    for label, part in (
        ("Training set:   ", X_train),
        ("Validation set: ", X_val),
        ("Test set:       ", X_test),
    )
))

# Compare the target's mean/std across partitions to spot a skewed split.
print("\nTarget distribution across splits:")
print("\n".join(
    f"{label}mean={target.mean():.2f}, std={target.std():.2f}"
    for label, target in (
        ("Train:      ", y_train),
        ("Validation: ", y_val),
        ("Test:       ", y_test),
    )
))

Splitting data into train/validation/test sets...
Strategy: 70% train, 15% validation, 15% test

Training set:   29855 samples (70.0%)
Validation set: 6397 samples (15.0%)
Test set:       6398 samples (15.0%)

Target distribution across splits:
Train:      mean=33.40, std=14.36
Validation: mean=33.59, std=14.26
Test:       mean=33.22, std=14.19
