In [None]:
# 📦 IMPORTS
# --- General-purpose libraries ---
import os                     # File and directory operations
import pandas as pd            # Data manipulation and analysis
import numpy as np             # Numerical computations

# --- Visualization libraries ---
import seaborn as sns          # Advanced data visualization (heatmaps, boxplots, etc.)
import matplotlib.pyplot as plt  # Plotting library

# --- Scikit-learn: data splitting and preprocessing ---
from sklearn.model_selection import train_test_split  # Split data into train/validation/test sets
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler  # Scaling methods

# --- Scikit-learn: feature selection and regression models ---
from sklearn.feature_selection import RFE             # Recursive Feature Elimination
from sklearn.linear_model import LinearRegression, RidgeCV, LassoCV  # Linear, Ridge, and Lasso regressors
from sklearn.ensemble import RandomForestRegressor    # Ensemble-based regressor
from sklearn.tree import DecisionTreeRegressor        # Simple tree-based regressor

# --- Scikit-learn: classification models ---
from sklearn.linear_model import LogisticRegression   # Linear model for classification
from sklearn.tree import DecisionTreeClassifier        # Decision tree classifier
from sklearn.ensemble import RandomForestClassifier    # Random forest classifier

# --- Statistical and diagnostic tools ---
from statsmodels.stats.outliers_influence import variance_inflation_factor  # Variance Inflation Factor (multicollinearity)
from scipy.stats import spearmanr                   # Spearman correlation (non-parametric)

# --- Visualization theme ---
sns.set(style="whitegrid", context="notebook")


In [3]:

# 📂 LOAD RAW DATA
train_relative_path = '../Data/train.csv'
test_relative_path = '../Data/test.csv'

train_data = pd.read_csv(train_relative_path)
test_data = pd.read_csv(test_relative_path)

print("=== DATASET OVERVIEW ===")
print(f"Training set shape: {train_data.shape}")
print(f"Test set shape: {test_data.shape}")
print(f"Total features: {len(train_data.columns)}")

numerical_features = train_data.select_dtypes(include=[np.number]).columns.tolist()
categorical_features = train_data.select_dtypes(include=['object']).columns.tolist()

print(f"Numerical features: {len(numerical_features)}")
print(f"Categorical features: {len(categorical_features)}")

print("\n=== SAMPLE DATA ===")
display(train_data.head())


=== DATASET OVERVIEW ===
Training set shape: (75973, 14)
Test set shape: (32567, 13)
Total features: 14
Numerical features: 10
Categorical features: 4

=== SAMPLE DATA ===


Unnamed: 0,carID,Brand,model,year,price,transmission,mileage,fuelType,tax,mpg,engineSize,paintQuality%,previousOwners,hasDamage
0,69512,VW,Golf,2016.0,22290,Semi-Auto,28421.0,Petrol,,11.417268,2.0,63.0,4.0,0.0
1,53000,Toyota,Yaris,2019.0,13790,Manual,4589.0,Petrol,145.0,47.9,1.5,50.0,1.0,0.0
2,6366,Audi,Q2,2019.0,24990,Semi-Auto,3624.0,Petrol,145.0,40.9,1.5,56.0,4.0,0.0
3,29021,Ford,FIESTA,2018.0,12500,anual,9102.0,Petrol,145.0,65.7,1.0,50.0,-2.340306,0.0
4,10062,BMW,2 Series,2019.0,22995,Manual,1000.0,Petrol,145.0,42.8,1.5,97.0,3.0,0.0


In [4]:
# 📊 NUMERIC FEATURE DISTRIBUTIONS
output_dir = "[ML]_Project_EDAOutputs_Group33"
os.makedirs(output_dir, exist_ok=True)

output_file = os.path.join(output_dir, "Numeric_Variables_Histograms_Boxplots.png")
palette = sns.color_palette("Spectral", 8)

numeric_train = train_data.select_dtypes(include=[np.number]).columns
numeric_test = test_data.select_dtypes(include=[np.number]).columns
metric_cols = [col for col in numeric_train if col in numeric_test]

print(f"Found {len(metric_cols)} numeric columns present in both datasets.")

if os.path.isfile(output_file):
    print(f"File already exists: {output_file}. Skipping plot generation.")
else:
    print(f"Generating plot for {len(metric_cols)} numeric variables...")

    sp_cols = 5
    sp_rows = (len(metric_cols) + sp_cols - 1) // sp_cols

    fig, axes = plt.subplots(
        sp_rows * 2, sp_cols,
        figsize=(20, 6 * sp_rows),
        tight_layout=False,
        gridspec_kw={'height_ratios': [0.2, 0.8] * sp_rows}
    )

    for i, (ax_box, ax_hist, feat) in enumerate(zip(axes[::2].flatten(), axes[1::2].flatten(), metric_cols)):
        if feat not in train_data.columns or feat not in test_data.columns:
            continue

        data = pd.concat([train_data[[feat]], test_data[[feat]]], axis=1)
        data.columns = ['Train', 'Test']

        sns.boxplot(data=data[['Train', 'Test']], palette=[palette[0], palette[4]], orient='h', ax=ax_box)
        ax_box.set_xlabel(None)
        ax_box.set_ylabel(None)

        sns.histplot(train_data[feat], color=palette[0], kde=True, stat='percent', bins=50, alpha=0.4, ax=ax_hist, label='Train')
        sns.histplot(test_data[feat], color=palette[4], kde=True, stat='percent', bins=50, alpha=0.3, ax=ax_hist, label='Test')

        for data_type, color_idx, line_style, alpha_val in [('Train', 2, '--', 0.8), ('Test', 2, '-', 0.5)]:
            vals = train_data[feat] if data_type == 'Train' else test_data[feat]
            ax_hist.axvline(vals.mean(), color=palette[color_idx], linestyle=line_style, linewidth=1.5, alpha=alpha_val, label=f'{data_type} Mean: {vals.mean():.1f}')
            ax_hist.axvline(vals.median(), color=palette[color_idx+1], linestyle=line_style, linewidth=1.5, alpha=alpha_val, label=f'{data_type} Median: {vals.median():.1f}')

        ax_hist.set_title(feat, y=-0.20, fontweight='bold')
        ax_hist.set_xlabel(None)
        if i % sp_cols == 0:
            ax_hist.set_ylabel('Count (n)\n', fontsize=10, fontweight='bold')
        else:
            ax_hist.set_ylabel(None)

        sns.despine(top=True, right=True, ax=ax_hist)
        sns.despine(top=True, right=True, bottom=True, ax=ax_box)
        ax_hist.legend(fontsize=7, frameon=False, loc='best')

    for j in range(len(metric_cols)*2, len(axes)):
        fig.delaxes(axes.flatten()[j])

    plt.suptitle("Numeric Variables' Histograms with Boxplots", fontweight='bold', fontsize=16)
    fig.savefig(output_file, dpi=300, bbox_inches='tight')
    plt.show()
    print(f"✅ Plot saved as: {output_file}")


Found 9 numeric columns present in both datasets.
File already exists: [ML]_Project_EDAOutputs_Group33/Numeric_Variables_Histograms_Boxplots.png. Skipping plot generation.


In [5]:
# 🧹 LOAD CLEANED DATA AND SPLIT
clean_train_df = pd.read_csv("../Data/clean_data_train.csv")
print(f"✅ Clean data loaded successfully. Shape: {clean_train_df.shape}")

✅ Clean data loaded successfully. Shape: (75973, 16)


In [6]:
print(clean_train_df.columns.tolist())

['carID', 'Brand', 'model', 'year', 'transmission', 'mileage', 'fuelType', 'tax', 'mpg', 'engineSize', 'paintQuality%', 'previousOwners', 'hasDamage', 'Brand_cleaned', 'Brand_confidence', 'price']


In [7]:
target_col = "price"
X = clean_train_df.drop(columns=[target_col])
y = clean_train_df[target_col]

X_train, X_val, y_train, y_val = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42
)

print(f"✅ Training shape: {X_train.shape}")
print(f"✅ Validation shape: {X_val.shape}")

numeric_features = X_train.select_dtypes(include=["int64", "float64"]).columns.tolist()
categorical_features = X_train.select_dtypes(include=["object", "category"]).columns.tolist()

print(f"📊 Numeric features: {len(numeric_features)} -> {numeric_features}")
print(f"🔤 Categorical features: {len(categorical_features)} -> {categorical_features}")


✅ Training shape: (60778, 15)
✅ Validation shape: (15195, 15)
📊 Numeric features: 13 -> ['carID', 'year', 'transmission', 'mileage', 'fuelType', 'tax', 'mpg', 'engineSize', 'paintQuality%', 'previousOwners', 'hasDamage', 'Brand_cleaned', 'Brand_confidence']
🔤 Categorical features: 2 -> ['Brand', 'model']


In [13]:
# 🎯 FEATURE SELECTION CLASS
class NumericalFeatureSelector:
    """
    Classe para seleção de features numéricas em problemas de regressão.
    Usa apenas dados de treino para evitar data leakage.
    """

    def __init__(self, X_train, y_train, numeric_features, X_val=None, y_val=None, vif_threshold=5, corr_threshold=0.7):
        self.X_train = X_train[numeric_features].copy()
        self.y_train = y_train.copy()
        self.X_val = X_val[numeric_features].copy() if X_val is not None else None
        self.y_val = y_val.copy() if y_val is not None else None
        self.numeric_features = numeric_features
        self.vif_threshold = vif_threshold
        self.corr_threshold = corr_threshold

    # 1️⃣ Multicolinearidade / Redundância
    def vif_analysis(self):
        X = self.X_train.dropna().copy()
        vif_data = pd.DataFrame()
        vif_data["Feature"] = X.columns
        vif_data["VIF"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
        vif_data["Accepted"] = vif_data["VIF"] < self.vif_threshold
        return vif_data

    def spearman_redundancy(self):
        corr = self.X_train.corr(method='spearman').abs()
        upper = corr.where(np.triu(np.ones(corr.shape), k=1).astype(bool))
        redundancy_df = pd.DataFrame({
            "Feature": self.X_train.columns,
            "Max_SpearmanCorr": [upper[col].max(skipna=True) for col in self.X_train.columns],
        })
        redundancy_df["Accepted"] = redundancy_df["Max_SpearmanCorr"] < self.corr_threshold
        return redundancy_df

    # 2️⃣ Correlação com o Target (Relevância)
    def spearman_relevance(self, threshold=0.1):
        corr_values = []
        for col in self.X_train.columns:
            corr, _ = spearmanr(self.X_train[col], self.y_train)
            corr_values.append(abs(corr))
        corr_df = pd.DataFrame({
            "Feature": self.X_train.columns,
            "Spearman_TargetCorr": corr_values
        })
        corr_df["Accepted"] = corr_df["Spearman_TargetCorr"] > threshold
        return corr_df

    # 3️⃣ Recursive Feature Elimination (RFE)
    def rfe_model(self, model, scaler=None):
        X = self.X_train.copy()
        if scaler:
            X = scaler.fit_transform(X)
        rfe = RFE(model)
        rfe.fit(X, self.y_train)
        results = pd.DataFrame({"Feature": self.X_train.columns, "Accepted": rfe.support_})
        return results

    def rfe_all_models(self):
        models = {
            "RFE_DecisionTree": DecisionTreeRegressor(random_state=42),
            "RFE_RandomForest": RandomForestRegressor(random_state=42, n_estimators=100),
            "RFE_LR_MinMax": (LinearRegression(), MinMaxScaler()),
            "RFE_LR_Standard": (LinearRegression(), StandardScaler()),
            "RFE_LR_Robust": (LinearRegression(), RobustScaler())
        }
        results = []
        for name, model in models.items():
            df = self.rfe_model(model[0], model[1]) if isinstance(model, tuple) else self.rfe_model(model)
            df = df.rename(columns={"Accepted": name})
            df = df[["Feature", name]]
            results.append(df)
        return results

    # 4️⃣ Regularização Ridge/Lasso
    def regularization_model(self, model_type="ridge", scaler=None, threshold=0.01):
        X = self.X_train.copy()
        if scaler:
            X = scaler.fit_transform(X)
        model = RidgeCV(alphas=np.logspace(-3, 3, 50)) if model_type == "ridge" else LassoCV(alphas=np.logspace(-3, 3, 50), max_iter=10000)
        model.fit(X, self.y_train)
        coefs = np.abs(model.coef_)
        df = pd.DataFrame({
            "Feature": self.X_train.columns,
            f"{model_type.capitalize()}_Coef": coefs,
            "Accepted": coefs > threshold
        })
        return df

    def ridge_all(self):
        scalers = {"Ridge_MinMax": MinMaxScaler(), "Ridge_Standard": StandardScaler(), "Ridge_Robust": RobustScaler()}
        results = []
        for name, scaler in scalers.items():
            df = self.regularization_model("ridge", scaler)
            df = df.rename(columns={"Accepted": name})
            df = df[["Feature", name]]
            results.append(df)
        return results

    def lasso_all(self):
        scalers = {"Lasso_MinMax": MinMaxScaler(), "Lasso_Standard": StandardScaler(), "Lasso_Robust": RobustScaler()}
        results = []
        for name, scaler in scalers.items():
            df = self.regularization_model("lasso", scaler)
            df = df.rename(columns={"Accepted": name})
            df = df[["Feature", name]]
            results.append(df)
        return results

    # 5️⃣ Tabela Final
    def compile_results(self):
        results = [
            self.vif_analysis(),
            self.spearman_redundancy(),
            self.spearman_relevance(),
            *self.rfe_all_models(),
            *self.ridge_all(),
            *self.lasso_all()
        ]
        merged = results[0][["Feature"]]
        for df in results:
            merged = merged.merge(df, on="Feature", how="left")

        accept_cols = [c for c in merged.columns if "RFE_" in c or "Ridge_" in c or "Lasso_" in c or "Accepted" in c]
        merged["Total_Accepted"] = merged[accept_cols].sum(axis=1)
        merged["Final_Decision"] = np.where(merged["Total_Accepted"] > len(accept_cols) / 2, "Keep", "Drop")
        return merged


In [14]:
# Inicialização da classe FeatureSelector

fs = NumericalFeatureSelector(
    X_train=X_train,
    y_train=y_train,
    numeric_features=numeric_features,
    X_val=X_val,
    y_val=y_val
)

# ================================================================
# 🔍 Apply Selection Methods
# ================================================================
vif_results = fs.vif_analysis()
spearman_redundancy = fs.spearman_redundancy()
spearman_relevance = fs.spearman_relevance(threshold=0.1)

rfe_results = fs.rfe_all_models()
ridge_results = fs.ridge_all()
lasso_results = fs.lasso_all()

# ================================================================
# 🧾 Compile Final Table
# ================================================================
final_results = fs.compile_results()

print("\n=== 🔹 Feature Selection Summary ===")
display(final_results.head(15))

# ================================================================
# 💾 (Optional) Save results
# ================================================================
final_results.to_csv("../Data/feature_selection_summary.csv", index=False)
print("✅ Feature selection summary saved successfully!")


=== 🔹 Feature Selection Summary ===


Unnamed: 0,Feature,VIF,Accepted_x,Max_SpearmanCorr,Accepted_y,Spearman_TargetCorr,Accepted,RFE_DecisionTree,RFE_RandomForest,RFE_LR_MinMax,RFE_LR_Standard,RFE_LR_Robust,Ridge_MinMax,Ridge_Standard,Ridge_Robust,Lasso_MinMax,Lasso_Standard,Lasso_Robust,Total_Accepted,Final_Decision
0,carID,4.543459,True,,False,0.206315,True,False,False,False,False,False,True,True,True,True,True,True,8,Keep
1,year,15.112083,False,0.011737,True,0.590249,True,True,True,True,True,True,True,True,True,True,True,True,13,Keep
2,transmission,17.573693,False,0.178623,True,0.576917,True,True,True,True,True,True,True,True,True,True,True,True,13,Keep
3,mileage,1.469794,True,0.770482,False,0.513255,True,True,True,True,True,True,True,True,True,True,True,True,13,Keep
4,fuelType,59.461892,False,0.249249,True,0.261125,True,False,False,False,False,False,True,True,True,True,True,True,8,Keep
5,tax,1.450614,True,0.298805,True,0.300682,True,False,False,False,False,False,True,True,True,True,True,True,9,Keep
6,mpg,1.740166,True,0.533157,True,0.372304,True,True,True,True,True,True,True,True,True,True,True,True,14,Keep
7,engineSize,1.596378,True,0.579009,True,0.559002,True,True,True,True,True,True,True,True,True,True,True,True,14,Keep
8,paintQuality%,9.931854,False,0.006204,True,0.002123,False,False,False,False,False,False,True,True,True,True,True,True,7,Drop
9,previousOwners,1.000117,True,0.00461,True,0.001023,False,False,False,False,False,False,True,True,True,True,True,True,8,Keep


✅ Feature selection summary saved successfully!


In [20]:
# 📦 Advanced Feature Selection for Encoded Categorical Data

sns.set(style="whitegrid", context="notebook")


class CategoricalFeatureSelector:
    def __init__(self, X_train, y_train, vif_threshold=10.0, corr_threshold=0.8):
        self.X_train = X_train
        self.y_train = y_train
        self.features = X_train.columns
        self.vif_threshold = vif_threshold
        self.corr_threshold = corr_threshold

    # ======================================================
    # 1️⃣ VIF Analysis
    # ======================================================
    def vif_analysis(self):
        X = self.X_train.astype(float)
        vif_data = pd.DataFrame()
        vif_data["Feature"] = X.columns
        vif_data["VIF"] = [self._safe_vif(X, i) for i in range(X.shape[1])]
        vif_data["Accepted"] = vif_data["VIF"] < self.vif_threshold
        return vif_data

    def _safe_vif(self, X, idx):
        try:
            return variance_inflation_factor(X.values, idx)
        except Exception:
            return np.nan

    # ======================================================
    # 2️⃣ Spearman Redundancy (proxy for Cramér's V)
    # ======================================================
    def redundancy(self):
        corr = self.X_train.corr(method='spearman').abs()
        upper = corr.where(np.triu(np.ones(corr.shape), k=1).astype(bool))
        to_drop = [c for c in upper.columns if any(upper[c] > self.corr_threshold)]
        accepted = ~self.X_train.columns.isin(to_drop)
        return pd.DataFrame({"Feature": self.X_train.columns, "Accepted": accepted})

    # ======================================================
    # 3️⃣ RFE with Different Models
    # ======================================================
    def rfe_results(self, n_features=10):
        models = {
            "RFE Decision Tree": DecisionTreeClassifier(random_state=42),
            "RFE Random Forest": RandomForestClassifier(random_state=42, n_jobs=-1),
            "RFE LR MinMax": LogisticRegression(max_iter=500, solver="lbfgs"),
            "RFE LR Standard": LogisticRegression(max_iter=500, solver="lbfgs"),
            "RFE LR Robust": LogisticRegression(max_iter=500, solver="lbfgs"),
        }

        scalers = {
            "RFE LR MinMax": MinMaxScaler(),
            "RFE LR Standard": StandardScaler(),
            "RFE LR Robust": RobustScaler(),
        }

        results = {}

        for name, model in models.items():
            X = self.X_train
            if name in scalers:
                X = pd.DataFrame(scalers[name].fit_transform(X), columns=self.X_train.columns)

            rfe = RFE(model, n_features_to_select=n_features)
            rfe.fit(X, self.y_train)
            results[name] = rfe.support_

        return pd.DataFrame(results, index=self.features)

    # ======================================================
    # 4️⃣ Lasso & Ridge with 3 Scalers
    # ======================================================
    def lasso_ridge_results(self, threshold_factor=1.25, C=1, max_iter=500):
        scalers = {
            "MinMax": MinMaxScaler(),
            "Standard": StandardScaler(),
            "Robust": RobustScaler()
        }
        results = {}

        for scale_name, scaler in scalers.items():
            X_scaled = scaler.fit_transform(self.X_train)
            y = self.y_train

            # --- Lasso ---
            lasso_selector = SelectFromModel(
                LogisticRegression(C=C, penalty="l1", solver="saga", max_iter=max_iter),
                threshold=f"{threshold_factor}*mean"
            ).fit(X_scaled, y)
            results[f"Lasso {scale_name}"] = lasso_selector.get_support()

            # --- Ridge ---
            ridge_selector = SelectFromModel(
                LogisticRegression(C=C, penalty="l2", solver="sag", max_iter=max_iter),
                threshold=f"{threshold_factor}*mean"
            ).fit(X_scaled, y)
            results[f"Ridge {scale_name}"] = ridge_selector.get_support()

        return pd.DataFrame(results, index=self.features)

    # ======================================================
    # 5️⃣ Chi-Squared Test
    # ======================================================
    def chi_squared_results(self):
        X = self.X_train.copy()
        X[X < 0] = 0  # chi2 requires non-negative features
        chi_scores, p_values = chi2(X, self.y_train)
        accepted = p_values < 0.05
        return pd.DataFrame({"Chi-Squared": accepted}, index=self.features)

    # ======================================================
    # 6️⃣ Combine Everything into Final Decision Table
    # ======================================================
    def build_summary_table(self):
        vif_df = self.vif_analysis()[["Feature", "Accepted"]].set_index("Feature").rename(columns={"Accepted": "VIF"})
        red_df = self.redundancy().set_index("Feature").rename(columns={"Accepted": "Redundancy"})

        rfe_df = self.rfe_results()
        lr_df = self.lasso_ridge_results()
        chi_df = self.chi_squared_results()

        summary = pd.concat([vif_df, red_df, rfe_df, lr_df, chi_df], axis=1)
        summary = summary.fillna(False)
        summary["Accepted_Count"] = summary.sum(axis=1)
        summary["What to Do?"] = summary["Accepted_Count"].apply(lambda x: "Keep" if x >= summary.shape[1] * 0.5 else "Remove")

        summary.reset_index(inplace=True)
        summary.insert(0, "#", range(1, len(summary) + 1))
        return summary

    # ======================================================
    # 7️⃣ Optional Plot for Ridge/Lasso
    # ======================================================
    def plot_lasso_ridge(self, lasso_coef, ridge_coef, title_suffix=""):
        fig, ax = plt.subplots(1, 2, figsize=(18, 25), sharey=True)
        ridge_coef.sort_values(ascending=True).plot(kind='barh', color='#3182BD', ax=ax[0])
        lasso_coef.sort_values(ascending=True).plot(kind='barh', color='#6BAED6', ax=ax[1])
        ax[0].set_title(f"Ridge Importance {title_suffix}")
        ax[1].set_title(f"Lasso Importance {title_suffix}")
        plt.tight_layout()
        plt.show()


In [22]:
cat_fs = CategoricalFeatureSelector(X_train, y_train)

summary_table = cat_fs.build_summary_table()

# Display the final summary
display(summary_table)

# Save to Excel if you want
summary_table.to_excel("Categorical_Feature_Selection_Summary.xlsx", index=False)


ValueError: could not convert string to float: 'Ford'