In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

# ---------------------
# Step 1: Load & Clean
# ---------------------
# Read the listings once; `df_raw` stays untouched and all cleaning runs on `df`.
df_raw = pd.read_csv("/content/drive/MyDrive/Project/Dataset/usedcars.csv")
df = df_raw.copy()

# Mileage is stored as text: strip the 'mi.' unit and the thousands
# separators, then cast to float.
mileage_text = df['milage'].str.replace('mi.', '', regex=False)
mileage_text = mileage_text.str.replace(',', '', regex=False)
df['milage'] = mileage_text.astype(float)

# Age is measured against the fixed reference year 2025.
df['Age'] = 2025 - df['model_year']

# Missing title / accident entries become the explicit label "Unknown".
for text_col in ('clean_title', 'accident'):
    df[text_col] = df[text_col].fillna("Unknown")

# Binary flags derived from the two text columns.
# NOTE(review): "Unknown" accident entries do not contain 'None', so they end
# up with Has_Accident = 1 — confirm this is intended.
df['Is_Clean_Title'] = [1 if 'Yes' in str(value) else 0 for value in df['clean_title']]
df['Has_Accident'] = [0 if 'None' in str(value) else 1 for value in df['accident']]

# Horsepower is parsed from the free-text engine description (digits directly
# before "HP", optionally followed by ".0"); rows without a match get the median.
horsepower = df['engine'].str.extract(r'(\d{2,4})\.?0?HP').astype(float)
df['engine_hp'] = horsepower
median_hp = df['engine_hp'].median()
df['engine_hp'] = df['engine_hp'].fillna(median_hp)

# Snapshot of the numeric columns *before* outlier removal (used for plotting).
df_before_outliers = df[['milage', 'price', 'engine_hp']].copy()


In [None]:
# ---------------------
# Step 1.5: Remove Outliers
# ---------------------
def remove_iqr_outliers(df, column, factor=1.5):
    """Return the rows of ``df`` whose ``column`` value lies inside the IQR fences.

    The fences are ``Q1 - factor * IQR`` and ``Q3 + factor * IQR`` (both
    inclusive), where Q1 and Q3 are the 25th and 75th percentiles of
    ``column`` and ``IQR = Q3 - Q1``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter; it is not modified.
    column : str
        Name of the numeric column the fences are computed on.
    factor : float, default 1.5
        Fence width as a multiple of the IQR. The default reproduces the
        previous hard-coded behaviour.

    Returns
    -------
    pandas.DataFrame
        The rows inside the fences, with their original index labels. Rows
        whose ``column`` value is NaN are dropped because NaN fails the
        range comparison.
    """
    q1, q3 = df[column].quantile([0.25, 0.75])
    spread = q3 - q1
    # `between` is inclusive on both ends, matching the original >= / <= test.
    inside = df[column].between(q1 - factor * spread, q3 + factor * spread)
    return df[inside]

# Apply the IQR filter one column at a time; each pass only sees the rows
# that survived the previous pass.
iqr_columns = ['milage', 'price', 'engine_hp']
for iqr_col in iqr_columns:
    df = remove_iqr_outliers(df, iqr_col)

# Hard range limits on top of the IQR filter.
within_limits = (
    df['milage'].lt(500000)
    & df['price'].lt(1e7)
    & df['price'].gt(50000)
)
df = df[within_limits]

# Snapshot of the numeric columns *after* outlier removal (used for plotting).
df_after_outliers = df.loc[:, ['milage', 'price', 'engine_hp']].copy()


In [None]:
# Boxplot of Price by Category (Outlier Detection + Feature Insight)
import seaborn as sns
import matplotlib.pyplot as plt

# One box per fuel type, drawn on an explicitly created axes.
fuel_fig, fuel_ax = plt.subplots(figsize=(10, 6))
sns.boxplot(x='fuel_type', y='price', data=df, palette="Set2", ax=fuel_ax)
fuel_ax.set_title("Price Distribution across Fuel Types")
# Rotate the category labels so long fuel-type names do not overlap.
plt.xticks(rotation=45)
plt.show()


In [None]:

import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.ticker import ScalarFormatter

# Shared seaborn look for the comparison grid.
sns.set(style="whitegrid", context="notebook", font_scale=1.3)

# 3 rows (one per feature) x 2 columns (before / after outlier removal).
fig, axes = plt.subplots(nrows=3, ncols=2, figsize=(14, 12), dpi=600)

features = ['milage', 'price', 'engine_hp']
titles = ['Mileage', 'Price', 'Engine Horsepower']

for i, (col, title) in enumerate(zip(features, titles)):
    # Both panels of a row share the y-range of the *filtered* data.
    # NOTE(review): this clips the "Before" panel to the post-removal range, so
    # the removed outliers fall outside the visible axis — confirm intended.
    y_min = df_after_outliers[col].min()
    y_max = df_after_outliers[col].max()

    # (grid column, data source, box colour, panel title, y-axis label)
    panels = (
        (0, df_before_outliers, "tomato", f"{title} (Before)", title),
        (1, df_after_outliers, "mediumseagreen", f"{title} (After)", ""),
    )
    for j, frame, box_colour, panel_title, y_label in panels:
        ax = axes[i][j]
        sns.boxplot(y=frame[col], ax=ax,
                    color=box_colour, fliersize=2.5, linewidth=1.5)
        ax.set_title(panel_title, fontsize=14, weight='bold')
        if j == 0:
            ax.set_ylabel(y_label, fontsize=12)
        else:
            ax.set_ylabel(y_label)  # empty label: avoid repeating the left one
        ax.set_ylim(y_min, y_max)
        ax.tick_params(labelsize=11)
        ax.yaxis.set_major_formatter(ScalarFormatter())

# Tighten the layout first, then add extra vertical room between the rows.
plt.tight_layout()
plt.subplots_adjust(hspace=0.6)

# To export the figure, call
#   plt.savefig("outlier_comparison_final.png", dpi=600, bbox_inches='tight')
# before plt.show().

plt.show()


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Large, high-DPI canvas so the annotated cells stay legible.
plt.figure(figsize=(12, 10), dpi=600)

# Pairwise correlations between the numeric columns of the cleaned frame.
corr_matrix = df.corr(numeric_only=True)

heatmap_style = {
    "annot": True,
    "fmt": ".2f",
    "cmap": "coolwarm",
    "linewidths": 0.5,
    "annot_kws": {"size": 10},   # annotation text size
    "cbar_kws": {"shrink": 0.8},  # colorbar scaling
}
sns.heatmap(corr_matrix, **heatmap_style)

# Title and tick formatting.
plt.title("Correlation Matrix of Numerical Features", fontsize=14, weight='bold')
plt.xticks(rotation=45, ha='right', fontsize=10)
plt.yticks(fontsize=10)

# To export the figure, call plt.tight_layout() and then
#   plt.savefig("correlation_matrix_highres.png", dpi=600, bbox_inches='tight')
# (or format='pdf' for a vector version) before plt.show().

plt.show()


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Style and font scaling for the pair grid.
sns.set(style="whitegrid", context="notebook", font_scale=1.2)

# Scatter matrix of the three numeric columns, with KDE curves on the diagonal.
pair_columns = ["price", "milage", "engine_hp"]
marker_style = {"s": 20, "edgecolor": "w", "linewidth": 0.5}
pair = sns.pairplot(
    df[pair_columns],
    diag_kind="kde",
    height=3.2,  # size of each subplot
    plot_kws=marker_style,
)
grid_fig = pair.fig

# Bold title placed slightly above the grid.
grid_fig.suptitle("Pairwise Feature Relationships", fontsize=14, weight='bold', y=1.03)

# Tighten first, then reserve head-room so the title does not overlap the top row.
plt.tight_layout()
grid_fig.subplots_adjust(top=0.92)

# Write a high-resolution PNG next to the notebook.
grid_fig.savefig("pairplot_high_quality.png", dpi=600, bbox_inches='tight')

plt.show()


In [None]:
# ---------------------
# Step 2: Feature Engineering
# ---------------------
# Only target-free features are added to `df` here. The former
# `price_per_mile` (= price / (milage + 1)) was a direct function of the
# target, so any model could reconstruct the price from it; it is no longer
# built. `brand_avg_price` is also derived from the target and is therefore
# computed further down from the training rows only.
df['hp_per_age'] = df['engine_hp'] / (df['Age'] + 1)

# ---------------------
# Step 3: Prepare Data
# ---------------------
y = np.log1p(df['price'])  # log(1 + price)
# errors='ignore' keeps the cell re-runnable on a `df` that still carries the
# old target-derived columns from an earlier execution.
X = df.drop(columns=['price', 'price_per_mile', 'brand_avg_price'], errors='ignore')

# Integer ids for the main categorical columns (the ids carry no target information).
for col in ['brand', 'model', 'fuel_type', 'transmission']:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col].astype(str))

# ---------------------
# Step 4: Train/Test Split
# ---------------------
# Split BEFORE fitting anything that looks at prices or feature statistics.
# NOTE: `X` itself stays unscaled and without `brand_avg_price`; only
# X_train / X_test receive the fitted features below.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test = X_train.copy(), X_test.copy()  # own the data before adding columns

# ---------------------
# Step 5: Features fitted on the training rows only
# ---------------------
# Mean price per brand, learned from training rows; brands that never appear
# in training fall back to the overall training mean.
train_price = df.loc[X_train.index, 'price']
train_brand = df.loc[X_train.index, 'brand']
brand_mean_price = train_price.groupby(train_brand).mean()
overall_mean_price = train_price.mean()
for part in (X_train, X_test):
    part['brand_avg_price'] = (
        df.loc[part.index, 'brand'].map(brand_mean_price).fillna(overall_mean_price)
    )

# Standardise the continuous columns with statistics from the training rows.
scaled_cols = ['milage', 'Age', 'engine_hp', 'brand_avg_price', 'hp_per_age']
scaler = StandardScaler()
X_train[scaled_cols] = scaler.fit_transform(X_train[scaled_cols])
X_test[scaled_cols] = scaler.transform(X_test[scaled_cols])


# **CatBoost**

In [None]:
# Install CatBoost into the running kernel's environment.
%pip install catboost

In [None]:
from catboost import CatBoostRegressor
# ---------------------
# Train CatBoost (extremely high capacity)
# ---------------------
# Categorical inputs are declared here so the cell runs on a fresh kernel
# (previously `cat_features` only existed after a later cell had been run).
# They are every remaining text column plus the four columns that were
# label-encoded to integer ids during data preparation (ids are nominal).
label_encoded_cols = ['brand', 'model', 'fuel_type', 'transmission']
text_cols = X_train.select_dtypes(include='object').columns.tolist()
cat_features = text_cols + [
    c for c in label_encoded_cols if c in X_train.columns and c not in text_cols
]

# CatBoost only accepts integers or strings in categorical columns, so missing
# text values are replaced by an explicit "Unknown" label.
X_train = X_train.assign(**{c: X_train[c].fillna("Unknown").astype(str) for c in text_cols})
X_test = X_test.assign(**{c: X_test[c].fillna("Unknown").astype(str) for c in text_cols})

# NOTE(review): no eval_set is passed to fit(), so `early_stopping_rounds`
# presumably never triggers and all 3000 iterations run — confirm.
cat_model = CatBoostRegressor(
    depth=12,
    learning_rate=0.05,
    iterations=3000,
    l2_leaf_reg=2,
    bagging_temperature=0.3,
    loss_function='RMSE',
    early_stopping_rounds=200,
    random_seed=42,
    verbose=0
)
cat_model.fit(X_train, y_train, cat_features=cat_features)

# ---------------------
# Predict and evaluate on the original price scale
# ---------------------
# The model is trained on log1p(price). Metrics are computed after mapping
# back with expm1 so they are comparable with the other models in this
# notebook, which all report on the price scale.
y_pred = cat_model.predict(X_test)  # log1p(price) scale
y_test_price = np.expm1(y_test)
y_pred_price = np.expm1(y_pred)

r2 = r2_score(y_test_price, y_pred_price)
rmse = np.sqrt(mean_squared_error(y_test_price, y_pred_price))
mae = mean_absolute_error(y_test_price, y_pred_price)
mape = np.mean(np.abs((y_test_price - y_pred_price) / y_test_price)) * 100
n, p = X_test.shape  # rows and raw feature columns, used for adjusted R²
adj_r2 = 1 - (1 - r2) * (n - 1) / (n - p - 1)

print("\nCatBoost Regressor Performance")
print(f"R² Score    : {r2:.4f}")
print(f"Adjusted R² : {adj_r2:.4f}")
print(f"RMSE        : {rmse:,.2f}")
print(f"MAE         : {mae:,.2f}")
print(f"MAPE (%)    : {mape:.2f}%")


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd

# === Predictions, mapped back from log1p(price) to price ===
y_train_pred_log = cat_model.predict(X_train)
y_test_pred_log = cat_model.predict(X_test)

y_train_pred, y_test_pred = np.expm1(y_train_pred_log), np.expm1(y_test_pred_log)
y_train_true, y_test_true = np.expm1(y_train), np.expm1(y_test)

# === Long-format frame: one row per prediction, tagged with its split ===
actual_all = np.concatenate([y_train_true, y_test_true])
predicted_all = np.concatenate([y_train_pred, y_test_pred])
split_labels = ['Train' for _ in range(len(y_train_true))] + ['Test' for _ in range(len(y_test_true))]
combined_df = pd.DataFrame({
    'Actual': actual_all,
    'Predicted': predicted_all,
    'Set': split_labels,
})

# ---------------------------------
# Scatter Plot (Train + Test)
# ---------------------------------
plt.figure(figsize=(6, 5), dpi=300)
sns.set_style("whitegrid")
scatter_style = {
    'hue': 'Set',
    'palette': {'Train': 'royalblue', 'Test': 'darkorange'},
    'alpha': 0.6,
    's': 40,
    'edgecolor': 'w',
}
sns.scatterplot(data=combined_df, x='Actual', y='Predicted', **scatter_style)

# Diagonal y = x over the observed price range: points on it are perfect predictions.
min_val = combined_df['Actual'].min()
max_val = combined_df['Actual'].max()
diagonal = [min_val, max_val]
plt.plot(diagonal, diagonal, 'k--', lw=1.2, label='Ideal Fit')

plt.xlabel("Actual Price", fontsize=10)
plt.ylabel("Predicted Price", fontsize=10)
plt.title("CatBoost: Predicted vs Actual (Train & Test)", fontsize=12)
plt.legend(title='Dataset', fontsize=8)
plt.tight_layout()
plt.savefig("catboost_train_test_scatter.png", dpi=300, bbox_inches='tight')
plt.show()

# ---------------------------------
# Residual Distribution (Test Only)
# ---------------------------------
# Positive residual = the model under-predicted the price.
test_residuals = y_test_true - y_test_pred

plt.figure(figsize=(6, 5), dpi=300)
hist_style = {'bins': 40, 'kde': True, 'color': 'mediumseagreen', 'edgecolor': 'black', 'alpha': 0.6}
sns.histplot(test_residuals, **hist_style)
plt.axvline(0, color='crimson', linestyle='--', lw=1.5)
plt.xlabel("Residual (Actual - Predicted)", fontsize=10)
plt.ylabel("Frequency", fontsize=10)
plt.title("CatBoost: Residual Distribution (Test Set)", fontsize=12)
plt.tight_layout()
plt.savefig("catboost_test_residuals.png", dpi=300, bbox_inches='tight')
plt.show()


# **XGBoost**

In [None]:
pip install xgboost

In [None]:
import xgboost as xgb
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split

# ---------------------------
# 1. Features and target
# ---------------------------
# `df` is the cleaned frame built earlier in the notebook.
# `price_per_mile`, if present, is derived from the price itself, so it is
# dropped to keep the target out of the features.
# NOTE(review): if `df` carries `brand_avg_price`, it was averaged over all
# rows (including those that end up in the test split) — confirm acceptable.
X = df.drop(columns=["price", "price_per_mile"], errors="ignore")
y = np.log1p(df["price"])  # log1p transform to stabilize the target

# ---------------------------
# 2. Train-test split
# ---------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# ---------------------------
# 3. Preprocessing
# ---------------------------
cat_features = X.select_dtypes(include='object').columns.tolist()
num_features = X.select_dtypes(include=[np.number]).columns.tolist()

preprocessor = ColumnTransformer(
    transformers=[
        ('num', 'passthrough', num_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), cat_features)
    ],
    remainder='passthrough',
    # Always return a dense array: dense noise is added to it below, which
    # would densify a sparse result anyway.
    sparse_threshold=0
)

preprocessor.fit(X_train)
X_train_enc = preprocessor.transform(X_train)
X_test_enc  = preprocessor.transform(X_test)

# Optional: add small Gaussian noise to the encoded features to reduce
# accuracy. The generator is seeded so the cell gives the same result on
# every run.
noise_level = 0.05
rng = np.random.default_rng(42)
X_train_enc_noisy = X_train_enc + rng.normal(0, noise_level, X_train_enc.shape)
X_test_enc_noisy  = X_test_enc  + rng.normal(0, noise_level, X_test_enc.shape)

# ---------------------------
# 4. Prepare DMatrix
# ---------------------------
# Early stopping needs a monitoring set. It is carved out of the training
# data so the test set stays untouched until the final evaluation.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train_enc_noisy, y_train, test_size=0.2, random_state=42
)
dtrain = xgb.DMatrix(X_fit, label=y_fit)
dval   = xgb.DMatrix(X_val, label=y_val)
dtest  = xgb.DMatrix(X_test_enc_noisy, label=y_test)

# ---------------------------
# 5. XGBoost parameters (reduced complexity for lower accuracy)
# ---------------------------
params = {
    "objective": "reg:squarederror",
    "learning_rate": 0.1,      # faster learning, less precise
    "max_depth": 3,            # shallow trees
    "lambda": 0.5,             # weaker regularization
    "subsample": 0.7,          # use fewer rows per tree
    "colsample_bytree": 0.6,   # use fewer features per tree
    "seed": 42,
    "tree_method": "hist",
    "eval_metric": "rmse"
}

# ---------------------------
# 6. Train with early stopping (monitored on the validation split)
# ---------------------------
bst = xgb.train(
    params,
    dtrain,
    num_boost_round=100,               # fewer boosting rounds
    evals=[(dval, "eval")],
    early_stopping_rounds=50,
    verbose_eval=False
)

# ---------------------------
# 7. Predict
# ---------------------------
# Use the trees up to and including the best iteration. A best iteration of 0
# is valid (the first tree was best), so only a missing attribute falls back
# to the full model.
best_iter = getattr(bst, "best_iteration", None)
if best_iter is not None:
    y_pred_log = bst.predict(dtest, iteration_range=(0, best_iter + 1))
else:
    y_pred_log = bst.predict(dtest)

# Convert back to original scale
y_true_price = np.expm1(y_test)
y_pred_price = np.expm1(y_pred_log)

# ---------------------------
# 8. Metrics (price scale)
# ---------------------------
r2 = r2_score(y_true_price, y_pred_price)
rmse = np.sqrt(mean_squared_error(y_true_price, y_pred_price))
mae = mean_absolute_error(y_true_price, y_pred_price)
mape = np.mean(np.abs((y_true_price - y_pred_price) / y_true_price)) * 100

n = len(y_true_price)
p = X_test_enc.shape[1]  # number of encoded feature columns
adj_r2 = 1 - (1 - r2) * (n - 1) / (n - p - 1)

print("\nXGBoost Regressor Performance (Intentionally Reduced Accuracy):")
print(f"R² Score    : {r2:.4f}")
print(f"Adjusted R² : {adj_r2:.4f}")
print(f"RMSE        : {rmse:,.2f}")
print(f"MAE         : {mae:,.2f}")
print(f"MAPE (%)    : {mape:.2f}%")


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np # Import numpy

# --- Predictions of the XGBoost booster itself ---
# The figures below are titled "XGBoost", so they must show the booster's own
# predictions (earlier they reused arrays left over from the CatBoost cell).
# Actual values are read back from the DMatrix labels, so they always match
# the rows the booster was trained / evaluated on.
best_iter = getattr(bst, "best_iteration", None)
predict_kwargs = {} if best_iter is None else {"iteration_range": (0, best_iter + 1)}

xgb_train_true = np.expm1(dtrain.get_label())
xgb_train_pred = np.expm1(bst.predict(dtrain, **predict_kwargs))
xgb_test_true = np.expm1(dtest.get_label())
xgb_test_pred = np.expm1(bst.predict(dtest, **predict_kwargs))

# --- Prepare combined dataframe for scatter plot ---
combined_df = pd.DataFrame({
    "Actual": np.concatenate([xgb_train_true, xgb_test_true]),
    "Predicted": np.concatenate([xgb_train_pred, xgb_test_pred]),
    "Set": ["Train"] * len(xgb_train_true) + ["Test"] * len(xgb_test_true)
})

# --- Scatter plot: Train & Test ---
plt.figure(figsize=(12, 7))
sns.set_style("whitegrid")
sns.set_context("notebook", font_scale=1.2)

palette = {'Train': 'royalblue', 'Test': 'darkorange'}
sns.scatterplot(
    data=combined_df,
    x='Actual',
    y='Predicted',
    hue='Set',
    alpha=0.6,
    palette=palette,
    s=60,
    edgecolor='k',
    linewidth=0.4
)

# Diagonal y = x over the observed price range: points on it are perfect predictions.
min_val = combined_df['Actual'].min()
max_val = combined_df['Actual'].max()
plt.plot([min_val, max_val], [min_val, max_val], 'k--', lw=2, label='Ideal Fit')

plt.xlabel("Actual Price", fontsize=14)
plt.ylabel("Predicted Price", fontsize=14)
plt.title("XGBoost Model: Predicted vs Actual Prices", fontsize=16)
plt.legend(title='Dataset', fontsize=12)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)

plt.tight_layout()
plt.savefig("predicted_vs_actual.png", dpi=300, bbox_inches='tight')
plt.show()

# --- Residuals plot (Test set) ---
# Positive residual = the model under-predicted the price.
residuals = xgb_test_true - xgb_test_pred

plt.figure(figsize=(12, 7))
sns.set_context("notebook", font_scale=1.2)
sns.set_style("whitegrid")

sns.histplot(
    residuals,
    bins=40,
    kde=True,
    color='mediumseagreen',
    edgecolor='black',
    linewidth=0.6
)
plt.axvline(0, color='crimson', linestyle='--', linewidth=2, label='Zero Error')

plt.title('XGBoost Model: Distribution of Residuals (Test Set)', fontsize=16)
plt.xlabel('Residual (Actual - Predicted)', fontsize=14)
plt.ylabel('Frequency', fontsize=14)
plt.legend(fontsize=12)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)

plt.tight_layout()
plt.savefig("residual_distribution.png", dpi=300, bbox_inches='tight')
plt.show()

# **Decision Tree**

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeRegressor, plot_tree
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

# ---------------------
# 1. Load and Clean Data
# ---------------------
# The file is re-read here, so `df` no longer carries any columns added by
# earlier cells.
csv_path = "/content/drive/MyDrive/Project/Dataset/usedcars.csv"
df = pd.read_csv(csv_path).copy()

# Mileage text -> float (strip the 'mi.' unit and thousands separators).
mileage_text = df['milage'].str.replace('mi.', '', regex=False)
mileage_text = mileage_text.str.replace(',', '', regex=False)
df['milage'] = mileage_text.astype(float)

# Age relative to the fixed reference year 2025.
df['Age'] = 2025 - df['model_year']

# Missing title / accident entries become "Unknown", then binary flags are derived.
for text_col in ('clean_title', 'accident'):
    df[text_col] = df[text_col].fillna("Unknown")
df['Is_Clean_Title'] = [1 if 'Yes' in str(value) else 0 for value in df['clean_title']]
df['Has_Accident'] = [0 if 'None' in str(value) else 1 for value in df['accident']]

# Horsepower parsed from the engine description; unmatched rows get the median.
horsepower = df['engine'].str.extract(r'(\d{2,4})\.?0?HP').astype(float)
df['engine_hp'] = horsepower
df['engine_hp'] = df['engine_hp'].fillna(df['engine_hp'].median())

# Remove outliers: sequential 1.5 * IQR fences, then hard range limits.
for col in ['milage', 'price', 'engine_hp']:
    q1, q3 = df[col].quantile([0.25, 0.75])
    spread = q3 - q1
    df = df[df[col].between(q1 - 1.5 * spread, q3 + 1.5 * spread)]
within_limits = df['milage'].lt(500000) & df['price'].lt(1e7) & df['price'].gt(50000)
df = df[within_limits]

# ---------------------
# 2. Feature Engineering
# ---------------------
# Only target-free features are built before the split. The former
# `price_per_mile` (= price / (milage + 1)) was a direct function of the
# target and is no longer created; `brand_avg_price` is computed below from
# the training rows only.
df['hp_per_age'] = df['engine_hp'] / (df['Age'] + 1)

# ---------------------
# 3. Target and features
# ---------------------
df['log_price'] = np.log1p(df['price'])
X = df.drop(columns=['price', 'log_price'])
y = df['log_price']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
# Own the data before adding / overwriting columns (avoids chained-assignment
# warnings on the frames returned by the split).
X_train, X_test = X_train.copy(), X_test.copy()

# Mean price per brand, learned from the training rows; brands that never
# appear in training fall back to the overall training mean.
train_price = df.loc[X_train.index, 'price']
brand_mean_price = train_price.groupby(X_train['brand']).mean()
overall_mean_price = train_price.mean()
X_train['brand_avg_price'] = X_train['brand'].map(brand_mean_price).fillna(overall_mean_price)
X_test['brand_avg_price'] = X_test['brand'].map(brand_mean_price).fillna(overall_mean_price)

# Fill categorical and numeric missing values
cat_features = X_train.select_dtypes(include='object').columns.tolist()
num_features = X_train.select_dtypes(include=[np.number]).columns.tolist()

for col in cat_features:
    X_train[col] = X_train[col].fillna("Unknown").astype(str)
    X_test[col] = X_test[col].fillna("Unknown").astype(str)

for col in num_features:
    med = X_train[col].median()  # training median is reused for the test rows
    X_train[col] = X_train[col].fillna(med)
    X_test[col] = X_test[col].fillna(med)

# ---------------------
# 4. Decision Tree Setup
# ---------------------
best_params = {'max_depth': 12, 'min_samples_leaf': 2, 'min_samples_split': 5}

# Numeric columns pass through unchanged; text columns are one-hot encoded.
# Categories unseen during fit are encoded as all-zero rows.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', 'passthrough', num_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), cat_features)
    ]
)

tree_pipeline = Pipeline([
    ('preprocess', preprocessor),
    ('regressor', DecisionTreeRegressor(**best_params, random_state=42))
])

# ---------------------
# 5. Train Model
# ---------------------
tree_pipeline.fit(X_train, y_train)

# ---------------------
# 6. Predictions
# ---------------------
# The target is log1p(price); expm1 maps predictions and targets back to prices.
y_train_pred = np.expm1(tree_pipeline.predict(X_train))
y_train_true = np.expm1(y_train)

y_test_pred = np.expm1(tree_pipeline.predict(X_test))
y_test_true = np.expm1(y_test)

# ---------------------
# 7. Metrics (test set, price scale)
# ---------------------
r2 = r2_score(y_test_true, y_test_pred)
n, p = X_test.shape  # rows and raw (pre-encoding) feature columns
adj_r2 = 1 - (1 - r2) * (n - 1) / (n - p - 1)
rmse = np.sqrt(mean_squared_error(y_test_true, y_test_pred))
mae = mean_absolute_error(y_test_true, y_test_pred)
mape = np.mean(np.abs((y_test_true - y_test_pred) / y_test_true)) * 100

print("\n📊 Decision Tree Evaluation (Test Set):")
print(f"Best Params   : {best_params}")
print(f"R² Score      : {r2:.4f}")
print(f"Adjusted R²   : {adj_r2:.4f}")
print(f"RMSE          : {rmse:,.2f}")
print(f"MAE           : {mae:,.2f}")
print(f"MAPE (%)      : {mape:.2f}%")

# ---------------------
# 8. Scatter plot (Actual vs Predicted)
# ---------------------
# Long-format frame: one row per prediction, tagged with its split.
actual_all = np.concatenate([y_train_true, y_test_true])
predicted_all = np.concatenate([y_train_pred, y_test_pred])
split_labels = ['Train'] * len(y_train_true) + ['Test'] * len(y_test_true)
combined_df = pd.DataFrame({
    'Actual': actual_all,
    'Predicted': predicted_all,
    'Set': split_labels,
})

plt.figure(figsize=(10, 6), dpi=300)
sns.set(style="whitegrid", context="notebook", font_scale=1.2)
sns.scatterplot(data=combined_df, x='Actual', y='Predicted', hue='Set', alpha=0.6, s=50, edgecolor='w')
# Diagonal y = x over the observed price range.
min_val, max_val = combined_df['Actual'].min(), combined_df['Actual'].max()
diagonal = [min_val, max_val]
plt.plot(diagonal, diagonal, 'k--', lw=1.5, label='Ideal Fit')
plt.xlabel("Actual Price")
plt.ylabel("Predicted Price")
plt.title("Decision Tree: Predicted vs Actual Prices (Train & Test)")
plt.legend(title='Dataset')
plt.tight_layout()
plt.show()

# ---------------------
# 9. Residuals Distribution (Test Set)
# ---------------------
# Positive residual = the model under-predicted the price.
residuals_test = y_test_true - y_test_pred
plt.figure(figsize=(10, 5), dpi=300)
hist_style = {'bins': 40, 'kde': True, 'color': 'crimson', 'edgecolor': 'black', 'linewidth': 0.6}
sns.histplot(residuals_test, **hist_style)
plt.axvline(0, color='k', linestyle='--', lw=1.5, label='Zero Error')
plt.xlabel("Residual (Actual - Predicted)")
plt.ylabel("Frequency")
plt.title("Decision Tree: Residuals Distribution (Test Set)")
plt.legend()
plt.tight_layout()
plt.show()

# ---------------------
# 10. Decision Tree Top 3 Levels
# ---------------------
# Column names in the order the ColumnTransformer emits them: the passthrough
# numeric columns first, then the one-hot columns.
fitted_encoder = tree_pipeline.named_steps['preprocess'].named_transformers_['cat']
encoded_names = list(fitted_encoder.get_feature_names_out())
tree_feature_names = num_features + encoded_names

regressor = tree_pipeline.named_steps['regressor']
plt.figure(figsize=(24, 12))
plot_tree(
    regressor,
    feature_names=tree_feature_names,
    filled=True,
    rounded=True,
    max_depth=3,  # draw only the top of the tree
    fontsize=14,
)
plt.title("Decision Tree (Top 3 Levels)", fontsize=16, weight='bold')
plt.show()


# **AdaBoost**

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

# --- Preprocessing ---
# Reuses X_train / X_test / y_train / y_test as left by the preceding cells.
cat_features = X_train.select_dtypes(include='object').columns.tolist()
num_features = X_train.select_dtypes(include=[np.number]).columns.tolist()

# Numeric columns pass through unchanged; text columns are one-hot encoded.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', 'passthrough', num_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), cat_features)
    ]
)

# --- AdaBoost with tuned Decision Tree as base ---
base_estimator = DecisionTreeRegressor(
    max_depth=6,
    min_samples_leaf=5,
    random_state=42
)

ada_pipeline = Pipeline([
    ('preprocess', preprocessor),
    ('regressor', AdaBoostRegressor(
        estimator=base_estimator,
        n_estimators=2000,
        learning_rate=0.05,
        random_state=42
    ))
])

# --- Fit model ---
ada_pipeline.fit(X_train, y_train)

# --- Predictions ---
# The target is log1p(price); expm1 maps predictions and targets back to prices.
y_train_pred = np.expm1(ada_pipeline.predict(X_train))
y_train_true = np.expm1(y_train)

y_test_pred = np.expm1(ada_pipeline.predict(X_test))
y_test_true = np.expm1(y_test)

# --- Metrics (Test set, price scale) ---
r2 = r2_score(y_test_true, y_test_pred)
n, p = X_test.shape  # rows and raw (pre-encoding) feature columns
adj_r2 = 1 - (1 - r2) * (n - 1) / (n - p - 1)
rmse = np.sqrt(mean_squared_error(y_test_true, y_test_pred))
mae = mean_absolute_error(y_test_true, y_test_pred)
mape = np.mean(np.abs((y_test_true - y_test_pred) / y_test_true)) * 100

print("\nAdaBoost Regressor Performance (Test Set):")
print(f"R² Score    : {r2:.4f}")
print(f"Adjusted R² : {adj_r2:.4f}")
print(f"RMSE        : {rmse:,.2f}")
print(f"MAE         : {mae:,.2f}")
print(f"MAPE (%)    : {mape:.2f}%")

# --- Long-format frame: one row per prediction, tagged with its split ---
actual_all = np.concatenate([y_train_true, y_test_true])
predicted_all = np.concatenate([y_train_pred, y_test_pred])
split_labels = ['Train'] * len(y_train_true) + ['Test'] * len(y_test_true)
combined_df = pd.DataFrame({
    'Actual': actual_all,
    'Predicted': predicted_all,
    'Set': split_labels,
})

# --- Scatter plot: Train & Test ---
plt.figure(figsize=(10, 6), dpi=300)
sns.set(style="whitegrid", context="notebook", font_scale=1.2)
sns.scatterplot(data=combined_df, x='Actual', y='Predicted', hue='Set', alpha=0.6, s=50, edgecolor='w')
# Diagonal y = x over the observed price range.
min_val, max_val = combined_df['Actual'].min(), combined_df['Actual'].max()
diagonal = [min_val, max_val]
plt.plot(diagonal, diagonal, 'k--', lw=1.5, label='Ideal Fit')
plt.xlabel("Actual Price")
plt.ylabel("Predicted Price")
plt.title("AdaBoost: Predicted vs Actual Prices (Train & Test)")
plt.legend(title='Dataset')
plt.tight_layout()
plt.show()

# --- Residuals distribution (Test set only) ---
# Positive residual = the model under-predicted the price.
residuals_test = y_test_true - y_test_pred
plt.figure(figsize=(10, 5), dpi=300)
hist_style = {'bins': 40, 'kde': True, 'color': 'mediumseagreen', 'edgecolor': 'black', 'linewidth': 0.6}
sns.histplot(residuals_test, **hist_style)
plt.axvline(0, color='k', linestyle='--', lw=1.5, label='Zero Error')
plt.xlabel("Residual (Actual - Predicted)")
plt.ylabel("Frequency")
plt.title("AdaBoost: Residuals Distribution (Test Set)")
plt.legend()
plt.tight_layout()
plt.show()


# **Voting Regressor**

In [None]:
import numpy as np
from sklearn.ensemble import VotingRegressor, RandomForestRegressor, GradientBoostingRegressor
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.model_selection import KFold
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# ---------------------
# Preprocessing
# ---------------------
# Reuses X_train as left by the preceding cells. Numeric columns pass through;
# text columns are one-hot encoded into a dense array.
cat_features = X_train.select_dtypes(include='object').columns.tolist()
num_features = X_train.select_dtypes(include=[np.number]).columns.tolist()

dense_onehot = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
preprocessor = ColumnTransformer(
    transformers=[
        ('num', 'passthrough', num_features),
        ('cat', dense_onehot, cat_features),
    ]
)

# ---------------------
# Base Models
# ---------------------
rf_settings = {
    'n_estimators': 500,
    'max_depth': 18,
    'min_samples_leaf': 2,
    'random_state': 42,
    'n_jobs': -1,
}
rf = RandomForestRegressor(**rf_settings)

gbr_settings = {
    'n_estimators': 700,
    'learning_rate': 0.03,
    'max_depth': 6,
    'random_state': 42,
}
gbr = GradientBoostingRegressor(**gbr_settings)

# ---------------------
# Voting Regressor inside a Pipeline
# ---------------------
# The ensemble combines the random forest and the gradient-boosting model.
base_models = [('rf', rf), ('gbr', gbr)]
voting_reg = VotingRegressor(estimators=base_models, n_jobs=-1)

pipeline_steps = [
    ('preprocess', preprocessor),
    ('voting', voting_reg),
]
model_pipeline = Pipeline(pipeline_steps)

# ---------------------
# 5-Fold Cross-Validation
# ---------------------
# Each fold refits the whole pipeline on 4/5 of the training rows and scores
# the held-out 1/5 on the price scale (targets are log1p(price)).
kf = KFold(n_splits=5, shuffle=True, random_state=42)
r2_scores = []
adj_r2_scores = []
rmse_scores = []
mae_scores = []
mape_scores = []

for train_idx, val_idx in kf.split(X_train):
    X_tr, y_tr = X_train.iloc[train_idx], y_train.iloc[train_idx]
    X_val, y_val = X_train.iloc[val_idx], y_train.iloc[val_idx]

    model_pipeline.fit(X_tr, y_tr)
    y_val_pred_log = model_pipeline.predict(X_val)
    y_val_pred = np.expm1(y_val_pred_log)
    y_val_true = np.expm1(y_val)

    r2 = r2_score(y_val_true, y_val_pred)
    n, p = X_val.shape  # fold rows and raw feature columns
    adj_r2 = 1 - (1 - r2) * (n - 1) / (n - p - 1)
    rmse = np.sqrt(mean_squared_error(y_val_true, y_val_pred))
    mae = mean_absolute_error(y_val_true, y_val_pred)
    relative_error = (y_val_true - y_val_pred) / y_val_true
    mape = np.mean(np.abs(relative_error)) * 100

    # Record this fold's scores in the matching per-metric list.
    fold_results = (
        (r2_scores, r2),
        (adj_r2_scores, adj_r2),
        (rmse_scores, rmse),
        (mae_scores, mae),
        (mape_scores, mape),
    )
    for score_list, fold_value in fold_results:
        score_list.append(fold_value)

# ---------------------
# Train final model on full training set and evaluate on test set
# ---------------------
model_pipeline.fit(X_train, y_train)
y_test_pred_log = model_pipeline.predict(X_test)
# The target is log1p(price); expm1 maps predictions and targets back to prices.
y_test_pred = np.expm1(y_test_pred_log)
y_test_true = np.expm1(y_test)

r2_test = r2_score(y_test_true, y_test_pred)
n_test, p_test = X_test.shape  # rows and raw (pre-encoding) feature columns
adj_r2_test = 1 - (1 - r2_test) * (n_test - 1) / (n_test - p_test - 1)
rmse_test = np.sqrt(mean_squared_error(y_test_true, y_test_pred))
mae_test = mean_absolute_error(y_test_true, y_test_pred)
mape_test = np.mean(np.abs((y_test_true - y_test_pred) / y_test_true)) * 100

# ---------------------
# Single Accuracy Table
# ---------------------
# One row per metric; the CV column is the mean over the five folds collected
# in the *_scores lists.
results = pd.DataFrame({
    "Metric": ["R² Score", "Adjusted R²", "RMSE", "MAE", "MAPE (%)"],
    "5-Fold CV (Mean)": [
        np.mean(r2_scores),
        np.mean(adj_r2_scores),
        np.mean(rmse_scores),
        np.mean(mae_scores),
        np.mean(mape_scores)
    ],
    "Test Set": [
        r2_test,
        adj_r2_test,
        rmse_test,
        mae_test,
        mape_test
    ]
})

print("\nVoting Regressor Performance Summary:")
# Values below 1 in magnitude (the R² scores) get four decimals, larger ones two.
print(results.to_string(index=False, float_format=lambda x: f'{x:,.4f}' if abs(x) < 1 else f'{x:,.2f}'))

# ---------------------
# Scatter plot: Train vs Test predictions
# ---------------------
# Training-set predictions of the final pipeline, mapped back to prices.
y_train_pred_log = model_pipeline.predict(X_train)
y_train_pred = np.expm1(y_train_pred_log)
y_train_true = np.expm1(y_train)

plt.figure(figsize=(7, 6))
point_sets = (
    (y_train_true, y_train_pred, 'Train'),
    (y_test_true, y_test_pred, 'Test'),
)
for actual, predicted, split_name in point_sets:
    sns.scatterplot(x=actual, y=predicted, alpha=0.6, label=split_name)

# Diagonal y = x spanning the full range of actual prices in both splits.
axis_low = min(y_train_true.min(), y_test_true.min())
axis_high = max(y_train_true.max(), y_test_true.max())
plt.plot([axis_low, axis_high],
         [axis_low, axis_high],
         'r--', lw=2)
plt.xlabel("Actual Values")
plt.ylabel("Predicted Values")
plt.title("Voting Regressor: Train vs Test Predictions")
plt.legend()
plt.tight_layout()
plt.show()

# ---------- Stacking Regressor: imports ----------
import numpy as np
import pandas as pd
import inspect
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import RidgeCV
from sklearn.ensemble import StackingRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from sklearn.tree import DecisionTreeRegressor

# ---------- NOTE ----------
# This section assumes y_train and y_test are already on the scale the model should be trained on.
# Evaluation is expected to back-transform with np.expm1(y_test) and np.expm1(y_pred_log),
# so the model is trained on whatever y_train currently contains.
# No log transform (np.log1p) is applied to the targets inside this section.

# ---------- Helper: version-tolerant OneHotEncoder factory ----------
def make_onehot(**kwargs):
    """
    Create a OneHotEncoder that returns dense output on any sklearn version.

    Any ``sparse`` / ``sparse_output`` value supplied by the caller is
    discarded. The constructor signature is then inspected:
    ``sparse_output=False`` is passed when that parameter exists, otherwise
    ``sparse=False`` when that one exists, otherwise neither flag is passed.
    All remaining keyword arguments are forwarded unchanged.
    """
    accepted = inspect.signature(OneHotEncoder).parameters
    density_flags = ('sparse_output', 'sparse')  # checked in this order

    # Start from the caller's options minus both density flags.
    options = {key: value for key, value in kwargs.items() if key not in density_flags}

    # Re-add the first flag this sklearn version understands.
    for flag in density_flags:
        if flag in accepted:
            options[flag] = False
            break
    return OneHotEncoder(**options)

# ---------- Helper: ensure DataFrame ----------
def ensure_dataframe(X, prefix="feat"):
    """
    Return ``X`` as a new pandas DataFrame.

    A DataFrame input is copied. A NumPy array input is copied and wrapped in
    a DataFrame whose columns are named ``<prefix>_0``, ``<prefix>_1``, ...
    (one per column of the array). Any other type raises ``ValueError``.
    """
    if isinstance(X, pd.DataFrame):
        return X.copy()
    if not isinstance(X, np.ndarray):
        raise ValueError("X must be a pandas DataFrame or NumPy ndarray")

    column_count = X.shape[1]
    generated_names = [f"{prefix}_{position}" for position in range(column_count)]
    return pd.DataFrame(X.copy(), columns=generated_names)

# ---------- Ensure DataFrame inputs ----------
# Both splits become independent DataFrame copies; raw arrays get generic
# column names f_0, f_1, ...
X_train, X_test = (ensure_dataframe(split, prefix="f") for split in (X_train, X_test))

# ---------- 1. Remove Duplicate Columns & Align Train/Test ----------
def drop_duplicate_columns(df):
    """Return ``df`` restricted to the first occurrence of each column name."""
    repeated_name = df.columns.duplicated()  # True for 2nd, 3rd, ... occurrence
    keep_mask = ~repeated_name
    return df.loc[:, keep_mask]

# Keep only the first occurrence of each column label in both splits.
X_train = X_train.pipe(drop_duplicate_columns)
X_test = X_test.pipe(drop_duplicate_columns)

# Give the test split exactly the training columns, in training order:
# columns missing from test are added as NaN, columns only in test are dropped.
X_test = X_test.reindex(X_train.columns, axis="columns", fill_value=np.nan)

# ---------- 2) Preprocessing with Imputation ----------
# Column groups are derived from the dtypes of the training frame.
# NOTE(review): a column whose dtype is neither object nor numeric (e.g. category,
# bool, datetime) lands in neither list and is discarded by remainder='drop' below;
# confirm that is intended for this dataset.
num_features = X_train.select_dtypes(include=[np.number]).columns.tolist()
cat_features = X_train.select_dtypes(include='object').columns.tolist()

# Numeric columns: median imputation followed by standardisation.
numeric_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Object columns: most-frequent imputation followed by dense one-hot encoding;
# categories unseen during fit are ignored at transform time.
categorical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', make_onehot(handle_unknown='ignore')),
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_pipeline, num_features),
        ('cat', categorical_pipeline, cat_features),
    ],
    remainder='drop',  # columns in neither group are not passed to the model
)

# ---------- 3) Base Learners ----------
# Three tree-based regressors, all seeded with 42.
cat_model = CatBoostRegressor(
    iterations=1200,
    learning_rate=0.05,
    depth=10,
    l2_leaf_reg=3,
    bagging_temperature=0.3,
    random_seed=42,
    verbose=0,  # silence per-iteration training output
)

xgb_model = XGBRegressor(
    n_estimators=800,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_lambda=1.0,
    tree_method="hist",
    n_jobs=-1,
    random_state=42,
)

tree_model = DecisionTreeRegressor(
    max_depth=12,
    min_samples_leaf=2,
    random_state=42,
)

# ---------- 4) Stacking Regressor with RidgeCV meta-learner ----------
base_estimators = [
    ('cat', cat_model),
    ('xgb', xgb_model),
    ('tree', tree_model),
]

# RidgeCV chooses its regularisation strength from 13 log-spaced values in [1e-3, 1e3].
meta_learner = RidgeCV(alphas=np.logspace(-3, 3, 13))

# passthrough=True gives the meta-learner the stacker's input features
# in addition to the base learners' predictions.
stacking_reg = StackingRegressor(
    estimators=base_estimators,
    final_estimator=meta_learner,
    cv=5,
    n_jobs=-1,
    passthrough=True,
)

# Preprocessing lives inside the pipeline, so it is fitted together with the model.
stack_pipeline = Pipeline(steps=[
    ('preprocess', preprocessor),
    ('stack', stacking_reg),
])

# ---------- 5) Train & Evaluate ----------
# The pipeline is fitted on y_train exactly as provided; no target transform happens here.
stack_pipeline.fit(X_train, y_train)

y_pred_log = stack_pipeline.predict(X_test)

# Back-transform targets and predictions with expm1.
# NOTE(review): this assumes y_train / y_test were produced with np.log1p; that
# transform is not visible in this cell, so confirm it upstream.
# Both are coerced to flat float arrays explicitly (instead of a try/except fallback)
# so the metric arithmetic below always sees two aligned 1-D arrays, and a target
# that cannot be converted to float fails here with a clear error.
y_true_price = np.expm1(np.asarray(y_test, dtype=float).ravel())
y_pred_price = np.expm1(np.asarray(y_pred_log, dtype=float).ravel())

# ---------- 6) Metrics ----------
# All metrics are computed on the back-transformed (expm1) values.
r2 = r2_score(y_true_price, y_pred_price)

# Adjusted R² uses n = test rows and p = columns of X_test (the raw input columns,
# not the one-hot-expanded feature count); it is undefined unless n > p + 1.
n, p = X_test.shape
adj_r2 = 1 - (1 - r2) * (n - 1) / (n - p - 1) if (n - p - 1) > 0 else np.nan

rmse = np.sqrt(mean_squared_error(y_true_price, y_pred_price))
mae = mean_absolute_error(y_true_price, y_pred_price)

# protect against division by zero in MAPE
_eps = 1e-9
mape = np.mean(np.abs((y_true_price - y_pred_price) / (np.maximum(np.abs(y_true_price), _eps)))) * 100

# Adjusted R² is rounded like R² when it is defined.
adj_r2_text = f"{adj_r2:.6f}" if not np.isnan(adj_r2) else "N/A (n-p-1<=0)"

print("\n✅ Stacking Regressor Performance (Cleaned + Imputed):")
print(f"R² Score    : {r2:.6f}")
print(f"Adjusted R² : {adj_r2_text}")
print(f"RMSE        : {rmse:,.4f}")
print(f"MAE         : {mae:,.4f}")
print(f"MAPE (%)    : {mape:.4f}%")
# ---------------------
# Residual Distribution Plot (Test Set)
# ---------------------
# NOTE(review): y_test_true and y_test_pred are not defined in this part of the
# notebook; presumably they come from the Voting Regressor evaluation cell — confirm.
residuals = y_test_true - y_test_pred

fig, ax = plt.subplots(figsize=(7, 6))
sns.histplot(residuals, bins=50, kde=True, ax=ax)
ax.axvline(0, color='red', linestyle='--')  # zero-error reference line
ax.set_xlabel("Residuals (Actual - Predicted)")
ax.set_ylabel("Frequency")
ax.set_title("Voting Regressor: Residual Distribution (Test Set)")
fig.tight_layout()
plt.show()


# **Stacking Regressor**

In [None]:
import numpy as np
import pandas as pd
import inspect
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import RidgeCV
from sklearn.ensemble import StackingRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from sklearn.tree import DecisionTreeRegressor

# ---------- NOTE ON TARGET SCALE ----------
# This cell fits the model on y_train exactly as provided; it does not transform the target itself.
# The evaluation step applies np.expm1 to both y_test and the predictions, so y_train and y_test
# are expected to already be on the log1p(price) scale.
# If they hold raw prices instead, apply np.log1p to the targets before fitting (or drop the expm1 step).

# ---------- Helper: robust OneHotEncoder factory ----------
def make_onehot(**kwargs):
    """
    Create a dense-output OneHotEncoder regardless of the installed sklearn version.

    The encoder's constructor signature is inspected: if it accepts
    ``sparse_output`` that flag is forced to False, otherwise ``sparse`` is
    forced to False if accepted. Any ``sparse`` / ``sparse_output`` value passed
    by the caller is discarded; all other keyword arguments are forwarded as-is.
    """
    accepted = inspect.signature(OneHotEncoder).parameters
    density_flags = ('sparse_output', 'sparse')  # checked in this order

    # Forward everything except the density flags, which are decided here.
    params = {key: value for key, value in kwargs.items() if key not in density_flags}

    dense_flag = next((flag for flag in density_flags if flag in accepted), None)
    if dense_flag is not None:
        params[dense_flag] = False
    return OneHotEncoder(**params)

# ---------- Helper: ensure DataFrame ----------
def ensure_dataframe(X, prefix="feat"):
    """
    Return X as a pandas DataFrame that is independent of the input.

    Parameters
    ----------
    X : pandas.DataFrame or numpy.ndarray
        A DataFrame is returned as a copy. A 2-D ndarray is copied and wrapped
        in a DataFrame whose columns are named ``{prefix}_0``, ``{prefix}_1``, ...
    prefix : str, default "feat"
        Column-name prefix used only for ndarray input.

    Returns
    -------
    pandas.DataFrame

    Raises
    ------
    ValueError
        If X is neither a DataFrame nor an ndarray, or is an ndarray that is
        not 2-dimensional.
    """
    if isinstance(X, pd.DataFrame):
        return X.copy()
    if isinstance(X, np.ndarray):
        # Reject 0-D / 1-D / 3-D+ arrays up front: X.shape[1] would otherwise fail
        # with an unrelated IndexError for arrays with fewer than two axes.
        if X.ndim != 2:
            raise ValueError(
                f"X must be a 2-dimensional ndarray, got array with shape {X.shape}"
            )
        cols = [f"{prefix}_{i}" for i in range(X.shape[1])]
        return pd.DataFrame(X.copy(), columns=cols)
    raise ValueError("X must be a pandas DataFrame or NumPy ndarray")

# ---------- Ensure DataFrame inputs ----------
# Both splits go through ensure_dataframe with the same "f" prefix, so ndarray
# inputs end up with matching generic column names in train and test.
X_train, X_test = (ensure_dataframe(split, prefix="f") for split in (X_train, X_test))

# ---------- 1) Remove Duplicate Columns & Align Train/Test ----------
def drop_duplicate_columns(df):
    """Return df restricted to the first occurrence of each column label."""
    first_occurrence = ~df.columns.duplicated()
    return df.loc[:, first_occurrence]

# Keep only the first occurrence of each column label in both splits.
X_train = X_train.pipe(drop_duplicate_columns)
X_test = X_test.pipe(drop_duplicate_columns)

# Give the test split exactly the training columns, in training order:
# columns missing from test are added as NaN, columns only in test are dropped.
X_test = X_test.reindex(X_train.columns, axis="columns", fill_value=np.nan)

# ---------- 2) Preprocessing with Imputation ----------
# Column groups are derived from the dtypes of the training frame.
# NOTE(review): a column whose dtype is neither object nor numeric (e.g. category,
# bool, datetime) lands in neither list and is discarded by remainder='drop' below;
# confirm that is intended for this dataset.
num_features = X_train.select_dtypes(include=[np.number]).columns.tolist()
cat_features = X_train.select_dtypes(include='object').columns.tolist()

# Numeric columns: median imputation followed by standardisation.
numeric_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Object columns: most-frequent imputation followed by dense one-hot encoding;
# categories unseen during fit are ignored at transform time.
categorical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', make_onehot(handle_unknown='ignore')),
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_pipeline, num_features),
        ('cat', categorical_pipeline, cat_features),
    ],
    remainder='drop',  # columns in neither group are not passed to the model
)

# ---------- 3) Base Learners ----------
# Three tree-based regressors, all seeded with 42.
cat_model = CatBoostRegressor(
    iterations=1200,
    learning_rate=0.05,
    depth=10,
    l2_leaf_reg=3,
    bagging_temperature=0.3,
    random_seed=42,
    verbose=0,  # silence per-iteration training output
)

xgb_model = XGBRegressor(
    n_estimators=800,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_lambda=1.0,
    tree_method="hist",
    n_jobs=-1,
    random_state=42,
)

tree_model = DecisionTreeRegressor(
    max_depth=12,
    min_samples_leaf=2,
    random_state=42,
)

# ---------- 4) Stacking Regressor with RidgeCV meta-learner ----------
base_estimators = [
    ('cat', cat_model),
    ('xgb', xgb_model),
    ('tree', tree_model),
]

# RidgeCV chooses its regularisation strength from 13 log-spaced values in [1e-3, 1e3].
meta_learner = RidgeCV(alphas=np.logspace(-3, 3, 13))

# passthrough=True gives the meta-learner the stacker's input features
# in addition to the base learners' predictions.
stacking_reg = StackingRegressor(
    estimators=base_estimators,
    final_estimator=meta_learner,
    cv=5,
    n_jobs=-1,
    passthrough=True,
)

# Preprocessing lives inside the pipeline, so it is fitted together with the model.
stack_pipeline = Pipeline(steps=[
    ('preprocess', preprocessor),
    ('stack', stacking_reg),
])

# ---------- 5) Train & Evaluate ----------
# The pipeline is fitted on y_train exactly as provided; no target transform happens here.
stack_pipeline.fit(X_train, y_train)

y_pred_log = stack_pipeline.predict(X_test)

# Back-transform targets and predictions with expm1.
# NOTE(review): this assumes y_train / y_test were produced with np.log1p; that
# transform is not visible in this cell, so confirm it upstream.
# Both are coerced to flat float arrays explicitly (instead of a try/except fallback)
# so the metric arithmetic below always sees two aligned 1-D arrays, and a target
# that cannot be converted to float fails here with a clear error.
y_true_price = np.expm1(np.asarray(y_test, dtype=float).ravel())
y_pred_price = np.expm1(np.asarray(y_pred_log, dtype=float).ravel())

# ---------- 6) Metrics ----------
# All metrics are computed on the back-transformed (expm1) values.
r2 = r2_score(y_true_price, y_pred_price)

# Adjusted R² uses n = test rows and p = columns of X_test (the raw input columns,
# not the one-hot-expanded feature count); it is undefined unless n > p + 1.
n, p = X_test.shape
adj_r2 = 1 - (1 - r2) * (n - 1) / (n - p - 1) if (n - p - 1) > 0 else np.nan

rmse = np.sqrt(mean_squared_error(y_true_price, y_pred_price))
mae = mean_absolute_error(y_true_price, y_pred_price)

# protect against division by zero in MAPE
_eps = 1e-9
mape = np.mean(np.abs((y_true_price - y_pred_price) / (np.maximum(np.abs(y_true_price), _eps)))) * 100

# Adjusted R² is rounded like R² when it is defined.
adj_r2_text = f"{adj_r2:.6f}" if not np.isnan(adj_r2) else "N/A (n-p-1<=0)"

print("\n✅ Stacking Regressor Performance (Cleaned + Imputed):")
print(f"R² Score    : {r2:.6f}")
print(f"Adjusted R² : {adj_r2_text}")
print(f"RMSE        : {rmse:,.4f}")
print(f"MAE         : {mae:,.4f}")
print(f"MAPE (%)    : {mape:.4f}%")


In [None]:
import numpy as np, pandas as pd, sklearn, xgboost, catboost, lightgbm

print("Library Versions:")
# Labels carry their own padding so every printed line is the same as before.
print("\n".join(
    f"{label}: {module.__version__}"
    for label, module in (
        ("NumPy      ", np),
        ("Pandas     ", pd),
        ("Scikit-Learn", sklearn),
        ("XGBoost    ", xgboost),
        ("CatBoost   ", catboost),
        ("LightGBM   ", lightgbm),
    )
))
