### Import Necessary Libraries for Best Model Selection

In [1]:
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split , cross_val_score , RandomizedSearchCV
from sklearn.preprocessing import StandardScaler , OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error , mean_absolute_error 


In [2]:
# Fetch the diamonds dataset through seaborn and preview its first rows.
df = sns.load_dataset("diamonds")
df.head(n=5)

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


In [3]:
# Target vector: the diamond price. Feature matrix: every other column.
y = df['price']
X = df.drop(columns='price')

# Hold out 20% of the rows for testing; the fixed seed keeps the split stable
# from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [4]:
# Separate the feature columns by kind: the numeric columns are listed
# explicitly, and every remaining column of X is treated as categorical.
num_features = ['carat', 'depth', 'table', 'x', 'y', 'z']
cate_features = [col for col in X.columns if col not in num_features]

# Preprocessing: standardise the numeric columns and one-hot encode the
# categorical ones.
# handle_unknown='ignore' encodes a category that was absent during fit as an
# all-zero row instead of raising, so a CV fold or new data containing a level
# unseen in training does not abort the run.
preprocessing = ColumnTransformer([
    ('num', StandardScaler(), num_features),
    ('cate', OneHotEncoder(handle_unknown='ignore'), cate_features),
])

In [5]:
# Candidate regressors to compare, keyed by the name printed in the report.
# Every estimator that draws random numbers gets a fixed random_state so that
# Restart & Run All reproduces the same scores; LinearRegression and
# KNeighborsRegressor take no seed.
models = {
    'LinearRegression': LinearRegression(),
    'DecisionTreeRegressor': DecisionTreeRegressor(random_state=42),
    'KNeighborsRegressor': KNeighborsRegressor(),
    'GradientBoostingRegressor': GradientBoostingRegressor(random_state=42),
    'XGBRegressor': XGBRegressor(random_state=42),
    'RandomForestRegressor': RandomForestRegressor(random_state=42),
}

In [6]:

# Score each candidate with 3-fold cross-validation on the training split.
for name, model in models.items():
    # Preprocessing lives inside the pipeline, so it is fitted per fold
    # together with the model.
    candidate = Pipeline(steps=[
        ('preprocessing', preprocessing),
        ('model', model),
    ])
    fold_scores = cross_val_score(
        estimator=candidate,
        X=X_train,
        y=y_train,
        scoring='neg_mean_absolute_error',
        cv=3,
        n_jobs=-1,
    )
    # The scorer reports negated MAE (higher is better); flip the sign back.
    mae = -fold_scores.mean()
    print(f"{name}: Mean Absolute Error = {mae:.4f}")

LinearRegression: Mean Absolute Error = 742.8340
DecisionTreeRegressor: Mean Absolute Error = 377.2043
KNeighborsRegressor: Mean Absolute Error = 433.0429
GradientBoostingRegressor: Mean Absolute Error = 408.0043
XGBRegressor: Mean Absolute Error = 293.7107
RandomForestRegressor: Mean Absolute Error = 280.5791


### Best Model Proved to Be `RandomForestRegressor`

In [7]:
# Random forest tuning: randomized search over the forest's hyperparameters.
# Keys use the '<step>__<param>' form so they are routed to the pipeline's
# 'model' step.
rf_params_grid = {
    'model__n_estimators' : [100,200,300],
    'model__max_depth' : [None , 10, 20 ,30],
    'model__min_samples_split' : [2, 5, 7],
    'model__min_samples_leaf' : [1,3,5]
}

rf_pipeline = Pipeline([
    ('preprocessing', preprocessing),
    # Seed the forest itself: the search's random_state below only fixes which
    # parameter combinations are sampled, not the randomness inside each forest.
    ('model' , RandomForestRegressor(random_state=42))
])

rf_search = RandomizedSearchCV(
    rf_pipeline,
    rf_params_grid,
    cv=3,
    n_jobs=-1,
    scoring='neg_mean_absolute_error',
    random_state=42
)

rf_search.fit(X_train, y_train)

print("Best RF Params:", rf_search.best_params_)
# best_score_ is the negated MAE of the best candidate averaged over the CV
# folds, so flip the sign to report a plain MAE.
print("Best RF MAE:", -rf_search.best_score_)

# Evaluate on the held-out test set; with the default refit=True the search
# predicts with the best pipeline refitted on all of X_train.
y_pred_rf = rf_search.predict(X_test)
print("Test MAE (RF):", mean_absolute_error(y_test, y_pred_rf))

Best RF Params: {'model__n_estimators': 200, 'model__min_samples_split': 5, 'model__min_samples_leaf': 1, 'model__max_depth': None}
Best RF MAE: 279.33813889149036
Test MAE (RF): 270.0931284303242


### Before Tuning, the cross-validated MAE of `RandomForestRegressor` was 280.5791
### After Tuning, the cross-validated MAE of `RandomForestRegressor` is 279.3381 (MAE on the held-out test set: 270.0931)