In [1]:

import numpy as np
import pandas as pd
from sklearn.datasets import make_classification,  make_regression
from sklearn.tree import DecisionTreeClassifier,DecisionTreeRegressor, export_graphviz
from sklearn.model_selection import train_test_split, RandomizedSearchCV, ShuffleSplit, GridSearchCV
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
import scipy.stats as stats


In [2]:
X, y = make_classification(
    n_samples=1000,
    n_features=2,
    n_informative=2,
    n_redundant=0,
    n_clusters_per_class=1,
    random_state=60
)

In [3]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=60)


In [4]:
pipeline_cl = make_pipeline(StandardScaler(), PCA(), DecisionTreeClassifier(random_state=60))

In [5]:
param_grid = {
    'pca__n_components': np.linspace(0.00001, 0.9999, 60, dtype=float),
    'decisiontreeclassifier__max_depth': np.linspace(1, 30, 30, dtype=int),
    'decisiontreeclassifier__min_samples_split': np.linspace(2, 51, 50, dtype=int),
    'decisiontreeclassifier__min_samples_leaf': np.linspace(1, 50, 50, dtype=int),
    'decisiontreeclassifier__max_features': ['sqrt', 'log2', None],
    'decisiontreeclassifier__criterion': ['gini', 'entropy'],
}


In [6]:
grid_cl = RandomizedSearchCV(pipeline_cl, param_grid,n_iter = 100, n_jobs = 7, random_state= 60)
grid_cl.fit(X_train, y_train)

In [7]:
pred_cl = grid_cl.predict(X_test)
print("Best parameter: ", grid_cl.best_params_)
print("MSE: ", mean_squared_error(y_test,pred_cl))

Best parameter:  {'pca__n_components': 0.9829527118644068, 'decisiontreeclassifier__min_samples_split': 27, 'decisiontreeclassifier__min_samples_leaf': 14, 'decisiontreeclassifier__max_features': None, 'decisiontreeclassifier__max_depth': 11, 'decisiontreeclassifier__criterion': 'gini'}
MSE:  0.095


In [8]:
best_model = grid_cl.best_estimator_
tree_clf = best_model.named_steps['decisiontreeclassifier']


In [9]:
X, y = make_regression(
    n_samples=1000,
    n_features=5,
    noise=0.1,
    random_state=60
)

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state=60 )

In [11]:
pipeline_rg = make_pipeline(StandardScaler(), PCA(), DecisionTreeRegressor( random_state=60))

In [12]:
param_grid = {
    'decisiontreeregressor__criterion': ['squared_error', 'absolute_error'],
    'decisiontreeregressor__splitter': ['best', 'random'],
    'decisiontreeregressor__max_depth': [3, 5, 10, None],
    'decisiontreeregressor__min_samples_split': [5, 10, 20, 30],
    'decisiontreeregressor__min_samples_leaf': [5, 10, 20, 30],
    'decisiontreeregressor__max_features': [0.2, 0.4, 0.6, None],
    'decisiontreeregressor__max_leaf_nodes': [None, 20, 50, 100],
    'decisiontreeregressor__min_impurity_decrease': [0.0, 0.01, 0.02]
}

In [14]:
grid_rg = GridSearchCV(pipeline_rg, param_grid, n_jobs = 7, verbose = 3, cv = 3)
grid_rg.fit(X_train, y_train)

Fitting 3 folds for each of 12288 candidates, totalling 36864 fits


In [16]:
pred_rg = grid_rg.predict(X_test)
print("Best parameter: ", grid_rg.best_params_)
print("MSE: ", mean_squared_error(y_test ,pred_rg))

Best parameter:  {'decisiontreeregressor__criterion': 'squared_error', 'decisiontreeregressor__max_depth': 10, 'decisiontreeregressor__max_features': None, 'decisiontreeregressor__max_leaf_nodes': None, 'decisiontreeregressor__min_impurity_decrease': 0.0, 'decisiontreeregressor__min_samples_leaf': 5, 'decisiontreeregressor__min_samples_split': 5, 'decisiontreeregressor__splitter': 'best'}
MSE:  1860.5541997031464


In [40]:

X = pd.DataFrame(X, columns=[f"feature_{i}" for i in range(X.shape[1])])
y = pd.Series(y, name="target")

split_s = ShuffleSplit(n_splits=10000, test_size=0.3, random_state=60)

models = []
predictions = []


for train_index, test_index in split_s.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    pipeline_rg_random = DecisionTreeRegressor()
    
    
    
    pipeline_rg_random.fit(X_train, y_train)
    pred = pipeline_rg_random.predict(X_test)
    models.append(random_search)
    predictions.append(pred)




In [41]:
predictions = np.array(predictions)


mode_predictions, _ = stats.mode(predictions, axis=0, keepdims=True)
mode_predictions = mode_predictions.flatten()


mse = mean_squared_error(y_test, mode_predictions)
print("Mean Squared Error (MSE):", mse)

Mean Squared Error (MSE): 25039.57952640102


In [26]:
param_grid = {
    'randomforestregressor__n_estimators': [100, 200],
    'randomforestregressor__max_depth': [3, 5, 10, None],
    'randomforestregressor__min_samples_split': [5, 10, 20],
    'randomforestregressor__min_samples_leaf': [5, 10, 20],
    'randomforestregressor__max_features': [0.2, 0.6, None],
    'randomforestregressor__max_leaf_nodes': [None, 20, 50],
    'randomforestregressor__min_impurity_decrease': [0.0, 0.01, 0.02]
}


In [27]:
modelo_random = make_pipeline(
        StandardScaler(), 
        PCA(n_components=min(X_train.shape[1], 5)),  
        RandomForestRegressor(random_state=60)
    )

In [28]:
random_search = GridSearchCV(
        modelo_random, 
        param_grid, 
        n_jobs=7, 
        scoring='neg_mean_squared_error',
        cv=3
    )
    
random_search.fit(X_train, y_train)
pred = random_search.predict(X_test)


In [29]:
mean_squared_error(y_test, pred)

1663.8837364449014