In [10]:

import numpy as np
import pandas as pd
import scipy.stats as stats
from sklearn.datasets import fetch_california_housing, make_classification
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, RandomizedSearchCV, ShuffleSplit, GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor, export_graphviz


In [11]:
# Synthetic classification problem: 1000 samples described by two features,
# both informative (no redundant ones), one cluster per class. The fixed
# random_state makes the generated data reproducible.
X, y = make_classification(
    n_samples=1000,
    n_features=2, n_informative=2, n_redundant=0,
    n_clusters_per_class=1,
    random_state=60,
)

MNIST - Shape de X_train: (60000, 784), Shape de X_test: (10000, 784)


In [None]:
# Hold out 20% of the synthetic samples for the final evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=60,
)


In [13]:
# Classification pipeline: standardise -> PCA -> decision tree.
# make_pipeline names each step after its lower-cased class name, which is
# what the "<step>__<param>" keys of the search grid refer to.
pipeline_cl = make_pipeline(
    StandardScaler(),
    PCA(),
    DecisionTreeClassifier(random_state=60),
)

In [14]:
# Candidate values sampled by the randomized search of the classification
# pipeline. Keys follow the "<pipeline step>__<parameter>" convention.
param_grid = {
    # 60 evenly spaced fractions strictly between 0 and 1.
    'pca__n_components': np.linspace(1e-5, 0.9999, num=60),
    # Integer ranges: 1..30, 2..51 and 1..50 (both ends included).
    'decisiontreeclassifier__max_depth': np.arange(1, 31, dtype=int),
    'decisiontreeclassifier__min_samples_split': np.arange(2, 52, dtype=int),
    'decisiontreeclassifier__min_samples_leaf': np.arange(1, 51, dtype=int),
    'decisiontreeclassifier__max_features': ['sqrt', 'log2', None],
    'decisiontreeclassifier__criterion': ['gini', 'entropy'],
}


In [None]:
# Randomized hyper-parameter search: 100 sampled candidates, run on 7 workers,
# with a fixed seed so the sampled candidates are reproducible.
grid_cl = RandomizedSearchCV(
    estimator=pipeline_cl,
    param_distributions=param_grid,
    n_iter=100,
    n_jobs=7,
    random_state=60,
)
grid_cl.fit(X_train, y_train)

In [None]:
# Evaluate the best pipeline found by the search on the held-out split.
# NOTE(review): the metric is a mean squared error computed on class labels;
# for 0/1 labels that equals the misclassification rate — confirm this is the
# intended metric.
pred_cl = grid_cl.predict(X_test)
print("Best parameter: ", grid_cl.best_params_)
print("MSE: ", mean_squared_error(y_true=y_test, y_pred=pred_cl))

In [None]:
# Keep a handle on the winning pipeline and on its fitted tree step.
best_model = grid_cl.best_estimator_
tree_clf = best_model["decisiontreeclassifier"]  # same object as named_steps[...]


In [None]:

# Load the California housing regression dataset as pandas objects
# (as_frame=True). fetch_california_housing comes from sklearn.datasets and
# must be imported with the other sklearn imports at the top of the notebook.
# NOTE(review): the first call downloads the data, so it needs network access.
data_housing = fetch_california_housing(as_frame=True)

feature_names_housing = data_housing.feature_names
X_housing = data_housing.data          # feature table
target_housing = data_housing.target   # regression target

print(feature_names_housing)


In [None]:
# Hold out 30% of the housing rows; from here on X_train/X_test/y_train/y_test
# refer to the regression data, not to the classification data.
X_train, X_test, y_train, y_test = train_test_split(
    X_housing,
    target_housing,
    test_size=0.3,
    random_state=60,
)

In [None]:
# Regression pipeline: standardise -> PCA -> decision-tree regressor.
pipeline_rg = make_pipeline(
    StandardScaler(),
    PCA(),
    DecisionTreeRegressor(random_state=60),
)

In [None]:
# Exhaustive grid for the regression-tree step. Each option name is prefixed
# with the pipeline step name to form the "<step>__<parameter>" key.
param_grid = {
    f"decisiontreeregressor__{option}": candidates
    for option, candidates in {
        "criterion": ['squared_error', 'absolute_error'],
        "splitter": ['best', 'random'],
        "max_depth": [3, 5, 10, None],
        "min_samples_split": [5, 10, 20, 30],
        "min_samples_leaf": [5, 10, 20, 30],
        "max_features": [0.2, 0.4, 0.6, None],
        "max_leaf_nodes": [None, 20, 50, 100],
        "min_impurity_decrease": [0.0, 0.01, 0.02],
    }.items()
}

In [None]:
# Exhaustive 3-fold grid search over the regression pipeline, on 7 workers.
# fit() has to be called on the search object itself: best_params_ is a plain
# dict that only exists once the search has been fitted, so it has no fit().
grid_rg = GridSearchCV(pipeline_rg, param_grid, n_jobs=7, verbose=3, cv=3)
grid_rg.fit(X_train, y_train)

In [None]:
# Predict on the held-out rows through the fitted search object, which
# delegates to the best pipeline it found. best_params_ is only a dict of the
# winning settings and cannot predict.
pred_rg = grid_rg.predict(X_test)
print("Best parameter: ", grid_rg.best_params_)
print("MSE: ", mean_squared_error(y_test, pred_rg))

In [None]:
# Reduced grid for the regression tree (random splitter only, fewer
# candidates per option). Keyword names become the "<step>__<parameter>" keys.
param_grid = dict(
    decisiontreeregressor__criterion=['squared_error', 'absolute_error'],
    decisiontreeregressor__splitter=['random'],
    decisiontreeregressor__max_depth=[3, 5, 10, None],
    decisiontreeregressor__min_samples_split=[5, 10, 20],
    decisiontreeregressor__min_samples_leaf=[5, 10, 20],
    decisiontreeregressor__max_features=[0.2, 0.6, None],
    decisiontreeregressor__max_leaf_nodes=[None, 20, 50],
    decisiontreeregressor__min_impurity_decrease=[0.0, 0.01, 0.02],
)

In [None]:
# Bagging-style ensemble: tune one regression-tree pipeline per random 70%
# subsample of the housing training set, and let every tuned model predict
# the SAME held-out X_test so that each row of `predictions` lines up with
# y_test.
split_s = ShuffleSplit(n_splits=4, test_size=0.3, random_state=60)

models = []
predictions = []

# The subsamples are drawn from X_train / y_train (pandas objects, hence
# .iloc), not from the classification array X. Only the "train" indices of
# each shuffle are used, and X_train / X_test / y_train / y_test are never
# reassigned, so later cells keep seeing the original hold-out split.
for train_index, _ in split_s.split(X_train):
    X_sub = X_train.iloc[train_index]
    y_sub = y_train.iloc[train_index]

    pipeline_rg_random = make_pipeline(
        StandardScaler(),
        # Never ask PCA for more components than there are features.
        PCA(n_components=min(X_sub.shape[1], 10)),
        DecisionTreeRegressor(random_state=60),
    )

    random_search = GridSearchCV(
        pipeline_rg_random,
        param_grid,
        n_jobs=7,
        scoring='neg_mean_squared_error',
        cv=3,
    )

    random_search.fit(X_sub, y_sub)
    pred = random_search.predict(X_test)
    models.append(random_search.best_estimator_)
    predictions.append(pred)



In [None]:
# Stack the per-model predictions, one row per model.
# NOTE(review): assumes every entry is a 1-D prediction vector for the same
# rows that y_test labels — confirm against the cell that fills `predictions`.
predictions = np.array(predictions)
print(predictions)

# Regression outputs are continuous, so the most frequent value (mode) is not
# a meaningful aggregate; average the models' predictions per sample instead.
mean_predictions = predictions.mean(axis=0)
mode_predictions = mean_predictions  # legacy name, kept for code that still reads it

mse = mean_squared_error(y_test, mean_predictions)
print("Mean Squared Error (MSE):", mse)

In [None]:
# Search grid for the random-forest step. Each option name is prefixed with
# the pipeline step name to form the "<step>__<parameter>" key.
param_grid = {
    "randomforestregressor__" + option: candidates
    for option, candidates in [
        ("n_estimators", [100, 200]),
        ("max_depth", [3, 5, 10, None]),
        ("min_samples_split", [5, 10, 20]),
        ("min_samples_leaf", [5, 10, 20]),
        ("max_features", [0.2, 0.6, None]),
        ("max_leaf_nodes", [None, 20, 50]),
        ("min_impurity_decrease", [0.0, 0.01, 0.02]),
    ]
}


In [None]:
# Random-forest pipeline: standardise -> PCA -> forest. PCA keeps at most 5
# components, or fewer when X_train has fewer columns.
modelo_random = make_pipeline(
    StandardScaler(),
    PCA(n_components=min(5, X_train.shape[1])),
    RandomForestRegressor(random_state=60),
)

In [None]:
# Exhaustive 3-fold grid search over the forest pipeline, selecting by
# (negated) mean squared error, then prediction on the held-out rows.
random_search = GridSearchCV(
    estimator=modelo_random,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=3,
    n_jobs=7,
)

random_search.fit(X_train, y_train)
pred = random_search.predict(X_test)


In [None]:
# Held-out mean squared error of the tuned forest pipeline (cell output).
mean_squared_error(y_true=y_test, y_pred=pred)