**Regresão base de dados Microsoft**

**Guilherme Henrique Pereira Serafini - 2021.1.08.048**


**Vinícius Eduardo de Souza Honório - 2021.1.08.024**

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, LabelEncoder

# Carregar os dados
data = pd.read_csv('https://raw.githubusercontent.com/megaVE/LearningVerification2/main/Microsoft_Stock.csv')
columns = [ "Date", "Volume"] + [f'Feature_{i}' for i in range(1, 5)]
data.columns = columns

data = data.dropna()

X = data.drop(['Date', 'Volume'], axis=1)
y = data['Volume']

X = StandardScaler().fit_transform(X)
y = LabelEncoder().fit_transform(y)


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Dividir o conjunto de treinamento em 4 subconjuntos
X_train_subconjuntos = []
y_train_subconjuntos = []
for _ in range(4):
    X_subconjunto, _, y_subconjunto, _ = train_test_split(X_train, y_train, test_size=0.2, random_state=42)
    X_train_subconjuntos.append(X_subconjunto)
    y_train_subconjuntos.append(y_subconjunto)

#################################################################################################################
random_forest_params_variations = []
while len(random_forest_params_variations) < 15:
    new_params_forest = {
      'n_estimators': np.random.choice([1, 3, 5]),
      'max_depth': np.random.choice([None, 5, 10]),
      'min_samples_split': np.random.choice([2, 5, 10])
    }
    if new_params_forest not in random_forest_params_variations:
        random_forest_params_variations.append(new_params_forest)


random_forest_modelos = []
for variation in random_forest_params_variations:
    random_forest_model = RandomForestRegressor(**variation)
    random_forest_model1 = RandomForestRegressor(**variation)
    random_forest_model2 = RandomForestRegressor(**variation)
    random_forest_model3 = RandomForestRegressor(**variation)
    random_forest_model.fit(X_train_subconjuntos[0], y_train_subconjuntos[0])
    random_forest_model1.fit(X_train_subconjuntos[1], y_train_subconjuntos[1])
    random_forest_model2.fit(X_train_subconjuntos[2], y_train_subconjuntos[2])
    random_forest_model3.fit(X_train_subconjuntos[3], y_train_subconjuntos[3])
    random_forest_modelos.append(random_forest_model)
    random_forest_modelos.append(random_forest_model1)
    random_forest_modelos.append(random_forest_model2)
    random_forest_modelos.append(random_forest_model3)

xgb_params_variations = []
while len(xgb_params_variations) < 15:
    new_params_xgb = {
        'n_estimators': np.random.choice([50, 100, 200]),
        'learning_rate': np.random.choice([0.1, 0.01, 0.001]),
        'max_depth': np.random.choice([3, 5, 7, 9])
    }
    if new_params_xgb not in xgb_params_variations:
        xgb_params_variations.append(new_params_xgb)


xgb_modelos = []
for variation in xgb_params_variations:
    model = XGBRegressor(**variation)
    model1 = XGBRegressor(**variation)
    model2 = XGBRegressor(**variation)
    model3 = XGBRegressor(**variation)

    model.fit(X_train_subconjuntos[0], y_train_subconjuntos[0])
    model1.fit(X_train_subconjuntos[1], y_train_subconjuntos[1])
    model2.fit(X_train_subconjuntos[2], y_train_subconjuntos[2])
    model3.fit(X_train_subconjuntos[3], y_train_subconjuntos[3])

    xgb_modelos.append(model)
    xgb_modelos.append(model1)
    xgb_modelos.append(model2)
    xgb_modelos.append(model3)

svr_params_variations = []
while len(svr_params_variations) < 15:
    svr_params = {
        'C': np.random.choice([0.1, 1.0, 10.0]),
        'kernel': np.random.choice(['linear', 'rbf', 'poly']),
        'degree': np.random.choice([2, 3, 4]),
    }
    if svr_params not in svr_params_variations:
        svr_params_variations.append(svr_params)

svr_modelos = []
for variation in svr_params_variations:
    model = SVR(**variation)
    model1 = SVR(**variation)
    model2 = SVR(**variation)
    model3 = SVR(**variation)

    model.fit(X_train_subconjuntos[0], y_train_subconjuntos[0])
    model1.fit(X_train_subconjuntos[1], y_train_subconjuntos[1])
    model2.fit(X_train_subconjuntos[2], y_train_subconjuntos[2])
    model3.fit(X_train_subconjuntos[3], y_train_subconjuntos[3])

    svr_modelos.append(model)
    svr_modelos.append(model1)
    svr_modelos.append(model2)
    svr_modelos.append(model3)

decision_tree_params_variations = []
while len(decision_tree_params_variations) < 15:
    decision_tree_params = {
      'max_depth': np.random.choice([None, 5, 10]),
      'min_samples_split': np.random.choice([2, 5, 10]),
      'min_samples_leaf': np.random.choice([1, 2, 4])
    }
    if decision_tree_params not in decision_tree_params_variations:
        decision_tree_params_variations.append(decision_tree_params)

decision_tree_modelos = []
for variation in decision_tree_params_variations:
    model = DecisionTreeRegressor(**variation)
    model1 = DecisionTreeRegressor(**variation)
    model2 = DecisionTreeRegressor(**variation)
    model3 = DecisionTreeRegressor(**variation)

    model.fit(X_train_subconjuntos[0], y_train_subconjuntos[0])
    model1.fit(X_train_subconjuntos[1], y_train_subconjuntos[1])
    model2.fit(X_train_subconjuntos[2], y_train_subconjuntos[2])
    model3.fit(X_train_subconjuntos[3], y_train_subconjuntos[3])

    decision_tree_modelos.append(model)
    decision_tree_modelos.append(model1)
    decision_tree_modelos.append(model2)
    decision_tree_modelos.append(model3)

model_accuracies = []
# Loop para aplicar cada modelo aos dados de teste e obter as métricas de regressão
for model in random_forest_modelos + xgb_modelos + svr_modelos + decision_tree_modelos:
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    model_accuracies.append((mse, r2))

top_indices = sorted(range(len(model_accuracies)), key=lambda i: model_accuracies[i][1], reverse=True)[:24]

modelos_melhores = []

modelos_lista = []

for i in top_indices:
    if i < len(random_forest_modelos):
        model_name = 'Random Forest'
        model = random_forest_modelos[i]
    elif i < len(random_forest_modelos) + len(xgb_modelos):
        model_name = 'XGBoost'
        model = xgb_modelos[i - len(random_forest_modelos)]
    elif i < len(random_forest_modelos) + len(xgb_modelos) + len(svr_modelos):
        model_name = 'SVR'
        model = svr_modelos[i - len(random_forest_modelos) - len(xgb_modelos)]
    else:
        model_name = 'Decision Tree'
        model = decision_tree_modelos[i - len(random_forest_modelos) - len(xgb_modelos) - len(svr_modelos)]

    modelos_melhores.append((model_name, model_accuracies[i][1], model_accuracies[i][0]))
    modelos_lista.append(model)

# Imprime os modelos selecionados e suas métricas
print("Modelos selecionados:")
for model_name, r2, mse in modelos_melhores:
    print(f"{model_name} - R2: {r2:.4f}, MSE: {mse:.4f}")


Modelos selecionados:
Random Forest - R2: 0.9995, MSE: 86.3547
Random Forest - R2: 0.9994, MSE: 90.1847
Random Forest - R2: 0.9994, MSE: 90.3975
Random Forest - R2: 0.9994, MSE: 90.4036
Random Forest - R2: 0.9994, MSE: 90.5811
Random Forest - R2: 0.9994, MSE: 91.4553
Random Forest - R2: 0.9994, MSE: 92.2941
Random Forest - R2: 0.9994, MSE: 92.6480
Random Forest - R2: 0.9994, MSE: 93.8475
XGBoost - R2: 0.9994, MSE: 95.0645
XGBoost - R2: 0.9994, MSE: 95.0645
XGBoost - R2: 0.9994, MSE: 95.0645
XGBoost - R2: 0.9994, MSE: 95.0645
Random Forest - R2: 0.9994, MSE: 95.8904
XGBoost - R2: 0.9994, MSE: 96.8609
XGBoost - R2: 0.9994, MSE: 96.8609
XGBoost - R2: 0.9994, MSE: 96.8609
XGBoost - R2: 0.9994, MSE: 96.8609
Random Forest - R2: 0.9994, MSE: 96.9139
Random Forest - R2: 0.9994, MSE: 97.3079
Random Forest - R2: 0.9994, MSE: 97.6190
Random Forest - R2: 0.9994, MSE: 98.0767
Random Forest - R2: 0.9994, MSE: 99.6165
Random Forest - R2: 0.9994, MSE: 100.2553


In [None]:
# Inicializar uma matriz para armazenar as previsões de cada modelo
previsoes_modelos = np.zeros((len(modelos_lista), len(X_test)))

# Fazer previsões para cada modelo em X_teste
for i, modelo in enumerate(modelos_lista):
    previsoes_modelos[i] = modelo.predict(X_test)

# Calcular a média das previsões de todos os modelos para obter a previsão final
previsao_final = np.mean(previsoes_modelos, axis=0)

# Compare as previsões finais com os valores reais em y_teste
acuracia_ensemble = r2_score(y_test, previsao_final)
print("Metrica de erro quadratico do ensemble usando a média:", acuracia_ensemble)


Metrica de erro quadratico do ensemble usando a média: 0.9994870996187853
