In [1]:
#Regular

import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# Laad de data
data = pd.read_csv('UniversalBank.csv')

# Gebruik een kleiner deel van de data voor prestaties
data_subset = data.sample(frac=0.02, random_state=42)

# Verwijder kolommen zonder nuttige data
X = data_subset.drop(columns=['ID', 'ZIP Code'])
y = data_subset['Income']

# Split de data in train en test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Definieer een pipeline
simple_model = Pipeline([
    ('scaler', StandardScaler()),
    ('rf', RandomForestRegressor())
])

# Definieer de parameter grid
param_grid = {
    'rf__n_estimators': [50, 100, 200],
    'rf__max_depth': [None, 5, 10],
    'rf__min_samples_split': [2, 5, 10]
}

# Maak een GridSearchCV object
grid_search = GridSearchCV(simple_model, param_grid, cv=3, scoring='neg_mean_squared_error')

# Pas het object toe op de data
grid_search.fit(X_train, y_train)

# Haal de beste parameters en het beste model op
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

# Voorspel voor test data
y_pred = best_model.predict(X_test)

# Evaluatiemetrics
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f'Mean Squared Error: {mse}')
print(f'R-squared: {r2}')


Mean Squared Error: 3.9437432075497787
R-squared: 0.9985278429170505


In [2]:
#Extremely randomized Trees

import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from scipy.stats import randint

# Laad de data
data = pd.read_csv('UniversalBank.csv')

# Gebruik een kleiner deel van de data voor prestaties
data_subset = data.sample(frac=0.02, random_state=42)

# Verwijder kolommen zonder nuttige data
X = data_subset.drop(columns=['ID', 'ZIP Code'])
y = data_subset['Income']

# Split de data in train en test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Definieer een pipeline
simple_model = Pipeline([
    ('scaler', StandardScaler()),
    ('etr', ExtraTreesRegressor())
])

# Definieer de parameterdistributie
param_dist = {
    'etr__n_estimators': randint(50, 200),
    'etr__max_depth': [None, 5, 10],
    'etr__min_samples_split': randint(2, 10)
}

# Maak een RandomizedSearchCV object
random_search = RandomizedSearchCV(simple_model, param_dist, n_iter=50, cv=3, scoring='neg_mean_squared_error', random_state=42)

# Pas het object toe op de data
random_search.fit(X_train, y_train)

# Haal de beste parameters en het beste model op
best_params = random_search.best_params_
best_model = random_search.best_estimator_

# Voorspel voor test data
y_pred = best_model.predict(X_test)

# Evaluatiemetrics
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f'Mean Squared Error: {mse}')
print(f'R-squared: {r2}')


Mean Squared Error: 2.224247348659514
R-squared: 0.999169712297116
