In [28]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split

# Toy housing dataset: one categorical column ('HouseType', e.g. 'RL', 'RM'),
# one numeric feature ('Rooms') and the regression target ('Price').
data = {
    'HouseType': ['RL', 'RM', 'RL', 'RM', 'RL', 'RM'],
    'Rooms': [4, 3, 5, 2, 4, 3],
    'Price': [250000, 180000, 300000, 150000, 270000, 160000]
}

# Create a DataFrame from the sample data
df = pd.DataFrame(data)

# One-hot encode 'HouseType' so that every feature column is numeric
df_encoded = pd.get_dummies(df, columns=['HouseType'])

# Separate features (everything except 'Price') and target ('Price')
X = df_encoded.drop('Price', axis=1)
Y = df_encoded['Price']

# Split the data into training and testing sets; the fixed random_state keeps
# the split identical across re-runs.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# Create and train the Linear Regression model
lr = LinearRegression()
lr.fit(X_train, Y_train)

# Create and train the Decision Tree Regression model.
# The tree permutes features at every split, so equally good splits can be
# resolved differently from run to run; seeding it makes the fitted tree (and
# therefore the reported score) reproducible under Restart & Run All.
dt = DecisionTreeRegressor(random_state=42)
dt.fit(X_train, Y_train)

# Create and train the K-Nearest Neighbors Regression model
knn = KNeighborsRegressor(n_neighbors=3)  # Use 3 neighbors
knn.fit(X_train, Y_train)

# Make predictions on the held-out rows using the three models
lr_pred = lr.predict(X_test)
dt_pred = dt.predict(X_test)
knn_pred = knn.predict(X_test)

# Calculate R-squared scores for each model against the held-out targets
lr_r2 = r2_score(Y_test, lr_pred)
dt_r2 = r2_score(Y_test, dt_pred)
knn_r2 = r2_score(Y_test, knn_pred)

# Print the R-squared scores
print("Linear Regression R-squared: ", lr_r2)
print("Decision Tree R-squared: ", dt_r2)
print("K-Nearest Neighbors R-squared: ", knn_r2)


Linear Regression R-squared:  0.653061224489796
Decision Tree R-squared:  0.6734693877551021
K-Nearest Neighbors R-squared:  0.909297052154195


In [29]:
from sklearn.ensemble import BaggingRegressor

# Bagging ensemble with the library's default base learner; the seed is fixed
# so the bootstrap resampling is repeatable.
bag_regressor = BaggingRegressor(random_state=1)
# Kept as the last expression so the fitted estimator is still the cell's value.
bag_regressor.fit(X=X_train, y=Y_train)

In [30]:
# Predictions of the fitted bagging ensemble on the held-out rows
Y_preds = bag_regressor.predict(X_test)

# score() reports R^2; compare the training fit with the held-out fit
train_r2 = bag_regressor.score(X_train, Y_train)
test_r2 = bag_regressor.score(X_test, Y_test)

print('Training Coefficient of R^2 : %.3f' % train_r2)
print('Test Coefficient of R^2 : %.3f' % test_r2)

Training Coefficient of R^2 : 0.992
Test Coefficient of R^2 : 0.422


In [31]:
%%time

n_samples = boston.data.shape[0]
n_features = boston.data.shape[1]

params = {'base_estimator': [None, LinearRegression(), KNeighborsRegressor()],
          'n_estimators': [20,50,100],
          'max_samples': [0.5,1.0],
          'max_features': [0.5,1.0],
          'bootstrap': [True, False],
          'bootstrap_features': [True, False]}

bagging_regressor_grid = GridSearchCV(BaggingRegressor(random_state=1, n_jobs=-1), param_grid =params, cv=3, n_jobs=-1, verbose=1)
bagging_regressor_grid.fit(X_train, Y_train)

print('Train R^2 Score : %.3f'%bagging_regressor_grid.best_estimator_.score(X_train, Y_train))
print('Test R^2 Score : %.3f'%bagging_regressor_grid.best_estimator_.score(X_test, Y_test))
print('Best R^2 Score Through Grid Search : %.3f'%bagging_regressor_grid.best_score_)
print('Best Parameters : ',bagging_regressor_grid.best_params_)

Fitting 3 folds for each of 144 candidates, totalling 432 fits
Train R^2 Score : 0.834
Test R^2 Score : 0.971
Best R^2 Score Through Grid Search : nan
Best Parameters :  {'base_estimator': None, 'bootstrap': True, 'bootstrap_features': True, 'max_features': 0.5, 'max_samples': 0.5, 'n_estimators': 20}
CPU times: total: 1.17 s
Wall time: 17 s


 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan]
