In [1]:
import warnings
warnings.filterwarnings("ignore")

In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.metrics import r2_score
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor


def selectkbest(indep_X, dep_Y, n):
    test = SelectKBest(score_func=f_regression, k=n)
    fit1 = test.fit(indep_X, dep_Y)
    # Transform and return selected features
    selectk_features = fit1.transform(indep_X)
    return selectk_features

# Function to split data and apply scaling
def split_scalar(indep_X, dep_Y):
    X_train, X_test, y_train, y_test = train_test_split(indep_X, dep_Y, test_size=0.25, random_state=0)
    sc = StandardScaler()
    X_train = sc.fit_transform(X_train)
    X_test = sc.transform(X_test)    
    return X_train, X_test, y_train, y_test

# Function to calculate R^2 score
def r2_prediction(regressor, X_test, y_test):
    y_pred = regressor.predict(X_test)
    return r2_score(y_test, y_pred)

# Linear Regression
def linear(X_train, y_train, X_test, y_test):
    regressor = LinearRegression()
    regressor.fit(X_train, y_train)
    r2 = r2_prediction(regressor, X_test, y_test)
    return r2

# SVM with Grid Search for Linear Kernel
def svm_linear_gridsearch(X_train, y_train, X_test, y_test):
    param_grid = {'C': [0.1, 1, 10], 'epsilon': [0.01, 0.1, 1]}
    grid = GridSearchCV(SVR(kernel='linear'), param_grid, scoring='r2', cv=5)
    grid.fit(X_train, y_train)
    best_model = grid.best_estimator_
    r2 = r2_prediction(best_model, X_test, y_test)
    return r2, grid.best_params_

# SVM with Grid Search for RBF Kernel
def svm_rbf_gridsearch(X_train, y_train, X_test, y_test):
    param_grid = {'C': [0.1, 1, 10], 'gamma': ['scale', 0.01, 0.1], 'epsilon': [0.01, 0.1, 1]}
    grid = GridSearchCV(SVR(kernel='rbf'), param_grid, scoring='r2', cv=5)
    grid.fit(X_train, y_train)
    best_model = grid.best_estimator_
    r2 = r2_prediction(best_model, X_test, y_test)
    return r2, grid.best_params_

# Decision Tree with Grid Search
def decision_tree_gridsearch(X_train, y_train, X_test, y_test):
    param_grid = {'max_depth': [None, 10, 20], 'min_samples_split': [2, 10, 20]}
    grid = GridSearchCV(DecisionTreeRegressor(random_state=0), param_grid, scoring='r2', cv=5)
    grid.fit(X_train, y_train)
    best_model = grid.best_estimator_
    r2 = r2_prediction(best_model, X_test, y_test)
    return r2, grid.best_params_

# Random Forest with Grid Search
def random_forest_gridsearch(X_train, y_train, X_test, y_test):
    param_grid = {'n_estimators': [10, 50, 100], 'max_depth': [None, 10, 20]}
    grid = GridSearchCV(RandomForestRegressor(random_state=0), param_grid, scoring='r2', cv=5)
    grid.fit(X_train, y_train)
    best_model = grid.best_estimator_
    r2 = r2_prediction(best_model, X_test, y_test)
    return r2, grid.best_params_

# Main Code
dataset1 = pd.read_csv("PreprocessedHousing.csv", index_col=None)
df2 = dataset1.copy()
df2 = pd.get_dummies(df2, drop_first=True)

indep_X = df2.drop('price', axis=1)
dep_Y = df2['price']

# Select top 10 features
kbest = selectkbest(indep_X, dep_Y, 13)

# Split and scale the data
X_train, X_test, y_train, y_test = split_scalar(kbest, dep_Y)

# Linear Regression
r2_lin = linear(X_train, y_train, X_test, y_test)

# SVM with Grid Search (Linear Kernel)
r2_svml, best_params_svml = svm_linear_gridsearch(X_train, y_train, X_test, y_test)

# SVM with Grid Search (RBF Kernel)
r2_svmnl, best_params_svmnl = svm_rbf_gridsearch(X_train, y_train, X_test, y_test)

# Decision Tree with Grid Search
r2_dec, best_params_dec = decision_tree_gridsearch(X_train, y_train, X_test, y_test)

# Random Forest with Grid Search
r2_rf, best_params_rf = random_forest_gridsearch(X_train, y_train, X_test, y_test)

# Results
result = pd.DataFrame({
    'Model': ['Linear Regression', 'SVM (Linear)', 'SVM (RBF)', 'Decision Tree', 'Random Forest'],
    'R2 Score': [r2_lin, r2_svml, r2_svmnl, r2_dec, r2_rf],
    'Best Params': [None, best_params_svml, best_params_svmnl, best_params_dec, best_params_rf]
})

print(result)


               Model  R2 Score                                   Best Params
0  Linear Regression  0.672342                                          None
1       SVM (Linear) -0.009492                    {'C': 10, 'epsilon': 0.01}
2          SVM (RBF) -0.015691      {'C': 10, 'epsilon': 0.01, 'gamma': 0.1}
3      Decision Tree  0.461000  {'max_depth': None, 'min_samples_split': 20}
4      Random Forest  0.570409        {'max_depth': 20, 'n_estimators': 100}
