In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import RFE
from sklearn.metrics import r2_score

def split_scalar(indep_X, dep_Y):
    """Split the data into train/test parts and standardize the features.

    The split holds out 25% of the rows with a fixed seed
    (``random_state=0``).  The scaler is fitted on the training features
    only and then applied to both parts, so no statistics from the
    held-out rows leak into the scaling.

    Args:
        indep_X: Feature table to split and scale.
        dep_Y: Target values aligned with ``indep_X``.

    Returns:
        A tuple ``(X_train, X_test, y_train, y_test)`` where the two
        feature parts are the scaled outputs of ``StandardScaler`` and the
        two target parts are returned unscaled.
    """
    train_X, test_X, train_y, test_y = train_test_split(
        indep_X, dep_Y, test_size=0.25, random_state=0
    )
    # Learn mean/variance from the training part only.
    scaler = StandardScaler().fit(train_X)
    return scaler.transform(train_X), scaler.transform(test_X), train_y, test_y

def forward_selection(X, y, model, n_features):
    """Greedily pick up to ``n_features`` columns of ``X`` by held-out R2.

    In every round each not-yet-chosen column is tentatively added to the
    current selection, ``model`` is re-fitted on the split produced by
    ``split_scalar`` and scored with ``r2_score`` on the held-out part.
    The column with the highest score wins the round (the earliest column
    wins ties).  The search stops early as soon as the round's best score
    does not strictly improve on the best score so far, which starts at 0,
    so a first feature is only accepted if its R2 is positive.

    Note:
        * Candidates are ranked on the same held-out split that
          ``split_scalar`` produces, so scores later reported on that
          split are optimistically biased.
        * ``model`` is fitted in place repeatedly; after the call it is
          left fitted on the last candidate subset that was tried.

    Args:
        X: Feature table; must expose ``.columns`` and support selection
            by a list of column names.
        y: Target values aligned with ``X``.
        model: Estimator offering ``fit`` and ``predict``.
        n_features: Maximum number of columns to select.

    Returns:
        List of selected column names in the order they were added.
    """
    chosen = []
    score_to_beat = 0

    def holdout_r2(columns):
        # Fit on the training part of this column subset, score on the rest.
        tr_X, te_X, tr_y, te_y = split_scalar(X[columns], y)
        model.fit(tr_X, tr_y)
        return r2_score(te_y, model.predict(te_X))

    for _round in range(n_features):
        round_best_score, round_best_name = 0, None

        for name in X.columns:
            if name in chosen:
                continue
            score = holdout_r2(chosen + [name])
            # Strict comparison keeps the earliest column on ties.
            if score > round_best_score:
                round_best_score, round_best_name = score, name

        # No strict improvement (or nothing left to try): stop searching.
        if not round_best_score > score_to_beat:
            break
        score_to_beat = round_best_score
        chosen.append(round_best_name)

    return chosen

def Linear(X_train, y_train, X_test, y_test=None):
    """Fit an ordinary least-squares regressor and return its test R2.

    Args:
        X_train: Training features.
        y_train: Training targets.
        X_test: Held-out features to predict on.
        y_test: Held-out targets to score against.  Optional for backward
            compatibility: when omitted, the module-level ``y_test`` is
            used, which is what this function previously read implicitly.

    Returns:
        The R2 score of the fitted model on ``(X_test, y_test)``.

    Raises:
        KeyError: If ``y_test`` is omitted and no module-level ``y_test``
            exists.
    """
    if y_test is None:
        # Legacy three-argument calls relied on a global named y_test.
        y_test = globals()["y_test"]
    regressor = LinearRegression()
    regressor.fit(X_train, y_train)
    return r2_prediction(regressor, X_test, y_test)

def svm_linear(X_train, y_train, X_test, y_test=None):
    """Fit a linear-kernel support vector regressor and return its test R2.

    Args:
        X_train: Training features.
        y_train: Training targets.
        X_test: Held-out features to predict on.
        y_test: Held-out targets to score against.  Optional for backward
            compatibility: when omitted, the module-level ``y_test`` is
            used, which is what this function previously read implicitly.

    Returns:
        The R2 score of the fitted model on ``(X_test, y_test)``.

    Raises:
        KeyError: If ``y_test`` is omitted and no module-level ``y_test``
            exists.
    """
    if y_test is None:
        # Legacy three-argument calls relied on a global named y_test.
        y_test = globals()["y_test"]
    regressor = SVR(kernel='linear')
    regressor.fit(X_train, y_train)
    return r2_prediction(regressor, X_test, y_test)

def Decision(X_train, y_train, X_test, y_test=None):
    """Fit a decision-tree regressor (seed 0) and return its test R2.

    Args:
        X_train: Training features.
        y_train: Training targets.
        X_test: Held-out features to predict on.
        y_test: Held-out targets to score against.  Optional for backward
            compatibility: when omitted, the module-level ``y_test`` is
            used, which is what this function previously read implicitly.

    Returns:
        The R2 score of the fitted model on ``(X_test, y_test)``.

    Raises:
        KeyError: If ``y_test`` is omitted and no module-level ``y_test``
            exists.
    """
    if y_test is None:
        # Legacy three-argument calls relied on a global named y_test.
        y_test = globals()["y_test"]
    regressor = DecisionTreeRegressor(random_state=0)
    regressor.fit(X_train, y_train)
    return r2_prediction(regressor, X_test, y_test)

def random(X_train, y_train, X_test, y_test=None):
    """Fit a 10-tree random-forest regressor (seed 0) and return its test R2.

    NOTE(review): the function name shadows the standard-library ``random``
    module within this file; it is kept unchanged because callers use it.

    Args:
        X_train: Training features.
        y_train: Training targets.
        X_test: Held-out features to predict on.
        y_test: Held-out targets to score against.  Optional for backward
            compatibility: when omitted, the module-level ``y_test`` is
            used, which is what this function previously read implicitly.

    Returns:
        The R2 score of the fitted model on ``(X_test, y_test)``.

    Raises:
        KeyError: If ``y_test`` is omitted and no module-level ``y_test``
            exists.
    """
    if y_test is None:
        # Legacy three-argument calls relied on a global named y_test.
        y_test = globals()["y_test"]
    regressor = RandomForestRegressor(n_estimators=10, random_state=0)
    regressor.fit(X_train, y_train)
    return r2_prediction(regressor, X_test, y_test)

def r2_prediction(regressor, X_test, y_test):
    """Score an already fitted regressor on held-out data.

    Args:
        regressor: Fitted estimator offering ``predict``.
        X_test: Held-out features.
        y_test: True targets for ``X_test``.

    Returns:
        The R2 score of the regressor's predictions against ``y_test``.
    """
    return r2_score(y_test, regressor.predict(X_test))

# Load the prepared data and one-hot encode its categorical columns
# (drop_first=True drops the first level of each encoded column).
dataset1 = pd.read_csv("prep.csv", index_col=None)
df2 = pd.get_dummies(dataset1, drop_first=True)

# 'classification_yes' is the target; every other column is a candidate feature.
indep_X = df2.drop('classification_yes', axis=1)
dep_Y = df2['classification_yes']

# Set the model and the number of features to select
model = LinearRegression()
n_features = 5

# Perform forward selection
selected_features = forward_selection(indep_X, dep_Y, model, n_features)

# Print the selected features
print("Selected Features:", selected_features)

# Train models using the selected features
X_selected = indep_X[selected_features]
X_train, X_test, y_train, y_test = split_scalar(X_selected, dep_Y)

r2_lin = Linear(X_train, y_train, X_test)
r2_d = Decision(X_train, y_train, X_test)
r2_rf = random(X_train, y_train, X_test)

result = pd.DataFrame(index=['Linear', 'Decision', 'Random'], columns=['R2'])
# Assign with a single .loc call: chained indexing (result['R2'][row] = ...)
# writes to an intermediate object, which emits warnings and does not update
# the frame when pandas Copy-on-Write is active.
result.loc['Linear', 'R2'] = r2_lin
result.loc['Decision', 'R2'] = r2_d
result.loc['Random', 'R2'] = r2_rf

print("\nResults with selected features:")
# Bare expression: rendered as the cell output when run in a notebook.
result

Selected Features: ['hrmo', 'sg_d', 'al', 'sg_b', 'dm_yes']

Results with selected features:


Unnamed: 0,R2
Linear,0.701066
Decision,0.96858
Random,0.922505
