# Stepwise regression

https://dataaspirant.com/stepwise-regression/


In [1]:
#importing all the modules
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.linear_model import LinearRegression

#____loading data____
df = pd.read_csv('../data/clean_house.csv')
# df = pd.read_csv('./data/clean_app.csv')

# ____Outliers handling_____
Q1 = df["price"].quantile(0.25)
Q3 = df["price"].quantile(0.75)
IQR = Q3 - Q1

max_value = Q3 + (1.5 * IQR)
min_value = Q1 - (1.5 * IQR)

outliers_mask = (df["price"] < min_value) | (df["price"] > max_value)
df.loc[outliers_mask, "price"] = np.nan

df.dropna(subset=["price"], inplace=True)

# _____dropping unnecessary columns____
# for house and app
columns_to_drop = ['property_id', 'latitude', 'longitude', 'property_type', 'type_of_sale', 'fully_equipped_kitchen']

df.drop(columns_to_drop, axis=1, inplace=True)

# _____imputing missing values for swimmingpool____
constant_imputer = SimpleImputer(missing_values=np.nan, strategy='constant', fill_value=0)
df[['swimming_pool']] = constant_imputer.fit_transform(df[['swimming_pool']])

# _____one-hot encoding for kitchen_type____
df = pd.get_dummies(df, columns=["kitchen_type"], prefix="kitchen_type")

# _____one-hot encoding for state_of_building____
df = pd.get_dummies(df, columns=["state_of_building"], prefix="state_of_building")

# _____one-hot encoding for property_subtype____
df = pd.get_dummies(df, columns=["property_subtype"], prefix="property_subtype")

# _____one-hot encoding for province____
df = pd.get_dummies(df, columns=["province"], prefix="province")

# _____defining target and features____

# columns_to_drop = ['price', 'number_of_rooms', 'terrace_area', 'garden_area', 'furnished', 'garden', 'terrace', 'number_of_facades', 'locality_name', 'main_city', 'province']
columns_to_drop = ['price', 'locality_name', 'main_city']

# Drop the specified columns
X = df.drop(columns=columns_to_drop, axis=1)

y = df['price']

### Filling up NaN values

- !!! data leakage, as it is done before training the model !!!

In [2]:
# ____More Preprocess of the Data_____
imp = SimpleImputer(strategy='median')
X = imp.fit_transform(X)

In [3]:
# _____Select the Top Features_____
X_new = SelectKBest(f_regression, k=6).fit_transform(X, y)

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X_new, y, test_size=0.2, random_state=42)

In [5]:
#default model
lr = LinearRegression()
lr.fit(X_train,y_train)
# Predict the target values on the testing set
y_pred = lr.predict(X_test)

# Calculate the R^2 score
r2 = lr.score(X_test, y_test)


print("R^2 score using default", r2)

R^2 score using default 0.4461435253892658


In [6]:
# Initialize a linear regression model
lr = LinearRegression()

# Use SFS to perform stepwise feature selection
sfs = SequentialFeatureSelector(lr, n_features_to_select=5, direction='backward', cv=10)
sfs.fit(X_train, y_train)

# Get the selected feature indices
feature_idx = sfs.get_support(indices=True)

# Print the selected feature indices
print("Selected feature indices:", feature_idx)

Selected feature indices: [0 1 2 4 5]


In [7]:
# Select the top features based on the selected indices
X_train_sfs = X_train[:, feature_idx]
X_test_sfs = X_test[:, feature_idx]

# Train a linear regression model on the training set with the selected features
lr_sfs = LinearRegression()
lr_sfs.fit(X_train_sfs, y_train)

# Predict the target values on the testing set
y_pred_sfs = lr_sfs.predict(X_test_sfs)

# Calculate the R^2 score
r2_sfs = lr_sfs.score(X_test_sfs, y_test)

print("R^2 score using SFS", r2_sfs)

R^2 score using SFS 0.44554407413496944
