## Feature Selection by Recursive Feature Elimination (RFE)
### Dataset - Numerical Input and Numerical Output

In [48]:
import pandas as pd 
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import RFE
from sklearn.model_selection import train_test_split
from sklearn.metrics import  mean_squared_error, mean_absolute_error, r2_score

In [49]:
# Load the housing dataset. The path is written as a raw string so the Windows
# backslashes are taken literally; in a normal string "\D" and "\H" are invalid
# escape sequences that newer Python versions warn about.
# NOTE(review): this is an absolute, machine-specific path — consider a
# configurable data directory so the notebook runs elsewhere.
House = pd.read_csv(r'D:\Data for Preprocessing\Housing.csv')

In [50]:
# Preview the first rows of the raw frame to check column names and value formats.
House.head()

Unnamed: 0,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus,price
0,7420.0,4.0,2.0,3.0,yes,no,no,no,yes,2.0,yes,furnished,1330000
1,8960.0,4.0,4.0,4.0,yes,no,no,no,yes,3.0,no,furnished,1225000
2,9960.0,3.0,2.0,2.0,yes,no,yes,no,no,2.0,yes,semi-furnished,1225000
3,7500.0,4.0,2.0,2.0,yes,no,yes,no,yes,3.0,yes,furnished,1221500
4,7420.0,4.0,1.0,2.0,yes,yes,yes,no,yes,2.0,no,furnished,1141000


In [51]:
# Summarise column dtypes and non-null counts to spot columns with missing values.
House.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 554 entries, 0 to 553
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   area              545 non-null    float64
 1   bedrooms          545 non-null    float64
 2   bathrooms         545 non-null    float64
 3   stories           545 non-null    float64
 4   mainroad          545 non-null    object 
 5   guestroom         545 non-null    object 
 6   basement          545 non-null    object 
 7   hotwaterheating   545 non-null    object 
 8   airconditioning   545 non-null    object 
 9   parking           545 non-null    float64
 10  prefarea          545 non-null    object 
 11  furnishingstatus  545 non-null    object 
 12  price             554 non-null    int64  
dtypes: float64(5), int64(1), object(7)
memory usage: 56.4+ KB


In [52]:
# Keep only the int64/float64 columns; the object-typed columns are excluded
# from this feature-selection run.
home_desc = House.select_dtypes(include=['int64', 'float64'])

In [53]:
# Drop rows that contain missing values. The result is rebound to the name
# instead of using inplace=True: mutating a frame derived from `House` in place
# is what raises pandas' SettingWithCopyWarning, and rebinding also keeps the
# cell safe to re-run.
home_desc = home_desc.dropna()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return func(*args, **kwargs)


In [54]:
# Feature matrix: every numeric column except the target. The target is removed
# by name (matching how `y` is selected) so the result does not depend on
# `price` being the last column.
X = home_desc.drop(columns=['price'])

In [55]:
# Display the feature matrix to confirm which columns were kept.
X

Unnamed: 0,area,bedrooms,bathrooms,stories,parking
0,7420.0,4.0,2.0,3.0,2.0
1,8960.0,4.0,4.0,4.0,3.0
2,9960.0,3.0,2.0,2.0,2.0
3,7500.0,4.0,2.0,2.0,3.0
4,7420.0,4.0,1.0,2.0,2.0
...,...,...,...,...,...
549,3000.0,2.0,1.0,1.0,2.0
550,2400.0,3.0,1.0,1.0,0.0
551,3620.0,2.0,1.0,1.0,0.0
552,2910.0,3.0,1.0,1.0,0.0


In [56]:
# Regression target: the `price` column of the cleaned numeric frame.
y = home_desc['price']

In [57]:
# Display the target series to check its length and dtype.
y

0      1330000
1      1225000
2      1225000
3      1221500
4      1141000
        ...   
549     182000
550     176715
551     175000
552     175000
553     175000
Name: price, Length: 545, dtype: int64

In [58]:
# Create a Random Forest regressor; random_state is fixed so repeated runs
# build the same forest. This object is used both as the RFE base estimator
# and for the final model fit.
rf_regressor = RandomForestRegressor(random_state=42)

In [59]:
# Number of features RFE should keep after elimination.
num_feature_to_select = 3

In [60]:
# Create the RFE selector around the random forest, keeping the requested
# number of features.
rfe_model = RFE(
    estimator=rf_regressor,
    n_features_to_select=num_feature_to_select,
)


In [61]:
# Fit the RFE model to the data.
# NOTE(review): the fit uses every row of X and y, and the train/test split
# only happens afterwards, so feature selection has already seen the rows that
# later form the test set — consider fitting RFE on the training split only.
rfe_model.fit(X,y)

RFE(estimator=RandomForestRegressor(random_state=42), n_features_to_select=3)

In [62]:
# Get the integer column positions of the features RFE kept.
# get_support(indices=True) returns them directly, replacing the manual
# arange-and-boolean-mask construction.
selected_features = rfe_model.get_support(indices=True)

In [63]:
# Print the selected features as integer positions into the columns of X.
print("Selected Features:", selected_features)

Selected Features: [0 2 4]


### Apply Random Forest model 

In [64]:
# Restrict the feature matrix to the columns chosen by RFE (positional lookup).
X_selected = X.iloc[:, selected_features]

In [65]:
# Display the reduced feature matrix to confirm which columns were retained.
X_selected

Unnamed: 0,area,bathrooms,parking
0,7420.0,2.0,2.0
1,8960.0,4.0,3.0
2,9960.0,2.0,2.0
3,7500.0,2.0,3.0
4,7420.0,1.0,2.0
...,...,...,...
549,3000.0,1.0,2.0
550,2400.0,1.0,0.0
551,3620.0,1.0,0.0
552,2910.0,1.0,0.0


In [66]:
# Hold out 20% of the rows for testing; the seed is fixed so the split is
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X_selected,
    y,
    test_size=0.2,
    random_state=42,
)

In [67]:
# Train the Random Forest model on the training split of the selected features.
rf_regressor.fit(X_train, y_train)

RandomForestRegressor(random_state=42)

In [69]:
# Make predictions on the held-out test set.
y_pred = rf_regressor.predict(X_test)

In [72]:
# Score the test-set predictions: R^2, mean absolute error, and root mean
# squared error (the square root of the MSE, so it is in the units of the target).
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

In [74]:
# Report each evaluation metric on its own line, in RMSE / MAE / R^2 order.
for label, value in (("RMSE:", rmse), ("MAE:", mae), ("R^2:", r2)):
    print(label, value)


RMSE: 167289.49004654578
MAE: 123324.7771794515
R^2: 0.44632734107932337
