<a href="https://colab.research.google.com/github/KamiraPagulayan/CCADMACL_EXERCISES_COM222ML/blob/main/EXERCISE%201.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Exercise 1

Use all three families of feature selection methods (filter, wrapper, and embedded) to find the best features.

## Dataset Information

## Features

Number of Instances: 20640

Number of Attributes: 8 numeric, predictive attributes and the target

Attribute Information:

MedInc - median income in block group

HouseAge - median house age in block group

AveRooms - average number of rooms per household

AveBedrms - average number of bedrooms per household

Population - block group population

AveOccup - average number of household members

Latitude - block group latitude

Longitude - block group longitude

## Target
The target variable is the median house value for California districts, expressed in hundreds of thousands of dollars ($100,000).

In [176]:
from sklearn.datasets import fetch_california_housing
import pandas as pd

In [177]:
# Fetch the California housing data as pandas objects, then build one frame
# holding the feature columns followed by the target column.
housing = fetch_california_housing(as_frame=True)
df = housing.data.join(housing.target)

In [178]:
# Feature matrix: the columns listed in `housing.feature_names`.
df_housing_data = housing.data.loc[:, housing.feature_names]
# Target as a one-column frame named 'MedHouseVal'.
df_housing_target = housing.target.to_frame(name='MedHouseVal')

In [179]:
from sklearn.preprocessing import StandardScaler

# Standardise every feature column and rewrap the resulting array in a
# DataFrame that keeps the original column names.
# NOTE(review): the scaler is fitted on all rows, i.e. before the train/test
# split performed later in the notebook, so test rows contribute to the
# scaling statistics.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(df_housing_data)
df_housing_data = pd.DataFrame(scaled_values, columns=df_housing_data.columns)

In [180]:
# Pairwise Pearson correlation between all features and the target.
df.corr(method='pearson')

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedHouseVal
MedInc,1.0,-0.119034,0.326895,-0.06204,0.004834,0.018766,-0.079809,-0.015176,0.688075
HouseAge,-0.119034,1.0,-0.153277,-0.077747,-0.296244,0.013191,0.011173,-0.108197,0.105623
AveRooms,0.326895,-0.153277,1.0,0.847621,-0.072213,-0.004852,0.106389,-0.02754,0.151948
AveBedrms,-0.06204,-0.077747,0.847621,1.0,-0.066197,-0.006181,0.069721,0.013344,-0.046701
Population,0.004834,-0.296244,-0.072213,-0.066197,1.0,0.069863,-0.108785,0.099773,-0.02465
AveOccup,0.018766,0.013191,-0.004852,-0.006181,0.069863,1.0,0.002366,0.002476,-0.023737
Latitude,-0.079809,0.011173,0.106389,0.069721,-0.108785,0.002366,1.0,-0.924664,-0.14416
Longitude,-0.015176,-0.108197,-0.02754,0.013344,0.099773,0.002476,-0.924664,1.0,-0.045967
MedHouseVal,0.688075,0.105623,0.151948,-0.046701,-0.02465,-0.023737,-0.14416,-0.045967,1.0


In [181]:
# Absolute correlation of each column with the target, strongest first.
target_corr = df.corr().loc[:, 'MedHouseVal']
target_corr.abs().sort_values(ascending=False)

Unnamed: 0,MedHouseVal
MedHouseVal,1.0
MedInc,0.688075
AveRooms,0.151948
Latitude,0.14416
HouseAge,0.105623
AveBedrms,0.046701
Longitude,0.045967
Population,0.02465
AveOccup,0.023737


1. Use any filter method to select the best features

In [182]:
from sklearn.feature_selection import f_regression

In [183]:
# `threshold` is used further down as the number of top-ranked features to keep.
threshold = 5
high_score_features = []

# f_regression returns a tuple; only its first element (the per-feature score)
# is kept.
target_values = df_housing_target.values.ravel()
feature_scores = f_regression(df_housing_data, target_values)[0]

In [184]:
# Rank the features by score (highest first) and keep the top `threshold`.
# The list is rebuilt from scratch here instead of being appended to, so
# re-running this cell cannot accumulate duplicate column names in
# `high_score_features` (and therefore duplicate columns in `df_housing_fr`).
ranked_features = sorted(zip(feature_scores, df_housing_data.columns), reverse=True)
high_score_features = [f_name for _score, f_name in ranked_features[:threshold]]

df_housing_fr = df_housing_data[high_score_features]
df_housing_fr.columns.tolist()

['MedInc', 'AveRooms', 'Latitude', 'HouseAge', 'AveBedrms']

2. Use any wrapper method to select the best features

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import RFE

# Wrapper method: recursive feature elimination around a shallow random
# forest, removing one feature per round (step=1) until 4 remain.
# NOTE(review): the selector is fitted on the full dataset, before the
# train/test split made later in the notebook — confirm this is intended.
model_rf = RandomForestRegressor(max_depth=3, n_estimators=500, random_state=0)
selector = RFE(estimator=model_rf, n_features_to_select=4, step=1).fit(
    df_housing_data,
    df_housing_target.values.ravel(),
)

# Boolean mask over the columns: True for every feature RFE kept.
selector_ind = selector.get_support()
df_housing_rfe = df_housing_data.loc[:, selector_ind]
df_housing_rfe.columns

3. Use any embedded method to select the best features

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df_housing_data,
    df_housing_target,
    random_state=42,
    test_size=0.2,
)

In [None]:
from sklearn.feature_selection import SelectFromModel

# Embedded method: fit a random forest on the training split and let
# SelectFromModel keep the features the fitted forest rates as important.
model_rf = RandomForestRegressor(n_estimators=500, random_state=0, max_depth=3)
model_rf.fit(X_train, y_train.values.ravel())

# (The stray `model_rf.predict(X_test)` call that used to sit here was removed:
# its result was neither stored nor displayed.)

# prefit=True reuses the already-fitted forest instead of refitting it.
# No explicit threshold is given, so SelectFromModel's default cut-off applies
# (per the scikit-learn docs this is the mean feature importance — verify
# against the installed version).
sel_sfm = SelectFromModel(model_rf, prefit=True)
sel_sfm_index = sel_sfm.get_support()
df_housing_sfm = df_housing_data.iloc[:, sel_sfm_index]
df_housing_sfm.columns

### Model Comparisons based on Different Features Selected
Model used: Random Forest Regressor

In [None]:
from sklearn.metrics import mean_squared_error, r2_score

In [None]:
# Baseline: the same forest configuration trained on all features.
model = RandomForestRegressor(n_estimators=500, random_state=0, max_depth=3)
# `y_train` is a one-column DataFrame; flatten it to 1-D, as the embedded
# selection cell already does, so scikit-learn receives the shape it expects
# and does not emit a column-vector conversion warning.
model.fit(X_train, y_train.values.ravel())

In [None]:
# Same 80/20 split and seed as the baseline, restricted to the filter-selected
# columns.
X_train_fr, X_test_fr, y_train_fr, y_test_fr = train_test_split(
    df_housing_fr,
    df_housing_target,
    random_state=42,
    test_size=0.2,
)

In [None]:
# Forest trained on the filter-selected (f_regression) features only.
model_fr = RandomForestRegressor(n_estimators=500, random_state=0, max_depth=3)
# Flatten the one-column target frame to 1-D, matching how the embedded
# selection cell fits its forest and avoiding a column-vector warning.
model_fr.fit(X_train_fr, y_train_fr.values.ravel())

In [None]:
# Same 80/20 split and seed as the baseline, restricted to the RFE-selected
# columns.
X_train_rfe, X_test_rfe, y_train_rfe, y_test_rfe = train_test_split(
    df_housing_rfe,
    df_housing_target,
    random_state=42,
    test_size=0.2,
)

In [None]:
# Forest trained on the wrapper-selected (RFE) features only.
model_rfe = RandomForestRegressor(n_estimators=500, random_state=0, max_depth=3)
# Flatten the one-column target frame to 1-D, matching how the embedded
# selection cell fits its forest and avoiding a column-vector warning.
model_rfe.fit(X_train_rfe, y_train_rfe.values.ravel())

In [None]:
# Test-set predictions for each feature subset.
default_preds = model.predict(X_test)
fr_preds = model_fr.predict(X_test_fr)
rfe_preds = model_rfe.predict(X_test_rfe)

# Bug fix: the embedded-method predictions used to come from `model_rf`, the
# forest trained on ALL features, so the "Embedded" score merely duplicated
# the baseline and the SelectFromModel result (`df_housing_sfm`) was never
# used. Train a forest on the selected columns only and predict with that.
sfm_features = df_housing_sfm.columns
model_sfm = RandomForestRegressor(n_estimators=500, random_state=0, max_depth=3)
model_sfm.fit(X_train[sfm_features], y_train.values.ravel())
sfm_preds = model_sfm.predict(X_test[sfm_features])

In [None]:
# RMSE per feature subset, computed as sqrt(MSE).
# The `squared=False` keyword of mean_squared_error was deprecated in
# scikit-learn 1.4 and removed in 1.6; taking the square root explicitly works
# on every version.
# Every split above uses the same target frame, test_size and random_state, so
# `y_test` holds the held-out targets for all four models.
default_rmse = mean_squared_error(y_test, default_preds) ** 0.5
fr_rmse = mean_squared_error(y_test, fr_preds) ** 0.5
rfe_rmse = mean_squared_error(y_test, rfe_preds) ** 0.5
sfm_rmse = mean_squared_error(y_test, sfm_preds) ** 0.5

In [None]:
# Report the RMSE for each feature-selection strategy, one line per strategy.
for label, rmse in (
    ('Default', default_rmse),
    ('Filter', fr_rmse),
    ('Wrapper', rfe_rmse),
    ('Embedded', sfm_rmse),
):
    print(f'{label} RMSE: {rmse}')