# Feature_selection

## Ablation Approach (backward selection)

In [26]:
import tempfile
import warnings

import pandas as pd
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [35]:
# Read the train and test CSVs from the curated data folder into DataFrames.
train_csv = '../data/curated/rental_df_numericDescription_train.csv'
test_csv = '../data/curated/rental_df_numericDescription_test.csv'
rental_df_preprocessed_train, rental_df_preprocessed_test = (
    pd.read_csv(csv_path) for csv_path in (train_csv, test_csv)
)

In [39]:
# Remove the 'Unnamed: 0' column from both splits (presumably an index column written
# by an earlier to_csv call -- TODO confirm).
# errors='ignore' keeps this cell idempotent: because the result is assigned back to the
# same names, a second run would otherwise raise KeyError once the column is gone.
rental_df_preprocessed_train = rental_df_preprocessed_train.drop(columns='Unnamed: 0', errors='ignore')
rental_df_preprocessed_test = rental_df_preprocessed_test.drop(columns='Unnamed: 0', errors='ignore')

Grid search combined with feature selection. The ablation (backward) approach removes one feature at a time, namely the feature that contributes least to model performance, until the desired number of features remains.

In [19]:
# Backward sequential feature selection inside a scaled KNN pipeline, with a grid
# search over the number of neighbours of the final regressor.

# Use the whole training split (rental_df_numericDescription)
X_train = rental_df_preprocessed_train.drop(columns=['id', 'rent'])  # Features
y_train = rental_df_preprocessed_train['rent']  # Target

# Final regressor; its n_neighbors is the hyperparameter tuned below
knn = KNeighborsRegressor()

# Backward SFS: start from all features and remove one per step until 30 remain,
# scoring each candidate subset with 5-fold CV.
# NOTE: the selector wraps its own default-configured KNN. The grid below only
# addresses the pipeline step named 'knn', so the selector's estimator is not tuned.
sfs_knn = SequentialFeatureSelector(
    KNeighborsRegressor(),
    n_features_to_select=30,
    direction='backward',
    cv=5,
)

# Define the grid for hyperparameter tuning (number of neighbors)
param_grid = {
    'knn__n_neighbors': [10, 20, 30]  # Range of neighbors to test
}

# The scaler and SFS steps do not depend on the grid, so within each CV fold they
# would be refit identically for every candidate n_neighbors. Caching the fitted
# transformers (Pipeline `memory`) lets later candidates reuse the first fit; the
# results are unchanged, only the redundant SFS runs are skipped.
with tempfile.TemporaryDirectory() as cache_dir:
    # Set up the pipeline (Scaler, SFS, KNN)
    pipeline_knn = Pipeline(
        [
            ('scaler', StandardScaler()),        # Standardize features
            ('sfs', sfs_knn),                    # Sequential Feature Selection (backward)
            ('knn', knn),                        # KNN model
        ],
        memory=cache_dir,
    )

    # 5-fold grid search on the training split
    grid_search = GridSearchCV(pipeline_knn, param_grid, cv=5)

    # Suppress runtime warnings and fit the model
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", category=RuntimeWarning)  # Handle any runtime warnings during fitting
        grid_search.fit(X_train, y_train)

# Output the best hyperparameters
print("Best hyperparameters: ", grid_search.best_params_)

# Retrieve the selected features based on the best model.
# The cache directory is gone at this point, so detach it from the refit pipeline.
best_model = grid_search.best_estimator_.set_params(memory=None)
selected_features = X_train.columns[best_model.named_steps['sfs'].get_support()]
print("Selected Features: ", selected_features)

# Retrieve the best model's performance score (mean cross-validated score of the best candidate)
print("Best model score: ", grid_search.best_score_)

Best hyperparameters:  {'knn__n_neighbors': 30}
Selected Features:  Index(['suburb_encoded', 'bedroom', 'bathroom', 'parking',
       'propertyType_Apartment', 'propertyType_Block of Units',
       'propertyType_Duplex', 'propertyType_House',
       'propertyType_Semi-Detached', 'propertyType_Terrace',
       'propertyType_Townhouse', 'propertyType_Villa', 'under 20', '60+',
       'Owner', 'Renter', 'Family', 'Single', 'propertyFeatures_encoded',
       'doc2vec_embedding_1', 'doc2vec_embedding_2', 'doc2vec_embedding_3',
       'doc2vec_embedding_4', 'doc2vec_embedding_5', 'doc2vec_embedding_6',
       'doc2vec_embedding_7', 'doc2vec_embedding_9', 'educationIndex',
       'transportIndex', 'lifeIndex'],
      dtype='object')
Best model score:  0.2733114289660848


In [52]:
# Use the selected column names verbatim. The selector was fitted on X_train, which
# already excludes 'rent', so no clean-up is needed; stripping the substring 'rent'
# from the names would corrupt any column that merely contains it and break the
# column lookup below.
corrected_selected_features = list(selected_features)

# Keep the target first, followed by the selected features, in both splits
output_columns = ['rent'] + corrected_selected_features
rental_df_selected_train = rental_df_preprocessed_train[output_columns]
rental_df_selected_test = rental_df_preprocessed_test[output_columns]

# Save the datasets with 'rent' column included to CSV files
rental_df_selected_train.to_csv('../data/rental_df_selected_train.csv', index=False)
rental_df_selected_test.to_csv('../data/rental_df_selected_test.csv', index=False)