# Task 7: AutoFeatureSelector Tool
## This task tests your understanding of the feature selection methods outlined in the lecture and your ability to apply them to a real-world dataset: select the best features and build an automated feature selection tool for your toolkit

### Use your knowledge of different feature selector methods to build an Automatic Feature Selection tool
- Pearson Correlation
- Chi-Square
- RFE
- Embedded
- Tree (Random Forest)
- Tree (Light GBM)

### Dataset: FIFA 19 Player Skills
#### Attributes: FIFA 2019 player attributes such as Age, Nationality, Overall, Potential, Club, Value, Wage, Preferred Foot, International Reputation, Weak Foot, Skill Moves, Work Rate, Position, Jersey Number, Joined, Loaned From, Contract Valid Until, Height, Weight, LS, ST, RS, LW, LF, CF, RF, RW, LAM, CAM, RAM, LM, LCM, CM, RCM, RM, LWB, LDM, CDM, RDM, RWB, LB, LCB, CB, RCB, RB, Crossing, Finishing, HeadingAccuracy, ShortPassing, Volleys, Dribbling, Curve, FKAccuracy, LongPassing, BallControl, Acceleration, SprintSpeed, Agility, Reactions, Balance, ShotPower, Jumping, Stamina, Strength, LongShots, Aggression, Interceptions, Positioning, Vision, Penalties, Composure, Marking, StandingTackle, SlidingTackle, GKDiving, GKHandling, GKKicking, GKPositioning, GKReflexes, and Release Clause.

In [52]:
%matplotlib inline
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats as ss
from collections import Counter
import math
from scipy import stats
import warnings
warnings.filterwarnings("ignore")

In [53]:
# Load the FIFA 19 player table from the CSV file at this path.
player_df = pd.read_csv("/content/fifa19.csv")

In [54]:
# Numeric columns kept for modelling. 'Overall' comes first because the
# binary target is derived from it later on.
numcols = [
    'Overall',
    'Crossing', 'Finishing', 'ShortPassing', 'Dribbling', 'LongPassing',
    'BallControl', 'Acceleration', 'SprintSpeed', 'Agility', 'Stamina',
    'Volleys', 'FKAccuracy', 'Reactions', 'Balance', 'ShotPower',
    'Strength', 'LongShots', 'Aggression', 'Interceptions',
]
# Columns that are handed to pd.get_dummies for encoding.
catcols = [
    'Preferred Foot',
    'Position',
    'Body Type',
    'Nationality',
    'Weak Foot',
]

In [55]:
# Keep only the numeric and categorical columns listed above.
player_df = player_df[numcols+catcols]

In [56]:
# Encode the categorical columns with get_dummies and place the result next
# to the numeric columns.
traindf = pd.concat([player_df[numcols], pd.get_dummies(player_df[catcols])],axis=1)
# Remember the column set of the encoded frame.
features = traindf.columns

# Drop rows that still contain a missing value after encoding.
# NOTE(review): NaNs are dropped AFTER encoding here, whereas
# preprocess_dataset() further down drops them BEFORE encoding, so the two
# paths can keep a different number of rows -- confirm which is intended.
traindf = traindf.dropna()

In [57]:
# Re-wrap the cleaned frame using the column set captured before dropna().
traindf = pd.DataFrame(traindf,columns=features)

In [58]:
# Binary target: True for players whose 'Overall' rating is 87 or higher.
y = traindf['Overall'] >= 87
# Feature matrix: a copy of the frame without the column the target came from.
X = traindf.drop(columns=['Overall'])

In [59]:
# Preview the first rows of the feature matrix.
X.head()

Unnamed: 0,Crossing,Finishing,ShortPassing,Dribbling,LongPassing,BallControl,Acceleration,SprintSpeed,Agility,Stamina,...,Nationality_Uganda,Nationality_Ukraine,Nationality_United Arab Emirates,Nationality_United States,Nationality_Uruguay,Nationality_Uzbekistan,Nationality_Venezuela,Nationality_Wales,Nationality_Zambia,Nationality_Zimbabwe
0,84.0,95.0,90.0,97.0,87.0,96.0,91.0,86.0,91.0,72.0,...,False,False,False,False,False,False,False,False,False,False
1,84.0,94.0,81.0,88.0,77.0,94.0,89.0,91.0,87.0,88.0,...,False,False,False,False,False,False,False,False,False,False
2,79.0,87.0,84.0,96.0,78.0,95.0,94.0,90.0,96.0,81.0,...,False,False,False,False,False,False,False,False,False,False
3,17.0,13.0,50.0,18.0,51.0,42.0,57.0,58.0,60.0,43.0,...,False,False,False,False,False,False,False,False,False,False
4,93.0,82.0,92.0,86.0,91.0,91.0,78.0,76.0,79.0,90.0,...,False,False,False,False,False,False,False,False,False,False


In [60]:
# Number of candidate features after encoding.
len(X.columns)

223

### Set some fixed set of features

In [61]:
# Column names of the feature matrix, in their original order.
feature_name = X.columns.tolist()
# Maximum number of features each selector is asked to keep.
num_feats = 30

## Filter Feature Selection - Pearson Correlation

### Pearson Correlation function

In [62]:
def cor_selector(X, y, num_feats):
    """Select the features with the strongest Pearson correlation to ``y``.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix; every column is scored independently.
    y : array-like
        Target values, one per row of ``X``.
    num_feats : int
        Number of features to keep. Zero or a negative value selects nothing.

    Returns
    -------
    cor_support : list of bool
        One flag per column of ``X``; True marks a selected column.
    cor_feature : pandas.Index
        Names of the selected columns, in column order.
    """
    # Correlation of each column with the target. A column whose correlation
    # is undefined (NaN, e.g. a constant column) is scored as 0 so it ranks
    # last instead of poisoning the sort.
    cor_list = []
    for col in X.columns:
        cor = np.corrcoef(X[col], y)[0, 1]
        cor_list.append(0 if np.isnan(cor) else cor)

    cor_support = [False] * len(cor_list)
    # Guard the slice: ``[-0:]`` would return every index rather than none,
    # and a negative count would select an arbitrary tail.
    if num_feats > 0:
        # Rank by absolute value so strong negative correlations count too.
        for idx in np.argsort(np.abs(cor_list))[-num_feats:]:
            cor_support[idx] = True
    cor_feature = X.columns[cor_support]

    return cor_support, cor_feature

In [63]:
# Run the Pearson filter and report how many features it kept.
cor_support, cor_feature = cor_selector(X, y, num_feats)
print(f"{len(cor_feature)} selected features")

30 selected features


### List the selected features from Pearson Correlation

In [64]:
# Show the names kept by the Pearson correlation filter.
cor_feature

Index(['Crossing', 'Finishing', 'ShortPassing', 'Dribbling', 'LongPassing',
       'BallControl', 'Acceleration', 'SprintSpeed', 'Agility', 'Stamina',
       'Volleys', 'FKAccuracy', 'Reactions', 'ShotPower', 'Strength',
       'LongShots', 'Weak Foot', 'Position_LAM', 'Position_LF', 'Position_RF',
       'Body Type_C. Ronaldo', 'Body Type_Courtois', 'Body Type_Messi',
       'Body Type_Neymar', 'Body Type_PLAYER_BODY_TYPE_25',
       'Nationality_Belgium', 'Nationality_Costa Rica', 'Nationality_Gabon',
       'Nationality_Slovenia', 'Nationality_Uruguay'],
      dtype='object')

## Filter Feature Selection - Chi-Square

In [65]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.preprocessing import MinMaxScaler

### Chi-Squared Selector function

In [66]:
def chi_squared_selector(X, y, num_feats):
    """Keep the ``num_feats`` columns of ``X`` with the best chi-squared score.

    Returns a boolean support mask aligned with ``X.columns`` and the names
    of the selected columns.
    """
    # chi2 needs non-negative inputs, so rescale every column with
    # MinMaxScaler before scoring.
    non_negative_X = MinMaxScaler().fit_transform(X)

    # Score every column against y and keep the best num_feats of them.
    k_best = SelectKBest(score_func=chi2, k=num_feats).fit(non_negative_X, y)

    chi_support = k_best.get_support()
    return chi_support, X.columns[chi_support]

In [67]:
# Run the chi-squared filter and report how many features it kept.
chi_support, chi_feature = chi_squared_selector(X, y, num_feats)
print(f"{len(chi_feature)} selected features")

30 selected features


### List the selected features from Chi-Square

In [68]:
# Show the names kept by the chi-squared filter.
chi_feature

Index(['Finishing', 'ShortPassing', 'LongPassing', 'BallControl', 'Volleys',
       'FKAccuracy', 'Reactions', 'LongShots', 'Position_CM', 'Position_LAM',
       'Position_LF', 'Position_LW', 'Position_RB', 'Position_RF',
       'Body Type_C. Ronaldo', 'Body Type_Courtois', 'Body Type_Messi',
       'Body Type_Neymar', 'Body Type_PLAYER_BODY_TYPE_25',
       'Nationality_Belgium', 'Nationality_Costa Rica', 'Nationality_Croatia',
       'Nationality_Egypt', 'Nationality_England', 'Nationality_France',
       'Nationality_Gabon', 'Nationality_Slovakia', 'Nationality_Slovenia',
       'Nationality_Spain', 'Nationality_Uruguay'],
      dtype='object')

## Wrapper Feature Selection - Recursive Feature Elimination

In [69]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler

### RFE Selector function

In [70]:
def rfe_selector(X, y, num_feats):
    """Pick ``num_feats`` columns by recursive feature elimination.

    A logistic regression is refitted repeatedly and one feature is removed
    per round (``step=1``) until ``num_feats`` remain. Returns the boolean
    support mask aligned with ``X.columns`` and the selected column names.
    """
    # Put all columns on a common min-max scale before fitting the linear model.
    scaled_X = MinMaxScaler().fit_transform(X)

    eliminator = RFE(
        estimator=LogisticRegression(max_iter=1000, solver='liblinear'),
        n_features_to_select=num_feats,
        step=1,
    )
    eliminator.fit(scaled_X, y)

    rfe_support = eliminator.get_support()
    return rfe_support, X.columns[rfe_support]

In [71]:
# Run recursive feature elimination and report how many features it kept.
rfe_support, rfe_feature = rfe_selector(X, y, num_feats)
print(f"{len(rfe_feature)} selected features")

30 selected features


### List the selected features from RFE

In [72]:
# Show the names kept by recursive feature elimination.
rfe_feature

Index(['Finishing', 'ShortPassing', 'LongPassing', 'BallControl', 'Volleys',
       'Reactions', 'Balance', 'Strength', 'Aggression', 'Preferred Foot_Left',
       'Preferred Foot_Right', 'Position_CAM', 'Position_CM', 'Position_GK',
       'Position_LCB', 'Position_LF', 'Position_LM', 'Position_RB',
       'Position_RM', 'Position_RW', 'Body Type_Lean', 'Body Type_Normal',
       'Body Type_Stocky', 'Nationality_Belgium', 'Nationality_Costa Rica',
       'Nationality_Croatia', 'Nationality_Gabon', 'Nationality_Netherlands',
       'Nationality_Slovenia', 'Nationality_Uruguay'],
      dtype='object')

## Embedded Selection - Lasso: SelectFromModel

In [73]:
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler

In [74]:
def embedded_log_reg_selector(X, y, num_feats):
    """Select ``num_feats`` columns from a fitted logistic regression.

    Returns the boolean support mask aligned with ``X.columns`` and the
    selected column names.
    """
    # Put all columns on a common min-max scale before fitting the linear model.
    scaled_X = MinMaxScaler().fit_transform(X)

    # NOTE(review): no penalty is specified, so scikit-learn's default
    # regularisation applies -- pass penalty='l1' if a Lasso-style sparse
    # fit is what is wanted here.
    fitted_lr = LogisticRegression(max_iter=1000, solver='liblinear').fit(scaled_X, y)

    # threshold=-inf switches off the importance cut-off, leaving
    # max_features as the only limit on how many columns are kept.
    picker = SelectFromModel(
        estimator=fitted_lr,
        max_features=num_feats,
        threshold=-np.inf,
        prefit=True,
    )

    embedded_lr_support = picker.get_support()
    return embedded_lr_support, X.columns[embedded_lr_support]

In [75]:
# Run the logistic-regression selector and report how many features it kept.
embedded_lr_support, embedded_lr_feature = embedded_log_reg_selector(X, y, num_feats)
print(f"{len(embedded_lr_feature)} selected features")

30 selected features


In [76]:
# Show the names kept by the logistic-regression selector.
embedded_lr_feature

Index(['Finishing', 'ShortPassing', 'LongPassing', 'BallControl', 'Volleys',
       'FKAccuracy', 'Reactions', 'Balance', 'Aggression',
       'Preferred Foot_Left', 'Preferred Foot_Right', 'Position_CAM',
       'Position_CM', 'Position_GK', 'Position_LCB', 'Position_LF',
       'Position_LM', 'Position_RB', 'Position_RM', 'Position_RW',
       'Body Type_Lean', 'Body Type_Normal', 'Body Type_Stocky',
       'Nationality_Belgium', 'Nationality_Costa Rica', 'Nationality_Croatia',
       'Nationality_Gabon', 'Nationality_Netherlands', 'Nationality_Slovenia',
       'Nationality_Uruguay'],
      dtype='object')

## Tree based(Random Forest): SelectFromModel

In [77]:
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier

In [78]:
def embedded_rf_selector(X, y, num_feats):
    """Select ``num_feats`` columns from a fitted random forest.

    Returns the boolean support mask aligned with ``X.columns`` and the
    selected column names.
    """
    # Fixed seed so repeated runs grow the same forest.
    forest = RandomForestClassifier(n_estimators=100, random_state=42)
    forest.fit(X, y)

    # threshold=-inf switches off the importance cut-off, leaving
    # max_features as the only limit on how many columns are kept.
    picker = SelectFromModel(
        estimator=forest,
        max_features=num_feats,
        threshold=-np.inf,
        prefit=True,
    )

    embedded_rf_support = picker.get_support()
    return embedded_rf_support, X.columns[embedded_rf_support]

In [79]:
# Run the random-forest selector and report how many features it kept.
embedded_rf_support, embedded_rf_feature = embedded_rf_selector(X, y, num_feats)
print(f"{len(embedded_rf_feature)} selected features")

30 selected features


In [80]:
# Show the names kept by the random-forest selector.
embedded_rf_feature

Index(['Crossing', 'Finishing', 'ShortPassing', 'Dribbling', 'LongPassing',
       'BallControl', 'Acceleration', 'SprintSpeed', 'Agility', 'Stamina',
       'Volleys', 'FKAccuracy', 'Reactions', 'Balance', 'ShotPower',
       'Strength', 'LongShots', 'Aggression', 'Interceptions', 'Weak Foot',
       'Preferred Foot_Right', 'Position_RCB', 'Body Type_Courtois',
       'Body Type_Lean', 'Body Type_Normal', 'Nationality_Belgium',
       'Nationality_Brazil', 'Nationality_Italy', 'Nationality_Slovenia',
       'Nationality_Spain'],
      dtype='object')

## Tree based(Light GBM): SelectFromModel

In [81]:
from sklearn.feature_selection import SelectFromModel
from lightgbm import LGBMClassifier

In [82]:
def embedded_lgbm_selector(X, y, num_feats):
    """Select ``num_feats`` columns from a fitted LightGBM classifier.

    Returns the boolean support mask aligned with ``X.columns`` and the
    selected column names.
    """
    # Fixed seed so repeated runs build the same boosted model.
    booster = LGBMClassifier(n_estimators=100, random_state=42)
    booster.fit(X, y)

    # threshold=-inf switches off the importance cut-off, leaving
    # max_features as the only limit on how many columns are kept.
    picker = SelectFromModel(
        estimator=booster,
        max_features=num_feats,
        threshold=-np.inf,
        prefit=True,
    )

    embedded_lgbm_support = picker.get_support()
    return embedded_lgbm_support, X.columns[embedded_lgbm_support]

In [83]:
# Run the LightGBM selector and report how many features it kept.
embedded_lgbm_support, embedded_lgbm_feature = embedded_lgbm_selector(X, y, num_feats)
print(f"{len(embedded_lgbm_feature)} selected features")

[LightGBM] [Info] Number of positive: 55, number of negative: 18104
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002278 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1812
[LightGBM] [Info] Number of data points in the train set: 18159, number of used features: 124
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.003029 -> initscore=-5.796555
[LightGBM] [Info] Start training from score -5.796555
30 selected features


In [84]:
# Show the names kept by the LightGBM selector.
embedded_lgbm_feature

Index(['Crossing', 'Finishing', 'ShortPassing', 'Dribbling', 'LongPassing',
       'BallControl', 'Acceleration', 'SprintSpeed', 'Agility', 'Stamina',
       'Volleys', 'FKAccuracy', 'Reactions', 'Balance', 'ShotPower',
       'Strength', 'LongShots', 'Aggression', 'Interceptions', 'Weak Foot',
       'Preferred Foot_Left', 'Position_LCB', 'Position_LM', 'Body Type_Lean',
       'Nationality_Belgium', 'Nationality_Finland', 'Nationality_France',
       'Nationality_Italy', 'Nationality_Senegal', 'Nationality_Slovenia'],
      dtype='object')

## Putting all of it together: AutoFeatureSelector Tool

In [85]:
# Lift the row limit so whole frames are rendered.
pd.set_option('display.max_rows', None)

# One row per feature, one boolean column per selection method.
feature_selection_df = pd.DataFrame({
    'Feature': feature_name,
    'Pearson': cor_support,
    'Chi-2': chi_support,
    'RFE': rfe_support,
    'Logistics': embedded_lr_support,
    'Random Forest': embedded_rf_support,
    'LightGBM': embedded_lgbm_support,
})

# Votes per feature: how many of the methods selected it.
feature_selection_df['Total'] = feature_selection_df.drop(columns='Feature').sum(axis=1)

# Most-voted features first; ties are ordered by feature name, descending.
feature_selection_df = feature_selection_df.sort_values(by=['Total', 'Feature'], ascending=False)
# Number the rows from 1 so the index reads as a rank.
feature_selection_df.index = range(1, len(feature_selection_df) + 1)
# Show the top num_feats rows.
feature_selection_df.head(num_feats)

Unnamed: 0,Feature,Pearson,Chi-2,RFE,Logistics,Random Forest,LightGBM,Total
1,Volleys,True,True,True,True,True,True,6
2,ShortPassing,True,True,True,True,True,True,6
3,Reactions,True,True,True,True,True,True,6
4,Nationality_Slovenia,True,True,True,True,True,True,6
5,Nationality_Belgium,True,True,True,True,True,True,6
6,LongPassing,True,True,True,True,True,True,6
7,Finishing,True,True,True,True,True,True,6
8,BallControl,True,True,True,True,True,True,6
9,FKAccuracy,True,True,False,True,True,True,5
10,Strength,True,False,True,False,True,True,4


## Can you build a Python script that takes a dataset and a list of the feature selection methods you want to try, and outputs the best (maximum votes) features across all methods?

In [86]:
def preprocess_dataset(dataset_path, num_feats=30):
    """Load the player CSV and build the feature matrix and binary target.

    Parameters
    ----------
    dataset_path : str
        Path of the CSV file. It must contain every column named in
        ``numcols`` and ``catcols`` below.
    num_feats : int, optional
        Maximum number of features each selector should keep. It is handed
        back unchanged so callers can unpack it together with the data.
        Defaults to 30.

    Returns
    -------
    X : pandas.DataFrame
        The numeric columns (without 'Overall') next to the encoded
        categorical columns.
    y : pandas.Series
        True where 'Overall' is 87 or higher.
    num_feats : int
        The value passed in.
    """
    player_df = pd.read_csv(dataset_path)
    numcols = ['Overall', 'Crossing', 'Finishing', 'ShortPassing', 'Dribbling',
               'LongPassing', 'BallControl', 'Acceleration', 'SprintSpeed', 'Agility',
               'Stamina', 'Volleys', 'FKAccuracy', 'Reactions', 'Balance', 'ShotPower',
               'Strength', 'LongShots', 'Aggression', 'Interceptions']
    catcols = ['Preferred Foot', 'Position', 'Body Type', 'Nationality', 'Weak Foot']

    # Keep only the modelling columns, then drop every row with a missing
    # value in any of them. This happens before encoding, so rows lacking a
    # categorical value are removed as well.
    player_df = player_df[numcols + catcols].dropna()

    # Encode the categorical columns and place them next to the numeric ones.
    traindf = pd.concat([player_df[numcols], pd.get_dummies(player_df[catcols])], axis=1)

    # Binary classification target, and the features without its source column.
    y = traindf['Overall'] >= 87
    X = traindf.drop(columns=['Overall'])

    return X, y, num_feats

In [87]:
def autoFeatureSelector(dataset_path, methods=None):
    """Run the requested feature selectors and return the most-voted features.

    Parameters
    ----------
    dataset_path : str
        CSV file handed to ``preprocess_dataset``.
    methods : list of str, optional
        Any of 'pearson', 'chi-square', 'rfe', 'log-reg', 'rf', 'lgbm'.
        A single name may be given as a plain string. ``None`` or an empty
        list runs every method.

    Returns
    -------
    list of str
        Up to ``num_feats`` feature names, most-voted first.

    Raises
    ------
    ValueError
        If ``methods`` contains a name that is not listed above.
    """
    # Method name -> selector. The insertion order fixes the column order of
    # the vote table regardless of the order in ``methods``.
    selectors = {
        'pearson': cor_selector,
        'chi-square': chi_squared_selector,
        'rfe': rfe_selector,
        'log-reg': embedded_log_reg_selector,
        'rf': embedded_rf_selector,
        'lgbm': embedded_lgbm_selector,
    }

    # A bare string is one method name, not a sequence of characters.
    if isinstance(methods, str):
        methods = [methods]
    # Without any selection there is nothing to vote on, so run everything.
    requested = list(methods) if methods else list(selectors)

    # Fail loudly on typos instead of silently skipping the method.
    unknown = [name for name in requested if name not in selectors]
    if unknown:
        raise ValueError(
            f"Unknown feature selection method(s): {unknown}; "
            f"expected any of {list(selectors)}"
        )

    # preprocessing
    X, y, num_feats = preprocess_dataset(dataset_path)

    # Collect one boolean support mask per requested method.
    feature_selection_results = {}
    for name, selector in selectors.items():
        if name in requested:
            support, _ = selector(X, y, num_feats)
            feature_selection_results[name] = support

    # One row per feature, one column per method, plus the vote count.
    all_support = pd.DataFrame(feature_selection_results, index=X.columns)
    all_support['Total'] = all_support.sum(axis=1)  # Count votes for each feature
    # Sort by total votes
    sorted_support = all_support.sort_values('Total', ascending=False)
    top_support = sorted_support.head(num_feats)
    best_features = top_support.index.tolist()

    # Render the vote table through the notebook's display().
    sorted_support_styled = top_support.style.set_caption("Best Features Selected").background_gradient(cmap="Blues").format({"Total": "{:.0f}"})
    display(sorted_support_styled)

    return best_features

In [88]:
# Run all six selectors on the FIFA 19 data and show the consensus ranking.
best_features = autoFeatureSelector(
    dataset_path="/content/fifa19.csv",
    methods=['pearson', 'chi-square', 'rfe', 'log-reg', 'rf', 'lgbm'],
)
best_features

[LightGBM] [Info] Number of positive: 55, number of negative: 18092
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002215 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1812
[LightGBM] [Info] Number of data points in the train set: 18147, number of used features: 124
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.003031 -> initscore=-5.795892
[LightGBM] [Info] Start training from score -5.795892


Unnamed: 0,pearson,chi-square,rfe,log-reg,rf,lgbm,Total
Reactions,True,True,True,True,True,True,6
LongPassing,True,True,True,True,True,True,6
BallControl,True,True,True,True,True,True,6
Volleys,True,True,True,True,True,True,6
Finishing,True,True,True,True,True,True,6
ShortPassing,True,True,True,True,True,True,6
Nationality_Belgium,True,True,True,True,False,True,5
Nationality_Slovenia,True,True,True,True,False,True,5
FKAccuracy,True,True,False,True,True,True,5
Nationality_Gabon,True,True,True,True,False,False,4


['Reactions',
 'LongPassing',
 'BallControl',
 'Volleys',
 'Finishing',
 'ShortPassing',
 'Nationality_Belgium',
 'Nationality_Slovenia',
 'FKAccuracy',
 'Nationality_Gabon',
 'Nationality_Uruguay',
 'Nationality_Costa Rica',
 'Position_LF',
 'Balance',
 'Body Type_Normal',
 'Strength',
 'LongShots',
 'Aggression',
 'Body Type_Lean',
 'Position_RB',
 'Crossing',
 'Body Type_Courtois',
 'Position_CM',
 'Body Type_PLAYER_BODY_TYPE_25',
 'Position_LCB',
 'Nationality_Croatia',
 'Acceleration',
 'SprintSpeed',
 'Dribbling',
 'Weak Foot']

### Lastly, can you turn this notebook into a Python script, run it, and submit the Python (.py) file that takes a dataset and a list of methods as inputs and outputs the best features?

In [89]:
# Usage of the generated script:
#   python feature_selector.py --dataset fifa19.csv --methods pearson chi-square rfe log-reg rf lgbm

# Source of the standalone command-line tool. It must not rely on anything
# the notebook provides implicitly (such as display()), because it is run
# with a plain Python interpreter.
script_content = """
import argparse
import warnings

import numpy as np
import pandas as pd
from sklearn.feature_selection import SelectKBest, chi2, RFE, SelectFromModel
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier
from sklearn.preprocessing import MinMaxScaler

warnings.filterwarnings("ignore")


# Preprocess the dataset
def preprocess_dataset(dataset_path):
    player_df = pd.read_csv(dataset_path)
    numcols = ['Overall', 'Crossing', 'Finishing', 'ShortPassing', 'Dribbling',
               'LongPassing', 'BallControl', 'Acceleration', 'SprintSpeed', 'Agility',
               'Stamina', 'Volleys', 'FKAccuracy', 'Reactions', 'Balance', 'ShotPower',
               'Strength', 'LongShots', 'Aggression', 'Interceptions']
    catcols = ['Preferred Foot', 'Position', 'Body Type', 'Nationality', 'Weak Foot']

    player_df = player_df[numcols + catcols].dropna()
    traindf = pd.concat([player_df[numcols], pd.get_dummies(player_df[catcols])], axis=1)

    y = traindf['Overall'] >= 87
    X = traindf.drop(columns=['Overall'])
    num_feats = 30

    return X, y, num_feats


# Feature selection methods
def cor_selector(X, y, num_feats):
    # Compute each correlation once; undefined (NaN) correlations score 0.
    cor_list = []
    for col in X.columns:
        cor = np.corrcoef(X[col], y)[0, 1]
        cor_list.append(0 if np.isnan(cor) else cor)
    cor_support = [False] * len(cor_list)
    # A slice of [-0:] would select everything, so guard non-positive counts.
    if num_feats > 0:
        for idx in np.argsort(np.abs(cor_list))[-num_feats:]:
            cor_support[idx] = True
    return cor_support, X.columns[cor_support]


def chi_squared_selector(X, y, num_feats):
    scaler = MinMaxScaler()
    X_scaled = scaler.fit_transform(X)
    chi_selector = SelectKBest(score_func=chi2, k=num_feats).fit(X_scaled, y)
    return chi_selector.get_support(), X.columns[chi_selector.get_support()]


def rfe_selector(X, y, num_feats):
    scaler = MinMaxScaler()
    X_scaled = scaler.fit_transform(X)
    model = LogisticRegression(max_iter=1000, solver='liblinear')
    rfe = RFE(estimator=model, n_features_to_select=num_feats, step=1).fit(X_scaled, y)
    return rfe.get_support(), X.columns[rfe.get_support()]


def embedded_log_reg_selector(X, y, num_feats):
    scaler = MinMaxScaler()
    X_scaled = scaler.fit_transform(X)
    model = LogisticRegression(max_iter=1000, solver='liblinear').fit(X_scaled, y)
    selector = SelectFromModel(estimator=model, max_features=num_feats, threshold=-np.inf, prefit=True)
    return selector.get_support(), X.columns[selector.get_support()]


def embedded_rf_selector(X, y, num_feats):
    model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X, y)
    selector = SelectFromModel(estimator=model, max_features=num_feats, threshold=-np.inf, prefit=True)
    return selector.get_support(), X.columns[selector.get_support()]


def embedded_lgbm_selector(X, y, num_feats):
    model = LGBMClassifier(n_estimators=100, random_state=42).fit(X, y)
    selector = SelectFromModel(estimator=model, max_features=num_feats, threshold=-np.inf, prefit=True)
    return selector.get_support(), X.columns[selector.get_support()]


# Method name -> selector; the order fixes the column order of the vote table.
SELECTORS = {
    'pearson': cor_selector,
    'chi-square': chi_squared_selector,
    'rfe': rfe_selector,
    'log-reg': embedded_log_reg_selector,
    'rf': embedded_rf_selector,
    'lgbm': embedded_lgbm_selector,
}


# Main function
def autoFeatureSelector(dataset_path, methods=None):
    if isinstance(methods, str):
        methods = [methods]
    # None or an empty list runs every method.
    requested = list(methods) if methods else list(SELECTORS)
    unknown = [name for name in requested if name not in SELECTORS]
    if unknown:
        raise ValueError(f"Unknown feature selection method(s): {unknown}")

    X, y, num_feats = preprocess_dataset(dataset_path)

    feature_selection_results = {}
    for name, selector in SELECTORS.items():
        if name in requested:
            support, _ = selector(X, y, num_feats)
            feature_selection_results[name] = support

    all_support = pd.DataFrame(feature_selection_results, index=X.columns)
    all_support['Total'] = all_support.sum(axis=1)
    sorted_support = all_support.sort_values('Total', ascending=False)
    top_support = sorted_support.head(num_feats)
    best_features = top_support.index.tolist()

    # Plain-text table: a script has no notebook front end to render into.
    print("Best Features Selected")
    print(top_support.to_string())

    return best_features


# Run script
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Feature Selection Tool")
    parser.add_argument("--dataset", type=str, required=True, help="Path to the dataset CSV file")
    parser.add_argument("--methods", nargs='+', required=True, choices=list(SELECTORS),
                        help="List of feature selection methods to use")

    args = parser.parse_args()
    dataset_path = args.dataset
    methods = args.methods

    best_features = autoFeatureSelector(dataset_path, methods)
    print("Best features selected:", best_features)
"""

# Save the script content to a Python file
with open("feature_selector.py", "w", encoding="utf-8") as file:
    file.write(script_content)

print("Python script 'feature_selector.py' has been created.")


Python script 'feature_selector.py' has been created.
