# Feature Selection

### Recursive Feature Elimination (递归特征消除)

In [None]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression

def rfe_feature_selection(df: pd.DataFrame, target: pd.Series, n_features_to_select: int = 40) -> pd.DataFrame:
    """
    Reduce the feature set with Recursive Feature Elimination (RFE).

    An RFE selector wrapping a plain ``LinearRegression`` estimator is fitted
    on ``df`` / ``target``; the columns flagged in the selector's ``support_``
    mask are kept.

    Parameters:
    df (pd.DataFrame): The DataFrame containing the candidate features.
    target (pd.Series): The target variable (y) the selector is fitted against.
    n_features_to_select (int): The number of features passed to RFE as the
        amount to retain.

    Returns:
    pd.DataFrame: ``df`` restricted to the columns RFE marked as selected.
    """
    selector = RFE(
        estimator=LinearRegression(),
        n_features_to_select=n_features_to_select,
    )
    selector.fit(df, target)

    # support_ is a boolean mask aligned with df.columns.
    kept_columns = df.columns[selector.support_]
    return df[kept_columns]

# Example usage:
# selected_features_df = rfe_feature_selection(df_short_x_scaled, df_short_y_scaled, n_features_to_select=40)
# list(selected_features_df)

### Mutual Information (互信息法)

In [None]:
from sklearn.feature_selection import mutual_info_regression
import pandas as pd

def mutual_info_feature_selection(
    df: pd.DataFrame,
    target: pd.Series,
    n_features_to_select: int = 40,
    random_state: "int | None" = None,
) -> pd.DataFrame:
    """
    Select the most important features using mutual information (for regression tasks).

    Parameters:
    df (pd.DataFrame): The DataFrame containing the features.
    target (pd.Series): The target variable (y).
    n_features_to_select (int): The number of top-scoring features to keep.
    random_state (int | None): Forwarded to ``mutual_info_regression``. The
        estimator is stochastic, so pass a fixed value to obtain the same
        selection on every run. The default (None) leaves it unseeded.

    Returns:
    pd.DataFrame: ``df`` restricted to the selected features, with columns
        ordered from highest to lowest mutual information score.
    """
    # Score every feature against the target; random_state makes the
    # otherwise run-to-run varying scores reproducible when it is set.
    raw_scores = mutual_info_regression(df, target, random_state=random_state)

    # Label the scores with their column names and rank them, best first
    mi_scores = pd.Series(raw_scores, index=df.columns)
    mi_scores_sorted = mi_scores.sort_values(ascending=False)

    # Keep the names of the top n_features_to_select features
    selected_features = mi_scores_sorted.head(n_features_to_select).index

    return df[selected_features]

# Example usage:
# selected_features_df = mutual_info_feature_selection(df_short_x_scaled, df_short_y_scaled, n_features_to_select=40)
# list(selected_features_df)

### Dropping Based On Correlation Coefficient

根据相关系数剔除

In [None]:
import pandas as pd

def analyze_high_correlation(df: pd.DataFrame, threshold: float = 0.9) -> pd.DataFrame:
    """
    Count, for each variable, how many OTHER variables have an absolute
    correlation coefficient with it that is higher than the threshold.

    Parameters:
    df (pd.DataFrame): The DataFrame to analyze.
    threshold (float): The threshold for the absolute correlation coefficient.

    Returns:
    pd.DataFrame: A DataFrame with the columns 'Variable' and 'Count_High_Corr',
        indexed by variable name and sorted by 'Count_High_Corr' in descending
        order. Counts are never negative.
    """
    # Absolute correlation matrix: the sign of the relationship is irrelevant here
    corr_matrix = df.corr().abs()

    # Boolean matrix marking every pair whose correlation exceeds the threshold
    exceeds = corr_matrix > threshold

    # Remove the self-correlation only where it was actually counted. The
    # diagonal does not pass the test when threshold >= 1 or when a column is
    # constant (its correlations are NaN); blindly subtracting 1 would then
    # report a count of -1.
    self_hits = exceeds.to_numpy().diagonal().astype(int)
    count_high_corr = exceeds.sum(axis=0) - self_hits

    # A stable sort keeps tied variables in their original column order
    high_corr_df = pd.DataFrame({
        'Variable': count_high_corr.index,
        'Count_High_Corr': count_high_corr
    }).sort_values('Count_High_Corr', ascending=False, kind='mergesort')

    return high_corr_df

In [None]:
import pandas as pd

def drop_highly_correlated_features(df: pd.DataFrame, target_feature: str, threshold: float = 0.9) -> pd.DataFrame:
    """
    Drop features whose absolute correlation coefficient with the target feature is higher than the threshold.

    Parameters:
    df (pd.DataFrame): The DataFrame to process. It is not modified.
    target_feature (str): The name of the target feature. Other variables will be dropped based on their correlation with this feature; the target feature itself is always kept.
    threshold (float): The threshold for the absolute correlation coefficient. Variables whose correlation magnitude exceeds this threshold will be dropped.

    Returns:
    pd.DataFrame: A new DataFrame without the dropped variables.
    """
    # Correlation of every column with the target feature; the target's own
    # entry is removed so it can never be dropped.
    corr_with_target = df.corrwith(df[target_feature]).drop(target_feature)

    # Compare magnitudes: a strongly negative correlation is just as redundant
    # as a strongly positive one and must be caught as well.
    exceeds_threshold = corr_with_target.abs() > threshold
    high_corr_vars = corr_with_target[exceeds_threshold].index.tolist()

    # drop() returns a new frame, so the caller's DataFrame stays untouched
    df = df.drop(columns=high_corr_vars)

    # Display the list of dropped variables
    print(f"Variables with a correlation coefficient higher than {threshold} with '{target_feature}' have been dropped:", high_corr_vars)

    return df