In [1]:
import pandas as pd
import numpy as np
from feat import MathematicalTransformationFeature
import feat

In [2]:
df = pd.DataFrame({'one': [1,2,3,10], 'two': [2,3,4,5], 'class': [1,1,1,0], 'label': [0,0,0,1]})

In [3]:
df

Unnamed: 0,one,two,class,label
0,1,2,1,0
1,2,3,1,0
2,3,4,1,0
3,10,5,0,1


In [4]:
def test_generate():
    df = pd.DataFrame({'one': [1, 2, 3, 10], 'two': [2, 3, 4, 5], 'class': [1, 1, 1, 0], 'label': [0, 0, 0, 1]})
    pipeline = feat.AutomaticFeatureEngineeringPipeline(target_col='one')
    features = [
        {'name': 'log_feature', 'transformation_type': 'logarithmic'},
        {'name': 'sqrt_feature', 'transformation_type': 'square_root'},
        {'name': 'exp_feature', 'transformation_type': 'exponential', 'power': 2},
        {'name': 'reciprocal_feature', 'transformation_type': 'reciprocal'},
        {'name': 'boxcox_feature', 'transformation_type': 'box_cox'},
        {'name': 'power_feature', 'transformation_type': 'power'},
        {'name': 'binning_feature', 'transformation_type': 'binning', 'num_bins': 5},
        {'name': 'standardization_feature', 'transformation_type': 'standardization'},
        {'name': 'rank_feature', 'transformation_type': 'rank'},
        {'name': 'difference_feature', 'transformation_type': 'difference', 'other_feature': 'two'}
    ]

    for feature_params in features:
        feature = MathematicalTransformationFeature(**feature_params)
        pipeline.add_feature(feature)
        print(f"Added feature: {feature_params['name']}")

    processed_data = pipeline.generate_features(data=df)
    print("Processed data columns:")
    print(processed_data.columns)

    return processed_data

x = test_generate()



Added feature: log_feature
Added feature: sqrt_feature
Added feature: exp_feature
Added feature: reciprocal_feature
Added feature: boxcox_feature
Added feature: power_feature
Added feature: binning_feature
Added feature: standardization_feature
Added feature: rank_feature
Added feature: difference_feature
Processed data columns:
Index(['one', 'two', 'class', 'label', 'log_feature', 'sqrt_feature',
       'exp_feature', 'reciprocal_feature', 'boxcox_feature', 'power_feature',
       'binning_feature', 'standardization_feature', 'rank_feature',
       'difference_feature'],
      dtype='object')


In [5]:
x

Unnamed: 0,one,two,class,label,log_feature,sqrt_feature,exp_feature,reciprocal_feature,boxcox_feature,power_feature,binning_feature,standardization_feature,rank_feature,difference_feature
0,1,2,1,0,0.693147,1.0,1,1.0,0.0,1,0,-0.734847,1.0,-1
1,2,3,1,0,1.098612,1.414214,4,0.5,0.622944,4,0,-0.489898,2.0,-1
2,3,4,1,0,1.386294,1.732051,9,0.333333,0.929249,9,1,-0.244949,3.0,-1
3,10,5,0,1,2.397895,3.162278,100,0.1,1.639537,100,4,1.469694,4.0,5


In [6]:
import feat
def test_missing_value_imputation_feature():
    df = pd.DataFrame({'one': [1, np.nan, 3, 10],
                       'two': [2, 2, np.nan, 5],
                       'class': [1, 1, 0, 0],
                       'label': [0, 0, 1, 1]})

    pipeline = feat.AutomaticFeatureEngineeringPipeline(target_col='one')

    features = [
        {'name': 'one_mean', 'imputation_strategy': 'mean'},
        {'name': 'two_median', 'imputation_strategy': 'median'},
        {'name': 'two_mode', 'imputation_strategy': 'mode'},
        {'name': 'one_constant', 'imputation_strategy': 'constant'},
        {'name': 'two_forward_fill', 'imputation_strategy': 'forward_fill'},
        {'name': 'two_backward_fill', 'imputation_strategy': 'backward_fill'},
        {'name': 'two_interpolation', 'imputation_strategy': 'interpolation'},
        {'name': 'one_knn', 'imputation_strategy': 'knn', 'n_neighbors': 5},
        {'name': 'one_multiple', 'imputation_strategy': 'multiple'},
        {'name': 'one_missing_indicator', 'imputation_strategy': 'missing_indicator'},
        #{'name': 'one_custom', 'imputation_strategy': 'custom'}
    ]

    for feature_params in features:
        feature = feat.MissingValueImputationFeature(**feature_params)
        pipeline.add_feature(feature)
        print(f"Added feature: {feature_params['name']}")

    transformed_data = pipeline.generate_features(data=df)
    print("Transformed data columns:")
    print(transformed_data.columns)

    return transformed_data

x = test_missing_value_imputation_feature()

Added feature: one_mean
Added feature: two_median
Added feature: two_mode
Added feature: one_constant
Added feature: two_forward_fill
Added feature: two_backward_fill
Added feature: two_interpolation
Added feature: one_knn
Added feature: one_multiple
Added feature: one_missing_indicator
Transformed data columns:
Index(['one', 'two', 'class', 'label', 'one_mean', 'two_median', 'two_mode',
       'one_constant', 'two_forward_fill', 'two_backward_fill',
       'two_interpolation', 'one_knn', 'one_multiple',
       'one_missing_indicator'],
      dtype='object')


In [7]:
x

Unnamed: 0,one,two,class,label,one_mean,two_median,two_mode,one_constant,two_forward_fill,two_backward_fill,two_interpolation,one_knn,one_multiple,one_missing_indicator
0,1.0,2.0,1,0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0
1,,2.0,1,0,4.666667,3.0,1.0,0.0,1.0,3.0,2.0,4.666667,4.666667,1
2,3.0,,0,1,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,0
3,10.0,5.0,0,1,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,0


In [8]:
def test_knn_imputation():
    df = pd.DataFrame({'one': [1, np.nan, 3, 10],
                       'two': [2, 2, np.nan, 5],
                       'class': [1, 1, 0, 0],
                       'label': [0, 0, 1, 1]})

    pipeline = feat.AutomaticFeatureEngineeringPipeline(target_col='one')

    feature_params = {'name': 'one_knn', 'imputation_strategy': 'knn', 'n_neighbors': 2}
    feature = feat.MissingValueImputationFeature(**feature_params)
    pipeline.add_feature(feature)
    print(f"Added feature: {feature_params['name']}")

    transformed_data = pipeline.generate_features(data=df)

    return transformed_data

x = test_knn_imputation()

Added feature: one_knn


In [9]:
x

Unnamed: 0,one,two,class,label,one_knn
0,1.0,2.0,1,0,1.0
1,,2.0,1,0,2.0
2,3.0,,0,1,3.0
3,10.0,5.0,0,1,10.0


In [10]:
df = pd.DataFrame({'one': [1, 2, 3, 10], 'two': [2, np.nan, 4, 5], 'class': [1, 1, 1, 0], 'label': [0, 0, 0, 1]})

pipeline = feat.AutomaticFeatureEngineeringPipeline(target_col='one')
pipeline.add_feature(feat.MathematicalTransformationFeature(name='squared_feature', transformation_type='power', power=2))
pipeline.add_feature(feat.MissingValueImputationFeature(name='two', imputation_strategy='mean'))


processed_data = pipeline.generate_features(data=df)

In [11]:
processed_data

Unnamed: 0,one,two,class,label,squared_feature
0,1,1,1,0,1
1,2,2,1,0,4
2,3,3,1,0,9
3,10,10,0,1,100


In [13]:
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
import feat

df = pd.DataFrame({'feature1': [1, 2, 3, 4],
                   'feature2': [5, 6, 7, 8],
                   'target': [0, 1, 0, 1]})
# Split the data into train and test sets
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create an instance of AutomaticFeatureEngineeringPipeline
pipeline = feat.AutomaticFeatureEngineeringPipeline(target_col='feature1')

# Add features to the pipeline
pipeline.add_feature(feat.MissingValueImputationFeature(name='feature1', imputation_strategy='mean'))
pipeline.add_feature(feat.MathematicalTransformationFeature(name='squared', transformation_type='power', power=2))
pipeline.add_feature(feat.InteractionFeature(method = 'add', columns=['feature1', 'feature2', 'squared']))
pipeline.set_feature_selector(feat.FeatureSelection(method='univariate', k=3), not_X_col=[], y_col=['target'])

#pipeline.set_feature_selector(feat.FeatureSelection(method='pearson_correlation', correlation_threshold=0.2, X=df[['feature1','feature2']], y=df['target']))



# Generate features on the dataset
processed_data = pipeline.generate_features(data=df)

# Print the processed data
print(processed_data)

['feature1', 'feature2', 'add_feature1_feature2']
   feature1  feature2  add_feature1_feature2
0         1         5                      6
1         2         6                      8
2         3         7                     10
3         4         8                     12


  y = column_or_1d(y, warn=True)


In [1]:
import pandas as pd
from sklearn.datasets import load_breast_cancer
from feat import FeatureSelection

# Load the breast cancer dataset
data = load_breast_cancer()
X = pd.DataFrame(data.data, columns=data.feature_names)
y = data.target


# Create a pandas DataFrame
df = pd.concat([X, pd.Series(y, name="target")], axis=1)

# Instantiate the FeatureSelection class
feature_selector = FeatureSelection(method="lasso",n_features_to_select=2)

# Perform feature selection on the dataset
selected_features = feature_selector.fit_transform(df.drop("target", axis=1), df["target"])

# Print the selected features
print("Selected Features:")
print(selected_features.columns.tolist())

Selected Features:
['worst radius', 'worst area']


  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(


In [2]:
import pandas as pd
from sklearn.datasets import load_breast_cancer

# Load the breast cancer dataset
data = load_breast_cancer()
X = pd.DataFrame(data.data, columns=data.feature_names)
y = pd.Series(data.target)

# Number of features to select
n_features_to_select = 5

# k for univariate_selection
k = 5

# percentile for univariate_selection
percentile = 50

# correlation_threshold for pearson_correlation and spearman_correlation
correlation_threshold = 0.65

# Instantiate the FeatureSelection class for each method
methods = ['univariate', 'rfe', 'lasso', 'random_forest', 'pearson_correlation', 'spearman_correlation', 'box_cox']
for method in methods:
    if method == 'univariate':
        feature_selector = feat.FeatureSelection(method=method, k=k)
    elif method in ['pearson_correlation', 'spearman_correlation']:
        feature_selector = feat.FeatureSelection(method=method, correlation_threshold=correlation_threshold)
    else:
        feature_selector = feat.FeatureSelection(method=method, n_features_to_select=n_features_to_select)
    
    # Perform feature selection on the dataset
    selected_features = feature_selector.fit_transform(X, y)
    
    # Check the number of selected features
    print(f"Method: {method}")
    print(f"Expected number of selected features: {n_features_to_select if method != 'univariate' else k}")
    print(f"Actual number of selected features: {selected_features.shape[1]}")
    print("---")


NameError: name 'feat' is not defined

In [None]:
def multiply(x):
    x_ = pd.DataFrame()
    for name in x.columns.tolist():
        column_names = x.columns.tolist()
        column_names.remove(name)
        for oth_name in column_names:
            new_col_name = "{} {} {}".format(name, '*', oth_name)
            x_[new_col_name] = x[name] * x[oth_name]
    return x_