In [100]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
# Plot styling: use seaborn's white-grid theme for every figure in this notebook.
sns.set_theme(style="whitegrid")

In [101]:
# Load the raw dataset; the path is relative to the notebook's working directory.
df = pd.read_csv(filepath_or_buffer=r'DataPreprocessingGraded_dataset.csv')

In [102]:
# Dimensions of the raw frame as a (rows, columns) tuple.
df.shape

(748, 6)

In [103]:
# Feature frame: every column of `df` except the last one
# (presumably the `Target` column that is encoded separately later on).
new_df = df.iloc[:, 0:-1]
new_df

Unnamed: 0,V1,V2,V3,V4,V5
0,2.0,50.0,12500.0,98.0,NEGATIVE
1,0.0,13.0,3250.0,28.0,NEGATIVE
2,?,?,4000.0,35.0,NEGATIVE
3,?,20.0,5000.0,45.0,NEGATIVE
4,1.0,24.0,6000.0,77.0,NEGATIVE
...,...,...,...,...,...
743,23.0,2.0,500.0,38.0,NEGATIVE
744,21.0,2.0,500.0,52.0,NEGATIVE
745,23.0,3.0,750.0,62.0,NEGATIVE
746,39.0,1.0,250.0,39.0,NEGATIVE


In [104]:
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import StandardScaler, LabelBinarizer, OrdinalEncoder
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.compose import ColumnTransformer

In [105]:
# Column dtypes and non-null counts of the feature frame before any cleaning.
new_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 748 entries, 0 to 747
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   V1      748 non-null    object 
 1   V2      748 non-null    object 
 2   V3      748 non-null    float64
 3   V4      748 non-null    float64
 4   V5      748 non-null    object 
dtypes: float64(2), object(3)
memory usage: 29.3+ KB


In [106]:
# The dataset marks missing entries with a literal '?'; turn those into NaN
# so they are recognised as missing values downstream.
new_df = new_df.replace('?', np.nan)

In [107]:
# V1 and V2 are stored as objects at this point; cast both to float in one
# call, leaving the remaining columns untouched.
new_df = new_df.astype({'V1': 'float', 'V2': 'float'})

In [108]:
# Re-check dtypes and non-null counts after the '?' replacement and the
# float casts of V1 and V2.
new_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 748 entries, 0 to 747
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   V1      743 non-null    float64
 1   V2      743 non-null    float64
 2   V3      748 non-null    float64
 3   V4      748 non-null    float64
 4   V5      748 non-null    object 
dtypes: float64(4), object(1)
memory usage: 29.3+ KB


In [109]:
# Display the feature frame after cleaning.
new_df

Unnamed: 0,V1,V2,V3,V4,V5
0,2.0,50.0,12500.0,98.0,NEGATIVE
1,0.0,13.0,3250.0,28.0,NEGATIVE
2,,,4000.0,35.0,NEGATIVE
3,,20.0,5000.0,45.0,NEGATIVE
4,1.0,24.0,6000.0,77.0,NEGATIVE
...,...,...,...,...,...
743,23.0,2.0,500.0,38.0,NEGATIVE
744,21.0,2.0,500.0,52.0,NEGATIVE
745,23.0,3.0,750.0,62.0,NEGATIVE
746,39.0,1.0,250.0,39.0,NEGATIVE


In [110]:
# Preprocessing pipeline for the feature frame `new_df`.
# `df` was already loaded from the CSV in an earlier cell and has not been
# modified since, so it is not read again here.

# Numeric branch: keep the first four columns (positions 0-3), fill missing
# values with the column mean, then standardise.
num_pipeline = Pipeline(steps=[
    ('selector', ColumnTransformer(
        transformers=[
            ('select_first_4', 'passthrough', slice(0, 4))
        ],
        remainder='drop'
    )),
    ('imputer', SimpleImputer(strategy='mean')),
    ('std_scaler', StandardScaler())
])

# Categorical branch: ordinal-encode the column at position 4.
cat_pipeline = ColumnTransformer(
    transformers=[
        ('encode', OrdinalEncoder(), [4])
    ]
)

# Concatenate the outputs of both branches side by side.
cmp_pipeline = FeatureUnion(transformer_list=[
    ('num_pipeline', num_pipeline),
    ('cat_pipeline', cat_pipeline)
])

# NOTE(review): this selector is constructed but is not a step of `pipe`,
# so it has no effect on the transformed output. Either add it as a step
# after 'feature_union' or remove it.
vt = VarianceThreshold(threshold=0.1)

# Pipeline wrapper around the combined preprocessing union.
pipe = Pipeline([
    ('feature_union', cmp_pipeline),
])

# Fit on the feature frame only (`new_df` excludes the last CSV column);
# no target is needed for these transformers.
pipe.fit(new_df)


In [111]:
# Apply the already-fitted preprocessing pipeline to the feature frame.
transfored_df = pipe.transform(X=new_df)

In [112]:
# `cmp_pipeline` (numeric impute + scale branch unioned with the ordinal
# encoder branch) is defined in the previous pipeline cell, and `df` is
# already loaded, so neither is rebuilt or re-read here.
# Fit the union on the feature frame and keep the transformed matrix, which
# the feature-selection cells below use as their input.
transfored_df_new = cmp_pipeline.fit_transform(new_df)



In [113]:
from sklearn.preprocessing import OrdinalEncoder

In [114]:
# Encode the `Target` column as numeric codes. OrdinalEncoder expects a 2-D
# input, hence the reshape to a single-column array.
oe = OrdinalEncoder()
y = oe.fit_transform(
    df['Target'].to_numpy().reshape(-1, 1)
)

In [115]:
# The encoder returns a column vector; flatten it to 1-D for the estimators
# used below, printing the shape before and after.
print("y shape before reshaping:", y.shape)
y = y.reshape(-1)
print("y shape after reshaping:", y.shape)

y shape before reshaping: (748, 1)
y shape after reshaping: (748,)


In [116]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE

# Recursive feature elimination with a logistic-regression estimator,
# keeping the two highest-ranked features of the preprocessed matrix.
estimator = LogisticRegression()
selector = RFE(estimator=estimator, n_features_to_select=2).fit(transfored_df_new, y)

In [117]:
# Boolean mask over the columns of the preprocessed matrix: True marks a
# feature that RFE kept.
selector.support_

array([ True, False,  True, False, False])

In [120]:
from sklearn.feature_selection import SequentialFeatureSelector

In [121]:
%%time
estimator = LogisticRegression()

# Forward sequential selection of two features. Only the support mask is
# inspected, so the selector is fitted without also computing (and then
# discarding) the reduced feature matrix.
sfs = SequentialFeatureSelector(estimator, n_features_to_select=2, direction='forward')
sfs.fit(transfored_df_new, y)
print(sfs.get_support())

[False  True False  True False]
CPU times: total: 281 ms
Wall time: 289 ms


In [122]:
%%time
estimator = LogisticRegression()

# Backward sequential selection: start from all features and remove them
# until two remain. Only the support mask is inspected, so the selector is
# fitted without also computing (and then discarding) the reduced matrix.
sfs = SequentialFeatureSelector(estimator, n_features_to_select=2, direction='backward')
sfs.fit(transfored_df_new, y)
print(sfs.get_support())

[False False  True  True False]
CPU times: total: 438 ms
Wall time: 654 ms
