In [158]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [201]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import VarianceThreshold
from sklearn.linear_model import LogisticRegression

from sklearn.feature_selection import RFE
from sklearn.feature_selection import SequentialFeatureSelector

In [160]:
# Load the raw dataset. Missing entries are stored as the literal string "?"
# (see V1/V2 in the preview), so they are converted to NaN in a later cell.
df = pd.read_csv("Week3_GA_dataset.csv")

In [161]:
# Preview the first five rows to check the column layout and spot the "?" placeholders.
df.head()

Unnamed: 0,V1,V2,V3,V4,V5,Target
0,2.0,50.0,12500.0,98.0,NEGATIVE,YES
1,0.0,13.0,3250.0,28.0,NEGATIVE,YES
2,?,?,4000.0,35.0,NEGATIVE,YES
3,?,20.0,5000.0,45.0,NEGATIVE,YES
4,1.0,24.0,6000.0,77.0,NEGATIVE,NO


In [162]:
# Count the categories in V5; the recorded output shows a single value ("NEGATIVE") for all 748 rows.
df['V5'].value_counts()

V5
NEGATIVE    748
Name: count, dtype: int64

In [163]:
# Turn the "?" placeholders into real missing values (NaN).
df = df.replace("?", np.nan)

In [164]:
# V1 and V2 held "?" strings, so they need an explicit conversion to numeric dtype.
df = df.assign(
    V1=pd.to_numeric(df["V1"]),
    V2=pd.to_numeric(df["V2"]),
)

In [165]:
# Names of the columns that are still stored with object dtype after the conversion.
obj_col = df.select_dtypes(include=object).columns
obj_col

Index(['V5', 'Target'], dtype='object')

In [166]:
# Summarise dtype and non-null count per column to confirm the conversion and locate missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 748 entries, 0 to 747
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   V1      743 non-null    float64
 1   V2      743 non-null    float64
 2   V3      748 non-null    float64
 3   V4      748 non-null    float64
 4   V5      748 non-null    object 
 5   Target  748 non-null    object 
dtypes: float64(4), object(2)
memory usage: 35.2+ KB


In [167]:
# Number of missing values in each column.
df.isnull().sum()

V1        5
V2        5
V3        0
V4        0
V5        0
Target    0
dtype: int64

In [168]:
# Fill missing numeric entries with their column means.
# NOTE(review): this runs before the preprocessing pipeline is built, so the
# SimpleImputer step defined later receives data that has no missing values left.
df = df.fillna(df.mean(numeric_only=True))

In [169]:
# Summary statistics of the numeric columns after mean-filling (all counts should now equal the row count).
df.describe()

Unnamed: 0,V1,V2,V3,V4
count,748.0,748.0,748.0,748.0
mean,9.562584,5.464334,1378.676471,34.282086
std,8.066394,5.780316,1459.826781,24.376714
min,0.0,1.0,250.0,2.0
25%,3.0,2.0,500.0,16.0
50%,8.0,4.0,1000.0,28.0
75%,14.0,7.0,1750.0,50.0
max,74.0,50.0,12500.0,98.0


In [170]:
# Split column names by storage type: object-dtype columns are treated as
# categorical, every other column as numeric.
cat_cols = df.select_dtypes(include="object").columns
num_cols = df.select_dtypes(exclude="object").columns

In [171]:
# Categorical column names; note that this also contains the label column, Target.
cat_cols

Index(['V5', 'Target'], dtype='object')

In [172]:
# Numeric column names.
num_cols

Index(['V1', 'V2', 'V3', 'V4'], dtype='object')

In [173]:
# Inner transformer of the numeric branch: mean-impute the first two columns it
# receives and forward the next two untouched. Columns are addressed by
# position relative to whatever frame this transformer is handed.
impute_step = ColumnTransformer(
    [
        ("imputer", SimpleImputer(strategy="mean"), [0, 1]),
        ("passthrough", "passthrough", [2, 3]),
    ]
)

# Numeric branch: imputation first, then standardisation of the combined output.
num_pipeline = Pipeline(
    [
        ("imputation_layer", impute_step),
        ("scaler", StandardScaler()),
    ]
)

In [181]:
# Top-level preprocessing: column positions 0-3 go through the numeric
# pipeline, position 4 is ordinal-encoded. The outputs of both branches are
# concatenated column-wise.
preprocessor = ColumnTransformer(
    [
        ("num_branch", num_pipeline, [0, 1, 2, 3]),
        ("cat_branch", OrdinalEncoder(), [4]),
    ]
)

In [182]:
# Preprocess, then discard low-variance features (threshold 0.1).
final_pipeline = Pipeline(
    [
        ("preprocessing_and_union", preprocessor),
        ("variance_selector", VarianceThreshold(threshold=0.1)),
    ]
)

In [183]:
# The feature matrix is every column except the label.
X = df.drop("Target", axis="columns")

# Fit the whole pipeline on the features and check how many columns survive.
X_transformed = final_pipeline.fit_transform(X)
X_transformed.shape

(748, 4)

In [187]:
# Wrap the transformed array in a frame labelled with the pipeline's output
# feature names so the surviving columns can be identified.
df_new = pd.DataFrame(
    data=X_transformed,
    columns=final_pipeline.get_feature_names_out(),
)
df_new.head()

Unnamed: 0,num_branch__imputer__V1,num_branch__imputer__V2,num_branch__passthrough__V3,num_branch__passthrough__V4
0,-0.938169,7.709867,7.623346,2.615633
1,-1.186278,1.304549,1.282738,-0.257881
2,0.0,-1.537585e-16,1.796842,0.029471
3,0.0,2.516366,2.482313,0.439973
4,-1.062223,3.208833,3.167784,1.753579


In [191]:
# Label column, plus the encoder that will map its string classes to numbers.
encoder = OrdinalEncoder()
y = df.loc[:, "Target"]

In [197]:

# The encoder expects 2-D input, so the labels are reshaped to one column and
# the encoded result is flattened back to 1-D for the estimator.
y_encoded = encoder.fit_transform(y.values.reshape(-1, 1)).ravel()

# Recursive feature elimination down to two features, ranked by a
# logistic-regression model.
estimator = LogisticRegression()
rfe = RFE(estimator, n_features_to_select=2).fit(X_transformed, y_encoded)

# Map the boolean support mask back to the pipeline's output feature names.
feature_names = final_pipeline.get_feature_names_out()
top_two_features = feature_names[rfe.get_support()]

print("The two most important features are:")
print(top_two_features)

The two most important features are:
['num_branch__imputer__V1' 'num_branch__passthrough__V3']


In [200]:
# Column positions (within X_transformed) of the features RFE retained.
top_feature_indices = rfe.get_support(indices=True)
print(top_feature_indices.tolist())

[0, 2]


In [202]:
# Backward sequential selection with a logistic-regression scorer, stopping
# once two features remain.
estimator = LogisticRegression()
sfs_backward = SequentialFeatureSelector(
    estimator,
    direction="backward",
    n_features_to_select=2,
).fit(X_transformed, y_encoded)

# Positions of the retained features, shown as a plain list.
selected_indices = np.flatnonzero(sfs_backward.get_support())
selected_indices.tolist()

[2, 3]