### Feature Selection Using LASSO

In [439]:
from sklearn import datasets
import pandas as pd

In [440]:
df = pd.read_csv(r"The Data\diabetes.csv")
df

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.340,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1


In [441]:
from sklearn.model_selection import train_test_split
xTrain, xTest, yTrain, yTest = train_test_split(df.drop("Outcome", axis=1), df["Outcome"], test_size=0.2, random_state=1)

In [442]:
xTrain.shape, xTest.shape

((614, 8), (154, 8))

In [443]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
xTrainScaled = sc.fit_transform(xTrain)
xTestScaled = sc.transform(xTest)

In [444]:
xTrainScaled = pd.DataFrame(xTrainScaled, columns=df.columns[:-1])
xTestScaled = pd.DataFrame(xTestScaled, columns=df.columns[:-1])

In [445]:
xTrainScaled

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,1.516591,0.750524,0.564756,1.652015,0.437496,0.795825,0.529526,0.567932
1,1.812018,0.244752,-0.347904,1.020973,-0.678474,1.228654,-0.069689,0.398450
2,0.925736,-0.608739,0.260536,1.273390,0.222886,0.704013,-0.794249,0.991638
3,1.221164,-0.039745,0.463350,-1.250779,-0.678474,-0.896139,-0.167519,2.601722
4,-0.551400,-0.039745,0.361943,1.084077,0.222886,1.031914,-0.760619,-0.364222
...,...,...,...,...,...,...,...,...
609,-0.551400,1.129853,0.260536,0.957869,3.098656,0.992566,-1.008254,-0.279481
610,0.925736,2.078175,-0.956345,0.831660,2.686605,0.271185,1.107339,0.059484
611,2.698300,0.149919,1.071790,-1.250779,-0.678474,1.517206,0.364436,0.737415
612,0.039454,1.572403,0.159129,-1.250779,-0.678474,1.543438,0.046486,-0.618446


In [446]:
from sklearn.linear_model import Lasso
lasso = Lasso(alpha=0.01)
lasso.fit(xTrainScaled,yTrain)

In [447]:
lasso.coef_

array([ 6.26290474e-02,  1.85968700e-01, -3.50258983e-02, -0.00000000e+00,
       -1.62558777e-04,  8.51781764e-02,  2.83810683e-02,  1.84611338e-02])

In [448]:
import plotly.express as px
import numpy as np
coef = pd.Series(np.abs(lasso.coef_), index=df.columns[:-1])
px.bar(coef.sort_values(ascending=False))

### Feature Selection using Decision Tree

In [449]:
xTrain

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
663,9,145,80,46,130,37.9,0.637,40
712,10,129,62,36,0,41.2,0.441,38
161,7,102,74,40,105,37.2,0.204,45
509,8,120,78,0,0,25.0,0.409,64
305,2,120,76,37,105,39.7,0.215,29
...,...,...,...,...,...,...,...,...
645,2,157,74,35,440,39.4,0.134,30
715,7,187,50,33,392,33.9,0.826,34
72,13,126,90,0,0,43.4,0.583,42
235,4,171,72,0,0,43.6,0.479,26


In [450]:
from sklearn.tree import DecisionTreeClassifier

dt = DecisionTreeClassifier()
dt.fit(xTrain, yTrain)

In [451]:
dt.feature_importances_

array([0.06960263, 0.2918808 , 0.11193665, 0.05101589, 0.02995902,
       0.18888446, 0.1375881 , 0.11913246])

In [452]:
px.bar(pd.Series(np.abs(dt.feature_importances_), index=df.columns[:-1]).sort_values(ascending=False))

### Feature Selection using Random Forest Classifier

In [453]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier()
rf.fit(xTrain,yTrain)

In [454]:
rf.feature_importances_

array([0.08402671, 0.25443981, 0.09331849, 0.07110002, 0.08308813,
       0.15237721, 0.12673174, 0.13491789])

In [455]:
px.bar(pd.Series(np.abs(rf.feature_importances_), index=df.columns[:-1]).sort_values(ascending=False))

### SelectFromModel

In [456]:
model = DecisionTreeClassifier()

In [457]:
from sklearn.feature_selection import SelectFromModel

# sfm = SelectFromModel(model, threshold=0.1)
sfm = SelectFromModel(model, threshold="mean")

In [458]:
sfm.fit(xTrain, yTrain)

In [459]:
sfm.get_support(indices=True)

array([1, 5, 7], dtype=int64)

In [460]:
col = sfm.feature_names_in_[sfm.get_support(indices=True)]
col

array(['Glucose', 'BMI', 'Age'], dtype=object)

In [461]:
xTrainTrans = sfm.transform(xTrain)
pd.DataFrame(xTrainTrans, columns=col)

Unnamed: 0,Glucose,BMI,Age
0,145.0,37.9,40.0
1,129.0,41.2,38.0
2,102.0,37.2,45.0
3,120.0,25.0,64.0
4,120.0,39.7,29.0
...,...,...,...
609,157.0,39.4,30.0
610,187.0,33.9,34.0
611,126.0,43.4,42.0
612,171.0,43.6,26.0


In [462]:
df = pd.read_csv(r"The Data\iris.csv")
df

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,virginica
146,6.3,2.5,5.0,1.9,virginica
147,6.5,3.0,5.2,2.0,virginica
148,6.2,3.4,5.4,2.3,virginica


In [463]:
x = df.iloc[:, :-1]
y = df.iloc[:, -1]

In [464]:
x

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2
...,...,...,...,...
145,6.7,3.0,5.2,2.3
146,6.3,2.5,5.0,1.9
147,6.5,3.0,5.2,2.0
148,6.2,3.4,5.4,2.3


In [465]:
rf = RandomForestClassifier()
rf.fit(x, y)

In [466]:
rf.feature_importances_

array([0.04824027, 0.02058543, 0.43183075, 0.49934356])

In [467]:
x.drop("sepal_width", axis=1, inplace=True)
x

Unnamed: 0,sepal_length,petal_length,petal_width
0,5.1,1.4,0.2
1,4.9,1.4,0.2
2,4.7,1.3,0.2
3,4.6,1.5,0.2
4,5.0,1.4,0.2
...,...,...,...
145,6.7,5.2,2.3
146,6.3,5.0,1.9
147,6.5,5.2,2.0
148,6.2,5.4,2.3


In [468]:
rf = RandomForestClassifier()
rf.fit(x,y)

In [469]:
rf.feature_importances_

array([0.18613791, 0.35502758, 0.45883451])

In [470]:
x.drop("sepal_length", axis=1, inplace=True)
x

Unnamed: 0,petal_length,petal_width
0,1.4,0.2
1,1.4,0.2
2,1.3,0.2
3,1.5,0.2
4,1.4,0.2
...,...,...
145,5.2,2.3
146,5.0,1.9
147,5.2,2.0
148,5.4,2.3


In [471]:
rf = RandomForestClassifier()
rf.fit(x,y)

In [472]:
rf.feature_importances_

array([0.59392762, 0.40607238])

In [473]:
x.drop("petal_width", axis=1, inplace=True)
x

Unnamed: 0,petal_length
0,1.4
1,1.4
2,1.3
3,1.5
4,1.4
...,...
145,5.2
146,5.0
147,5.2
148,5.4


### Sklearn RFE

In [474]:
x = df.drop("species", axis=1)
y = df["species"]
y

0         setosa
1         setosa
2         setosa
3         setosa
4         setosa
         ...    
145    virginica
146    virginica
147    virginica
148    virginica
149    virginica
Name: species, Length: 150, dtype: object

In [475]:
from sklearn.feature_selection import RFE

rfe = RFE(estimator=RandomForestClassifier(), n_features_to_select=1)
rfe.fit(x,y)

In [476]:
ranking = rfe.ranking_
ranking

array([3, 4, 1, 2])

In [477]:
for i, j in enumerate(x.columns):
    print(f"{j} : {ranking[i]}")

sepal_length : 3
sepal_width : 4
petal_length : 1
petal_width : 2


### Mutual Info

In [478]:
data = {
    'A': ['a1', 'a2', 'a1', 'a1', 'a2', 'a1', 'a2', 'a2'],
    'B': ['b1', 'b2', 'b2', 'b1', 'b1', 'b2', 'b2', 'b1']
}

df = pd.DataFrame(data)
df

Unnamed: 0,A,B
0,a1,b1
1,a2,b2
2,a1,b2
3,a1,b1
4,a2,b1
5,a1,b2
6,a2,b2
7,a2,b1


In [479]:
marginal_prob = pd.crosstab(df["A"], df["B"], margins=True, normalize="all")
marginal_prob

B,b1,b2,All
A,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
a1,0.25,0.25,0.5
a2,0.25,0.25,0.5
All,0.5,0.5,1.0


In [480]:
from sklearn.feature_selection import mutual_info_classif
from sklearn.datasets import load_iris

x,y = load_iris(return_X_y=True)

In [481]:
mi = mutual_info_classif(x,y)

In [482]:
for i, j in enumerate(mi):
    print(f"Mutual Information of Feature {i} is {j}")

Mutual Information of Feature 0 is 0.4963136137886379
Mutual Information of Feature 1 is 0.286784577768465
Mutual Information of Feature 2 is 0.981168085707091
Mutual Information of Feature 3 is 0.9990123467092469


In [497]:
from sklearn.feature_selection import SelectKBest
selector = SelectKBest(mutual_info_classif, k =2)
xTrans = selector.fit_transform(x, y)

print(selector.get_support(indices=True))
cols = load_iris().feature_names
cols

[2 3]


['sepal length (cm)',
 'sepal width (cm)',
 'petal length (cm)',
 'petal width (cm)']

# End