## LIBs

In [5]:
import pandas as pd 
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

from sklearn.tree import DecisionTreeClassifier

## DATA

In [7]:
from sklearn.datasets import load_breast_cancer

In [9]:
cancer_data = load_breast_cancer()
cancer_data

{'data': array([[1.799e+01, 1.038e+01, 1.228e+02, ..., 2.654e-01, 4.601e-01,
         1.189e-01],
        [2.057e+01, 1.777e+01, 1.329e+02, ..., 1.860e-01, 2.750e-01,
         8.902e-02],
        [1.969e+01, 2.125e+01, 1.300e+02, ..., 2.430e-01, 3.613e-01,
         8.758e-02],
        ...,
        [1.660e+01, 2.808e+01, 1.083e+02, ..., 1.418e-01, 2.218e-01,
         7.820e-02],
        [2.060e+01, 2.933e+01, 1.401e+02, ..., 2.650e-01, 4.087e-01,
         1.240e-01],
        [7.760e+00, 2.454e+01, 4.792e+01, ..., 0.000e+00, 2.871e-01,
         7.039e-02]]),
 'target': array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
        0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0,
        1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0,
        1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1,
        1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0

In [11]:
type(cancer_data)

sklearn.utils._bunch.Bunch

In [17]:
cancer_data['feature_names']

array(['mean radius', 'mean texture', 'mean perimeter', 'mean area',
       'mean smoothness', 'mean compactness', 'mean concavity',
       'mean concave points', 'mean symmetry', 'mean fractal dimension',
       'radius error', 'texture error', 'perimeter error', 'area error',
       'smoothness error', 'compactness error', 'concavity error',
       'concave points error', 'symmetry error',
       'fractal dimension error', 'worst radius', 'worst texture',
       'worst perimeter', 'worst area', 'worst smoothness',
       'worst compactness', 'worst concavity', 'worst concave points',
       'worst symmetry', 'worst fractal dimension'], dtype='<U23')

In [19]:
x = pd.DataFrame(cancer_data['data'], columns = cancer_data['feature_names'])

In [21]:
x.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [27]:
x.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 30 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   mean radius              569 non-null    float64
 1   mean texture             569 non-null    float64
 2   mean perimeter           569 non-null    float64
 3   mean area                569 non-null    float64
 4   mean smoothness          569 non-null    float64
 5   mean compactness         569 non-null    float64
 6   mean concavity           569 non-null    float64
 7   mean concave points      569 non-null    float64
 8   mean symmetry            569 non-null    float64
 9   mean fractal dimension   569 non-null    float64
 10  radius error             569 non-null    float64
 11  texture error            569 non-null    float64
 12  perimeter error          569 non-null    float64
 13  area error               569 non-null    float64
 14  smoothness error         5

In [23]:
y = cancer_data['target']
y

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0,
       1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0,
       1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1,
       1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0,
       0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1,
       1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0,
       0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0,
       1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1,
       1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0,

In [25]:
x_train , x_test , y_train,y_test = train_test_split(x,y,
                                                    test_size = 0.2,
                                                    random_state = 0)

## Modeling -- Normal

In [32]:
dt = DecisionTreeClassifier()
dt.fit(x_train,y_train)
y_pred = dt.predict(x_test)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.86      0.91      0.89        47
           1       0.94      0.90      0.92        67

    accuracy                           0.90       114
   macro avg       0.90      0.91      0.90       114
weighted avg       0.91      0.90      0.90       114



## Dimension Reduction -- PCA

In [37]:
from sklearn.decomposition import PCA

In [91]:
pca = PCA(n_components = 10)

In [93]:
x_train_pca = pca.fit_transform(x_train)
x_test_pca = pca.transform(x_test)

In [95]:
x_train.shape

(455, 30)

In [97]:
x_train_pca.shape

(455, 10)

In [99]:
x_train_pca

array([[-6.04753682e+02, -3.88235725e+01,  1.27036258e+01, ...,
        -5.35811897e-01,  1.58740280e-01,  1.66681516e-01],
       [-4.88463997e+02, -5.10471286e+01,  9.05491820e+00, ...,
         8.51387188e-02, -2.50357093e-01,  7.50227379e-02],
       [ 1.31695999e+02,  9.07258194e+01, -2.87732561e+01, ...,
         2.46682158e-01, -1.18298990e-01,  7.43518748e-02],
       ...,
       [-5.72905390e+02, -9.48487001e+01,  2.61978133e+01, ...,
        -2.36640162e-01, -7.48525341e-01, -2.60257844e-01],
       [-6.85294859e+02, -1.68240161e+01,  2.22812726e+01, ...,
        -2.51499394e+00,  3.15282155e-01,  2.47090317e+00],
       [-4.79252970e+02, -5.75300937e+00,  3.78706109e+00, ...,
        -8.97344174e-01,  3.58874875e-01,  3.67488569e-01]])

In [101]:
pca.explained_variance_ratio_.sum()

0.9999998983098962

In [105]:
dt = DecisionTreeClassifier()
dt.fit(x_train_pca,y_train)
y_pred = dt.predict(x_test_pca)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.96      0.91      0.93        47
           1       0.94      0.97      0.96        67

    accuracy                           0.95       114
   macro avg       0.95      0.94      0.95       114
weighted avg       0.95      0.95      0.95       114



## Feature Selection

### information gain

In [109]:
from sklearn.feature_selection import mutual_info_classif, mutual_info_regression

In [111]:
info_gain = mutual_info_classif(cancer_data.data,y)

In [113]:
info_gain

array([0.36789808, 0.09876649, 0.40026236, 0.36148704, 0.07858237,
       0.21411731, 0.37549666, 0.44051476, 0.06337517, 0.00883341,
       0.24911511, 0.        , 0.27840228, 0.34158723, 0.01576275,
       0.0742335 , 0.11677917, 0.12640403, 0.01528343, 0.03848493,
       0.45509905, 0.12205528, 0.47915329, 0.46364935, 0.0981962 ,
       0.22275955, 0.31633898, 0.43740072, 0.08895295, 0.06686489])

In [115]:
for feature , gain in enumerate(info_gain):
    print(f'info_gain for Feature {feature}: ',gain)

info_gain for Feature 0:  0.3678980774875371
info_gain for Feature 1:  0.09876648918726172
info_gain for Feature 2:  0.40026236306698304
info_gain for Feature 3:  0.361487036273334
info_gain for Feature 4:  0.07858236978784672
info_gain for Feature 5:  0.2141173142724906
info_gain for Feature 6:  0.37549666314864494
info_gain for Feature 7:  0.4405147627676511
info_gain for Feature 8:  0.06337516600938153
info_gain for Feature 9:  0.0088334052351271
info_gain for Feature 10:  0.2491151061498924
info_gain for Feature 11:  0.0
info_gain for Feature 12:  0.2784022768974206
info_gain for Feature 13:  0.3415872262563968
info_gain for Feature 14:  0.015762745804742018
info_gain for Feature 15:  0.07423349863672701
info_gain for Feature 16:  0.11677917023956152
info_gain for Feature 17:  0.12640403014375945
info_gain for Feature 18:  0.015283430031240997
info_gain for Feature 19:  0.03848492661269698
info_gain for Feature 20:  0.4550990491133575
info_gain for Feature 21:  0.12205528403665644


In [141]:
k = 5
top_k = np.argsort(info_gain)[-k:]
print(top_k)

[27  7 20 23 22]


In [143]:
x_train , x_test , y_train,y_test = train_test_split(x.values,y,
                                                    test_size = 0.2,
                                                    random_state = 0)

In [145]:
x_train_gain = x_train[:,top_k]
x_test_gain  =x_test[:,top_k]

In [149]:
dt = DecisionTreeClassifier()
dt.fit(x_train_gain,y_train)
y_pred = dt.predict(x_test_gain)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.96      0.96      0.96        47
           1       0.97      0.97      0.97        67

    accuracy                           0.96       114
   macro avg       0.96      0.96      0.96       114
weighted avg       0.96      0.96      0.96       114



### 2- Chi-2 Test

In [154]:
from sklearn.feature_selection import chi2 , SelectKBest

In [157]:
chi_sq = SelectKBest(chi2, k = 5)                      

In [161]:
x_train_chi = chi_sq.fit_transform(x_train,y_train)
x_test_chi = chi_sq.transform(x_test)

In [163]:
dt = DecisionTreeClassifier()
dt.fit(x_train_chi,y_train)
y_pred = dt.predict(x_test_chi)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.92      1.00      0.96        47
           1       1.00      0.94      0.97        67

    accuracy                           0.96       114
   macro avg       0.96      0.97      0.96       114
weighted avg       0.97      0.96      0.97       114



In [165]:
x_train_chi

array([[ 64.41, 310.8 ,  16.85,  71.98, 384.  ],
       [ 68.79, 359.9 ,  20.2 ,  83.69, 489.5 ],
       [104.3 , 800.  ,  21.83, 115.9 , 947.9 ],
       ...,
       [ 59.82, 278.6 ,  30.48,  75.79, 439.6 ],
       [ 60.73, 288.1 ,  21.69,  62.25, 303.8 ],
       [ 74.52, 403.5 ,  16.97,  82.28, 474.2 ]])

In [167]:
features = chi_sq.get_support(indices = True)

In [169]:
features

array([ 2,  3, 13, 22, 23], dtype=int64)

In [173]:
cancer_data['feature_names'][features]

array(['mean perimeter', 'mean area', 'area error', 'worst perimeter',
       'worst area'], dtype='<U23')

### 3 - Fisher's Score 

In [180]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis # LDA

In [182]:
lda = LinearDiscriminantAnalysis(n_components = 1)  #n_components = n_classes -1

In [186]:
x_train_lda= lda.fit_transform(x_train,y_train)
x_test_lda = lda.transform(x_test)

In [188]:
x_train_lda

array([[-2.07001845],
       [-0.62082166],
       [-0.43720889],
       [-1.61228271],
       [-0.74309534],
       [-1.89632029],
       [-0.40453781],
       [-1.83844638],
       [ 1.49977352],
       [-0.60088194],
       [ 4.02929963],
       [-0.25609167],
       [ 0.05771645],
       [-2.04316215],
       [-2.18245395],
       [-2.31857982],
       [ 1.99807578],
       [-1.63157206],
       [-1.99980507],
       [-0.22097197],
       [-0.16316866],
       [-2.92878114],
       [ 0.08925773],
       [ 0.10009211],
       [ 3.91842384],
       [-2.48307566],
       [-1.60856734],
       [-1.1841597 ],
       [ 4.07152369],
       [-1.02692867],
       [-1.64673126],
       [ 3.10448622],
       [-1.16113199],
       [ 0.48822364],
       [-1.3672863 ],
       [-1.40757071],
       [-1.43952906],
       [-1.26611178],
       [-0.69405747],
       [-0.53417834],
       [-2.59021546],
       [ 1.5209398 ],
       [-0.53716989],
       [ 1.23218288],
       [-1.78190376],
       [ 3

In [190]:
dt = DecisionTreeClassifier()
dt.fit(x_train_lda,y_train)
y_pred = dt.predict(x_test_lda)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.96      0.94      0.95        47
           1       0.96      0.97      0.96        67

    accuracy                           0.96       114
   macro avg       0.96      0.95      0.95       114
weighted avg       0.96      0.96      0.96       114



### 4 - Variance Threshold

In [193]:
from sklearn.feature_selection import VarianceThreshold

In [196]:
var = VarianceThreshold(threshold = 0.9)

In [198]:
x_train_v = var.fit_transform(x_train)
x_test_v = var.transform(x_test)

In [200]:
x_train_v

array([[ 10.05 ,  17.53 ,  64.41 , ...,  26.84 ,  71.98 , 384.   ],
       [ 10.8  ,  21.98 ,  68.79 , ...,  32.04 ,  83.69 , 489.5  ],
       [ 16.14 ,  14.86 , 104.3  , ...,  19.58 , 115.9  , 947.9  ],
       ...,
       [  9.436,  18.32 ,  59.82 , ...,  25.02 ,  75.79 , 439.6  ],
       [  9.72 ,  18.22 ,  60.73 , ...,  20.83 ,  62.25 , 303.8  ],
       [ 11.51 ,  23.93 ,  74.52 , ...,  37.16 ,  82.28 , 474.2  ]])

In [202]:
dt = DecisionTreeClassifier()
dt.fit(x_train_v,y_train)
y_pred = dt.predict(x_test_v)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.88      0.91      0.90        47
           1       0.94      0.91      0.92        67

    accuracy                           0.91       114
   macro avg       0.91      0.91      0.91       114
weighted avg       0.91      0.91      0.91       114



### 5 - MAD

In [205]:
def mad(x):
    mean = np.mean(x)
    return np.mean(np.abs(x-mean))

In [207]:
x = cancer_data.data

In [211]:
mad_val = np.apply_along_axis(mad,axis = 0,arr = x)

In [213]:
mad_val

array([2.75188776e+00, 3.38496465e+00, 1.90325780e+01, 2.63483384e+02,
       1.11609886e-02, 4.11052402e-02, 6.25558942e-02, 3.14597998e-02,
       2.11457581e-02, 5.30645353e-03, 1.89284531e-01, 4.08740215e-01,
       1.33252531e+00, 2.71965065e+01, 2.12290566e-03, 1.31384253e-02,
       1.86080085e-02, 4.52792317e-03, 5.81924415e-03, 1.66415113e-03,
       3.78985656e+00, 4.91182490e+00, 2.64654371e+01, 4.23879453e+02,
       1.79555995e-02, 1.19683744e-01, 1.64699576e-01, 5.49961343e-02,
       4.48711210e-02, 1.34096668e-02])

In [227]:
threshold = 5
s_f = np.where(mad_val >threshold)[0]

In [229]:
s_f

array([ 2,  3, 13, 22, 23], dtype=int64)