## Get all imports

In [1]:
import math
import numpy as np
import pandas as pd
from collections import Counter
from PyImpetus import CPIMB
from sklearn.svm import LinearSVC, SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import StandardScaler, OneHotEncoder
import time

## Load data

In [2]:
# Load the raw file; header=None means the columns get integer labels
df = pd.read_csv("ionosphere.data", header=None)

# Dataset-specific clean-up of the target column (34): its "g" / "b" markers
# are recoded to '1' / '0'. The recoded labels remain strings, not integers.
df[34] = (
    df[34]
    .str.replace("g", '1')
    .replace("b", '0')
)
display(df.head())

# Feature frame = everything except column 34; target = column 34 as a numpy array
data = df.drop(columns=[34])
Y = df[34].values
print("Data shape: ", data.shape, "Target Variable shape: ", Y.shape)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,25,26,27,28,29,30,31,32,33,34
0,1,0,0.99539,-0.05889,0.85243,0.02306,0.83398,-0.37708,1.0,0.0376,...,-0.51171,0.41078,-0.46168,0.21266,-0.3409,0.42267,-0.54487,0.18641,-0.453,1
1,1,0,1.0,-0.18829,0.93035,-0.36156,-0.10868,-0.93597,1.0,-0.04549,...,-0.26569,-0.20468,-0.18401,-0.1904,-0.11593,-0.16626,-0.06288,-0.13738,-0.02447,0
2,1,0,1.0,-0.03365,1.0,0.00485,1.0,-0.12062,0.88965,0.01198,...,-0.4022,0.58984,-0.22145,0.431,-0.17365,0.60436,-0.2418,0.56045,-0.38238,1
3,1,0,1.0,-0.45161,1.0,1.0,0.71216,-1.0,0.0,0.0,...,0.90695,0.51613,1.0,1.0,-0.20099,0.25682,1.0,-0.32382,1.0,0
4,1,0,1.0,-0.02401,0.9414,0.06531,0.92106,-0.23255,0.77152,-0.16399,...,-0.65158,0.1329,-0.53206,0.02431,-0.62197,-0.05707,-0.59573,-0.04608,-0.65697,1


Data shape:  (351, 34) Target Variable shape:  (351,)


## Check score with decision tree using PyImpetus

In [3]:
# 5-fold shuffled cross-validation. Inside every fold the Markov-blanket
# feature selector is fitted on the training rows only and then applied to
# the held-out rows, after which a decision tree is scored on the pruned data.
kfold = KFold(n_splits=5, shuffle=True, random_state=27)
scores = []
for train_idx, test_idx in kfold.split(data):
    x_train = data.iloc[train_idx]
    x_test = data.iloc[test_idx]
    y_train = Y[train_idx]
    y_test = Y[test_idx]

    # Feature selection module, driven by a logistic-regression base model
    selector = CPIMB(
        model=LogisticRegression(random_state=27),
        p_val_thresh=0.05,
        num_simul=50,
        cv=10,
        verbose=2,
        random_state=27,
        n_jobs=-1,
    )
    # Learn the Markov blanket on the training fold and keep only those columns
    x_train = selector.fit_transform(x_train, y_train)
    # Apply the same column pruning to the held-out fold
    x_test = selector.transform(x_test)
    # The selected features are exposed through the "MB" attribute
    print("Markov Blanket: ", selector.MB)

    # Hand plain numpy arrays to sklearn
    x_train = x_train.values
    x_test = x_test.values

    tree = DecisionTreeClassifier(random_state=27)
    tree.fit(x_train, y_train)
    preds = tree.predict(x_test)
    fold_score = accuracy_score(y_test, preds)
    scores.append(fold_score)
    print("Score: ", fold_score)
print("\n\nAverage Accuracy: ", sum(scores)/len(scores))

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of  10 | elapsed:   22.7s remaining:   22.7s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:   23.9s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.


Markov Blanket:  [7, 0, 4, 6, 2]
Score:  0.9295774647887324


[Parallel(n_jobs=-1)]: Done   5 out of  10 | elapsed:   25.5s remaining:   25.5s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:   26.8s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.


Markov Blanket:  [7, 21, 6, 0, 4, 2]
Score:  0.9428571428571428


[Parallel(n_jobs=-1)]: Done   5 out of  10 | elapsed:   24.8s remaining:   24.8s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:   25.5s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.


Markov Blanket:  [7, 0, 6, 2, 4]
Score:  0.8857142857142857


[Parallel(n_jobs=-1)]: Done   5 out of  10 | elapsed:   25.5s remaining:   25.5s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:   26.7s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.


Markov Blanket:  [7, 2, 0, 4]
Score:  0.8857142857142857


[Parallel(n_jobs=-1)]: Done   5 out of  10 | elapsed:   26.0s remaining:   26.0s


Markov Blanket:  [7, 0, 6, 4, 2]
Score:  0.9428571428571428


Average Accuracy:  0.9173440643863179


[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:   27.1s finished


## Check score with decision tree without PyImpetus

In [4]:
# Baseline run: 5-fold shuffled cross-validation (seed 27) where the decision
# tree is trained on every column of `data`, with no feature selection step.
kfold = KFold(n_splits=5, shuffle=True, random_state=27)
scores = []
for train_idx, test_idx in kfold.split(data):
    x_train = data.iloc[train_idx]
    x_test = data.iloc[test_idx]
    y_train = Y[train_idx]
    y_test = Y[test_idx]

    tree = DecisionTreeClassifier(random_state=27)
    tree.fit(x_train, y_train)
    preds = tree.predict(x_test)
    fold_score = accuracy_score(y_test, preds)
    scores.append(fold_score)
    print("Score: ", fold_score)
print("\n\nAverage Accuracy: ", sum(scores)/len(scores))

Score:  0.9436619718309859
Score:  0.9142857142857143
Score:  0.8285714285714286
Score:  0.8428571428571429
Score:  0.8714285714285714


Average Accuracy:  0.8801609657947687


### Final Accuracy with PyImpetus: 0.9173440643863179
### Final Accuracy without PyImpetus: 0.8801609657947687