In [9]:
# Feature Importance with Extra Trees Classifier
from pandas import read_csv
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.cluster import KMeans
from collections import Counter


In [11]:
# Load the Pima Indians diabetes data set.
# Column names are supplied explicitly via `names=`, so every line of the
# CSV is read as a data row.
url = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.csv"
column_names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']

df = read_csv(url, names=column_names)
df.head(5)

Unnamed: 0,preg,plas,pres,skin,test,mass,pedi,age,class
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [16]:
# Dimensions of the loaded DataFrame as a (rows, columns) tuple

df.shape 

(768, 9)

In [12]:
X = df.drop(columns="class")  # independent variables (every column except the label)
Y = df["class"]  # dependent variable (the label column)

# Feature extraction: fit an extra-trees ensemble and read its per-feature
# importances. The estimator is randomized, so the seed is pinned; without it
# the importances (and any column ranking derived from them later in the
# notebook) change on every run.
model = ExtraTreesClassifier(n_estimators=10, random_state=0)
model.fit(X, Y)
print(model.feature_importances_)

[0.12012035 0.24311108 0.10067102 0.07559913 0.07379978 0.13512803
 0.12949831 0.1220723 ]


## A. Simulating data 

In [32]:
def clusterBasedOnSimulatingdata(N,P,S,K):
    """Fit K-means on the first ``N`` rows and first ``S`` columns of ``X``.

    Reads the notebook-level DataFrame ``X``; it is not passed in.

    Parameters
    ----------
    N : int
        Number of leading rows of ``X`` to keep.
    P : int
        Not used by this function; kept so existing calls keep working.
    S : int
        Number of leading columns of ``X`` to keep.
    K : int
        Number of clusters to fit.

    Returns
    -------
    KMeans
        The estimator fitted on the selected subset.
    """
    # Positional slice: rows [0, N) and columns [0, S).
    X_new = X.iloc[:N, :S]
    print(X_new)

    # `n_jobs` is deliberately not passed: KMeans deprecated it in
    # scikit-learn 0.23 and removed it in 1.0, where it raises TypeError.
    # The seed is pinned so the clustering is the same on every run.
    kmeans = KMeans(n_clusters=K, random_state=0).fit(X_new)

    return kmeans
    

In [33]:
# Cluster the first 300 rows on the first 4 columns into 4 groups
# (P is accepted by the function but not used by it).
kmeans_sim = clusterBasedOnSimulatingdata(N=300, P=8, S=4, K=4)

     preg  plas  pres  skin
0       6   148    72    35
1       1    85    66    29
2       8   183    64     0
3       1    89    66    23
4       0   137    40    35
..    ...   ...   ...   ...
295     6   151    62    31
296     2   146    70    38
297     0   126    84    29
298    14   100    78    25
299     8   112    72     0

[300 rows x 4 columns]


## B. Scoring column in data 

In [13]:
# Take the feature names from the columns the model was fitted on, so the
# name -> importance pairing cannot drift from the data. A hand-written list
# that got out of sync would be silently truncated or mislabelled by zip().
keys = list(X.columns)

# Map each feature name to its importance score from the fitted model.
col_dict = dict(zip(keys, model.feature_importances_))
col_dict

{'preg': 0.12012034785660351,
 'plas': 0.24311108251692845,
 'pres': 0.1006710203313006,
 'skin': 0.0755991284630031,
 'test': 0.07379978497981339,
 'mass': 0.13512802775246438,
 'pedi': 0.12949830935611545,
 'age': 0.12207229874377126}

In [27]:
def clusterBasedOnScore(N,P,S,K):
    """Fit K-means on the ``S`` highest-scoring columns of ``X``, first ``N`` rows.

    Reads the notebook-level ``col_dict`` (column name -> importance score)
    and the notebook-level DataFrame ``X``; neither is passed in.

    Parameters
    ----------
    N : int
        Number of leading rows of ``X`` to keep.
    P : int
        Not used by this function; kept so existing calls keep working.
    S : int
        Number of top-scoring columns to select.
    K : int
        Number of clusters to fit.

    Returns
    -------
    KMeans
        The estimator fitted on the selected subset.
    """
    # most_common(S) yields the S (name, score) pairs with the largest scores.
    top_scored = Counter(col_dict).most_common(S)
    col_names = [name for name, _score in top_scored]

    # Select the chosen columns, then keep rows [0, N) by position.
    X_new = X[col_names].iloc[:N]
    print(X_new)

    # `n_jobs` is deliberately not passed: KMeans deprecated it in
    # scikit-learn 0.23 and removed it in 1.0, where it raises TypeError.
    # The seed is pinned so the clustering is the same on every run.
    kmeans = KMeans(n_clusters=K, random_state=0).fit(X_new)

    return kmeans
    

In [28]:
# Cluster the first 500 rows on the 3 highest-scoring columns into 3 groups
# (P is accepted by the function but not used by it).
kmeans_final = clusterBasedOnScore(N=500, P=8, S=3, K=3)

     plas  mass   pedi
0     148  33.6  0.627
1      85  26.6  0.351
2     183  23.3  0.672
3      89  28.1  0.167
4     137  43.1  2.288
..    ...   ...    ...
495   166  26.6  0.304
496   110  26.0  0.292
497    81  30.1  0.547
498   195  25.1  0.163
499   154  29.3  0.839

[500 rows x 3 columns]


## Steps performed

1. We performed K-means clustering using two approaches.
2. In the first approach, we simply select the first N rows and the first S columns of the data.
3. In the second approach, we score each column by its feature importance and cluster on the S highest-scoring columns (again using the first N rows).