# Feature selection
---

In [1]:
import pandas as pd
import numpy as np
import scipy.sparse
import time
np.set_printoptions(suppress=True)
import operator
import time

# Freq, Freq2, Unique Dict

In [2]:
def generateUniqueDict(X):
    """Map each distinct value of ``X`` to its position in ``np.unique(X)``.

    Parameters
    ----------
    X : array-like
        Values to index; handed directly to ``np.unique``.

    Returns
    -------
    dict
        ``{value: position}`` where ``position`` is the index of ``value``
        in the sorted array of unique values.
    """
    distinct_values = np.unique(X)
    return {value: position for position, value in enumerate(distinct_values)}


In [16]:
def freq(X, prob=True):
    """Count how often each distinct value occurs in ``X``.

    Parameters
    ----------
    X : array-like
        Observations; passed to ``np.unique``.
    prob : bool, default True
        When True the counts are divided by ``np.size(X)`` so they form
        relative frequencies; when False raw counts are returned.

    Returns
    -------
    tuple of (ndarray, ndarray)
        The sorted unique values and, aligned with them, either their
        relative frequencies or their raw counts.
    """
    values, occurrences = np.unique(X, return_counts=True)
    if not prob:
        return values, occurrences
    return values, occurrences / np.size(X)
    


def freq2(X, Y, prob=True):
    """Joint frequency table (contingency table) of two aligned variables.

    Two input modes are supported:

    * Sparse: ``X`` and ``Y`` are SciPy sparse single-column matrices with
      the same number of rows. Each is treated as a binary indicator where a
      *stored* entry means 1 and an absent entry means 0 (explicitly stored
      zeros are therefore counted as 1). The result is always 2x2.
    * Dense: ``X`` and ``Y`` are equal-length sequences of labels. The result
      has one row per distinct ``X`` value and one column per distinct ``Y``
      value, both in ``np.unique`` (sorted) order.

    Parameters
    ----------
    X, Y : sparse column matrices, or array-likes of labels
        The two variables; rows of the table index ``X``, columns index ``Y``.
    prob : bool, default True
        Return joint probabilities (table divided by the number of
        observations) instead of raw counts.

    Returns
    -------
    ndarray of float
        ``table[i, j]`` is the count (or probability) of observing the i-th
        ``X`` value together with the j-th ``Y`` value. In sparse mode index 0
        means "absent" and index 1 means "present".

    Raises
    ------
    TypeError
        If ``X`` is sparse but ``Y`` is not.
    ValueError
        If sparse inputs are not single columns of equal height, or dense
        inputs differ in length.
    """
    if scipy.sparse.issparse(X):
        if not scipy.sparse.issparse(Y):
            raise TypeError("Y must be a sparse matrix when X is sparse")
        if X.shape[1] != 1 or Y.shape[1] != 1 or X.shape[0] != Y.shape[0]:
            raise ValueError(
                "sparse X and Y must be single-column matrices with the same "
                f"number of rows, got shapes {X.shape} and {Y.shape}"
            )

        # `.indices` lists row numbers only in CSC layout (in CSR it lists
        # column numbers, which would silently produce a wrong table), so
        # normalise the format first. This is a no-op for CSC inputs.
        x_rows = X.tocsc().indices
        y_rows = Y.tocsc().indices
        n_rows = X.shape[0]
        n_both = np.intersect1d(x_rows, y_rows).shape[0]

        pij = np.zeros((2, 2))
        pij[1, 1] = n_both                      # X present, Y present
        pij[0, 1] = y_rows.shape[0] - n_both    # only Y present
        pij[1, 0] = x_rows.shape[0] - n_both    # only X present
        pij[0, 0] = n_rows - np.sum(pij)        # neither present
        return pij / n_rows if prob else pij

    x_labels = np.asarray(X).ravel()
    y_labels = np.asarray(Y).ravel()
    if x_labels.shape[0] != y_labels.shape[0]:
        raise ValueError(
            f"X and Y must have the same length, got {x_labels.shape[0]} "
            f"and {y_labels.shape[0]}"
        )

    # The inverse indices are each observation's position among the sorted
    # unique values, i.e. its row/column in the table.
    x_values, x_codes = np.unique(x_labels, return_inverse=True)
    y_values, y_codes = np.unique(y_labels, return_inverse=True)
    nij = np.zeros((x_values.shape[0], y_values.shape[0]))
    # np.add.at accumulates correctly when the same cell appears many times.
    np.add.at(nij, (x_codes, y_codes), 1)
    if prob:
        return nij / np.sum(nij)
    return nij

# Entropy and information gain

In [11]:
def entropy(pij):
    """Shannon entropy, in bits, of a probability array.

    Parameters
    ----------
    pij : array-like
        Probabilities of any shape (a marginal vector or a joint table).

    Returns
    -------
    float
        ``-sum(p * log2(p))`` computed over all entries after adding a tiny
        constant (1e-16) to each so that zero probabilities do not produce
        ``log2(0)``.
    """
    # Smooth a float copy instead of using `+=`: the in-place form altered
    # the caller's table on every call (so repeated metric evaluations saw
    # progressively shifted data) and failed outright on integer arrays.
    smoothed = np.asarray(pij, dtype=float) + 1e-16
    return -np.sum(smoothed * np.log2(smoothed))

In [12]:
def infoGain(pij):
    """Information gain (mutual information, in bits) of a joint table.

    Parameters
    ----------
    pij : ndarray
        Two-dimensional joint probability table.

    Returns
    -------
    float
        Entropy of the axis-0 marginal plus entropy of the axis-1 marginal
        minus the entropy of the joint table itself.
    """
    # The marginals are evaluated before the joint table, in this order.
    column_marginal_entropy = entropy(np.sum(pij, axis=0))
    row_marginal_entropy = entropy(np.sum(pij, axis=1))
    joint_entropy = entropy(pij)
    return column_marginal_entropy + row_marginal_entropy - joint_entropy

# Kappa, Gini and Gini gain

In [13]:
def kappa(pij):
    """Information gain normalised by the entropy of one marginal.

    Parameters
    ----------
    pij : ndarray
        Two-dimensional joint probability table.

    Returns
    -------
    float
        ``infoGain(pij)`` divided by the entropy of the marginal obtained by
        summing ``pij`` over axis 0 (the distribution of the column variable).
    """
    gain = infoGain(pij)
    column_marginal_entropy = entropy(np.sum(pij, axis=0))
    return gain / column_marginal_entropy

def gini(pij):
    """Gini impurity of a probability array.

    Parameters
    ----------
    pij : array-like
        Probabilities of any shape.

    Returns
    -------
    float
        One minus the sum of the squared entries.
    """
    squared = np.square(pij)
    return 1 - np.sum(squared)

def giniGain(pij):
    """Reduction in Gini impurity of the column variable given the row variable.

    Parameters
    ----------
    pij : array-like
        Two-dimensional joint probability table; rows index the conditioning
        variable, columns index the variable whose impurity is measured.

    Returns
    -------
    float
        Gini impurity of the axis-0 marginal minus the probability-weighted
        average Gini impurity of each row's conditional distribution.
    """
    pij = np.asarray(pij, dtype=float)
    weighted_impurity = 0.0
    for row in pij:
        row_mass = np.sum(row)
        # A row with no probability mass (e.g. a feature value that never
        # occurs) contributes nothing. Dividing by its zero mass would give
        # 0/0 = NaN and poison the whole result.
        if row_mass == 0:
            continue
        weighted_impurity += gini(row / row_mass) * row_mass
    return gini(np.sum(pij, axis=0)) - weighted_impurity

# Zoo.csv


In [14]:
# Score every zoo attribute against the animal type with both criteria.
zoo_df = pd.read_csv('zoo.csv')
# Reduce the leg count to a boolean "more than two legs" feature.
zoo_df["legs"] = zoo_df["legs"] > 2
Y = zoo_df["type"]
info_gain_zoo = {}
gini_gain_zoo = {}

# columns[1:-1] drops the first and last columns
# (presumably the animal name and the "type" target -- verify against zoo.csv).
for column in zoo_df.columns[1:-1]:
    # freq2 returns only the joint probability table; unpacking it into three
    # names (as an earlier version of this cell did) raises on a fresh kernel.
    pij = freq2(zoo_df[column], Y)
    info_gain_zoo[column] = infoGain(pij)
    gini_gain_zoo[column] = giniGain(pij)
    

### Zoo - infoGain

In [15]:
# Zoo attributes ordered by information gain, highest first.
dict(sorted(info_gain_zoo.items(), key=lambda item: item[1], reverse=True))

{'milk': 0.9743197211096528,
 'toothed': 0.8656941534932092,
 'eggs': 0.830138448363325,
 'hair': 0.7906745736101515,
 'feathers': 0.7179499765002539,
 'backbone': 0.6761627418828824,
 'breathes': 0.6144940279390272,
 'legs': 0.5303795375579381,
 'tail': 0.5004604482514798,
 'airborne': 0.4697026095047505,
 'fins': 0.46661356715035707,
 'aquatic': 0.3894874837982054,
 'catsize': 0.30849034491426863,
 'venomous': 0.13308962953511028,
 'predator': 0.09344704054082831,
 'domestic': 0.050668779845500556}

### Zoo - giniGain

In [9]:
# Zoo attributes ordered by Gini gain, highest first.
dict(sorted(gini_gain_zoo.items(), key=lambda item: item[1], reverse=True))

{'milk': 0.2930006862072322,
 'eggs': 0.2623459742469322,
 'hair': 0.23785606188112696,
 'feathers': 0.20854890769604761,
 'toothed': 0.19175900466202866,
 'breathes': 0.13313505678714754,
 'backbone': 0.12283752531282988,
 'airborne': 0.11945904632895898,
 'legs': 0.10662522513171346,
 'fins': 0.09975665592185101,
 'catsize': 0.09484192296248717,
 'tail': 0.07208674865963194,
 'aquatic': 0.07126928229761087,
 'venomous': 0.02610512568343848,
 'predator': 0.011136785531450921,
 'domestic': 0.009077678521848709}







# Reuters

In [17]:
from scipy.sparse import coo_matrix
from sklearn.datasets import fetch_rcv1
from sklearn.utils import resample
from scipy.sparse import csr_matrix
from scipy.sparse import csc_matrix

In [18]:
# Fetch the RCV1 dataset through scikit-learn's dataset loader.
rcv1 = fetch_rcv1()

In [19]:
# Show the category codes that are available as targets.
rcv1.target_names

array(['C11', 'C12', 'C13', 'C14', 'C15', 'C151', 'C1511', 'C152', 'C16',
       'C17', 'C171', 'C172', 'C173', 'C174', 'C18', 'C181', 'C182',
       'C183', 'C21', 'C22', 'C23', 'C24', 'C31', 'C311', 'C312', 'C313',
       'C32', 'C33', 'C331', 'C34', 'C41', 'C411', 'C42', 'CCAT', 'E11',
       'E12', 'E121', 'E13', 'E131', 'E132', 'E14', 'E141', 'E142',
       'E143', 'E21', 'E211', 'E212', 'E31', 'E311', 'E312', 'E313',
       'E41', 'E411', 'E51', 'E511', 'E512', 'E513', 'E61', 'E71', 'ECAT',
       'G15', 'G151', 'G152', 'G153', 'G154', 'G155', 'G156', 'G157',
       'G158', 'G159', 'GCAT', 'GCRIM', 'GDEF', 'GDIP', 'GDIS', 'GENT',
       'GENV', 'GFAS', 'GHEA', 'GJOB', 'GMIL', 'GOBIT', 'GODD', 'GPOL',
       'GPRO', 'GREL', 'GSCI', 'GSPO', 'GTOUR', 'GVIO', 'GVOTE', 'GWEA',
       'GWELF', 'M11', 'M12', 'M13', 'M131', 'M132', 'M14', 'M141',
       'M142', 'M143', 'MCAT'], dtype=object)

In [20]:
# Pick the label column by category name instead of a bare positional index,
# so the choice is readable and survives any change in column order.
# 'GVIO' is the entry at position 89 of rcv1.target_names.
TARGET_CATEGORY = "GVIO"
target_index = list(rcv1.target_names).index(TARGET_CATEGORY)

X = rcv1.data
Y = rcv1.target[:, target_index]

In [21]:
# Turn the feature matrix into a boolean presence matrix
# (True wherever the stored value is positive).
X = X>0

In [22]:
# Convert both matrices to CSC: freq2's sparse branch reads `.indices` of a
# single-column slice, which holds the row numbers of stored entries only in
# this layout.
X = X.tocsc()
Y = Y.tocsc()

In [23]:
# Display the sparse matrix summary (shape, dtype, stored elements, format).
X

<804414x47236 sparse matrix of type '<class 'numpy.bool_'>'
	with 60915113 stored elements in Compressed Sparse Column format>

In [17]:
# (rows, columns) of the feature matrix.
X.shape

(804414, 47236)

In [18]:
# The label is a single sparse column with one row per row of X.
Y.shape

(804414, 1)

In [19]:
# Number of stored entries in Y, i.e. rows carrying the selected label
# (assuming no explicit zeros are stored -- TODO confirm).
Y.indices.shape[0]

32615

## Sparse matrix - validation

In [20]:
# Sanity check: raw 2x2 co-occurrence counts of the first feature and the label
# (row = feature absent/present, column = label absent/present).
freq2(X[:, 0], Y, prob=False)

array([[771720.,  32615.],
       [    79.,      0.]])

In [21]:
# The four counts should add up to the number of rows in X.
freq2(X[:, 0], Y, prob=False).sum()

804414.0

In [22]:
# Same table expressed as joint probabilities.
freq2(X[:, 0], Y, prob=True)

array([[0.95935675, 0.04054504],
       [0.00009821, 0.        ]])

In [23]:
# The joint probabilities should sum to 1.
freq2(X[:, 0], Y, prob=True).sum()

1.0

# Main loop

In [24]:
# Score every feature column of X against the label Y with all three
# criteria and time the full sweep. The printed message is Polish for
# "Time: ... seconds".
kappa_values = []
info_gain_values = []
gini_gain_values = []

n_features = X.shape[1]
t0 = time.time()
for feature_idx in range(n_features):
    pij = freq2(X[:, feature_idx], Y, True)
    # The three metrics are evaluated in this fixed order on the same table.
    kappa_values.append(kappa(pij))
    info_gain_values.append(infoGain(pij))
    gini_gain_values.append(giniGain(pij))
t1 = time.time()
elapsed = t1 - t0
print(f"Czas: {elapsed} sekund")

Czas: 40.23620367050171 sekund


## Load column names

In [25]:
# Load the vocabulary file: column 0 holds the term, column 1 a running
# number and column 2 a float value.
# keep_default_na=False stops pandas from turning vocabulary tokens that look
# like missing-value markers (e.g. "nan", "null", "na") into NaN.
# NOTE(review): later cells look terms up with names.iloc[i, 0], which assumes
# row i of this file describes feature column i of X -- confirm the file has
# one row per column of X.
names = pd.read_csv("dane.txt", sep=" ", header=None, keep_default_na=False)
names

Unnamed: 0,0,1,2
0,_,1,10.056509
1,_____,2,8.447071
2,______,3,8.447071
3,_______,4,8.447071
4,____________,5,8.447071
...,...,...,...
47231,zynax,47232,10.056509
47232,zyprex,47233,10.056509
47233,zyrech,47234,9.363362
47234,zyuganov,47235,8.447071


## Top 30 by Info Gain

In [26]:
# Feature positions of the 30 largest information-gain scores, highest first.
info_gain_order = np.argsort(np.array(info_gain_values))
info_gain_indexes = info_gain_order[::-1][:30]

In [27]:
# Print each selected term next to its information gain.
for feature_idx in info_gain_indexes:
    term = names.iloc[feature_idx, 0]
    print(term, ": ", info_gain_values[feature_idx])

rebel :  0.0485791461342458
milit :  0.046896090743684504
kill :  0.04420070014972033
troop :  0.037755524347903024
forc :  0.03523924387133781
army :  0.0349295789270756
attack :  0.033536408584291466
guerrill :  0.03350810842148294
peac :  0.032279020915216394
peopl :  0.03092751871814725
fight :  0.028632749859780315
war :  0.026442784449019596
violent :  0.025927740511501707
soldy :  0.025783267109595387
bomb :  0.023376174679855533
town :  0.02301883649201225
arm :  0.02193027303463757
told :  0.020698193188331437
govern :  0.02064174741489777
polic :  0.018967922791001346
newsroom :  0.0189620271094042
protest :  0.01820241087481822
wound :  0.017882082743843897
civil :  0.017655816651594325
mile :  0.016447133256494495
km :  0.01618650546051381
presid :  0.015999236901444225
offic :  0.015929284911100905
polit :  0.015345306369505596
zair :  0.015179078422745018


## Top 30 by Gini Gain

In [28]:
# Feature positions of the 30 largest Gini-gain scores, highest first.
gini_gain_order = np.argsort(np.array(gini_gain_values))
gini_gain_indexes = gini_gain_order[::-1][:30]

In [29]:
# Print each selected term next to its Gini gain.
for feature_idx in gini_gain_indexes:
    term = names.iloc[feature_idx, 0]
    print(term, ": ", gini_gain_values[feature_idx])

rebel :  0.016665209902267483
milit :  0.012669550571428462
guerrill :  0.012284420969846555
troop :  0.011979463224383924
kill :  0.011813403876488549
army :  0.010410380003016959
attack :  0.008827816061498281
peac :  0.008664214311680504
soldy :  0.008596449533179756
violent :  0.007888371165926625
bomb :  0.007513588523458542
fight :  0.0072102865711003855
forc :  0.006985486095982263
wound :  0.0059241618642858485
town :  0.005749718521779135
war :  0.005708207576439431
arm :  0.005376015468844614
peopl :  0.005343720889144338
zair :  0.0053162611539297605
refug :  0.004641364335312523
ceasefir :  0.004583103329156471
protest :  0.004560685827708064
rwand :  0.004477729382004539
civil :  0.004279748559459978
zairean :  0.004205323646924747
tuts :  0.004157096249955555
km :  0.004139260658117552
mile :  0.004040691032257515
terror :  0.0040029641242528
command :  0.003917827525482531


## Top 30 by Kappa

In [30]:
# Feature positions of the 30 largest kappa scores, highest first.
kappa_order = np.argsort(np.array(kappa_values))
kappa_indexes = kappa_order[::-1][:30]

In [31]:
# Print each selected term next to its kappa score.
for feature_idx in kappa_indexes:
    term = names.iloc[feature_idx, 0]
    print(term, ": ", kappa_values[feature_idx])

rebel :  0.19845587140078558
milit :  0.19158024161456785
kill :  0.18056901289489433
troop :  0.15423913512087437
forc :  0.14395960831969257
army :  0.14269456289905044
attack :  0.13700317356044645
guerrill :  0.13688756153516637
peac :  0.13186648456090494
peopl :  0.12634531822590497
fight :  0.1169707122541959
war :  0.10802425006795269
violent :  0.10592018893136156
soldy :  0.10532998516799524
bomb :  0.09549651414802009
town :  0.09403671365549487
arm :  0.08958970651973028
told :  0.08455640521677396
govern :  0.08432581254388684
polic :  0.07748789235093953
newsroom :  0.07746380727077992
protest :  0.07436061765627976
wound :  0.07305201090985759
civil :  0.0721276670693603
mile :  0.06718994511436104
km :  0.06612522659873137
presid :  0.06536019575662087
offic :  0.0650744273908809
polit :  0.06268875412212001
zair :  0.0620096785382399
