In [1]:
import numpy as np
import pandas as pd
import sklearn as sk
from scipy import sparse
from itertools import product

# Read CSV

In [2]:
df=pd.read_csv('zoo.csv')

# Zad 1

In [3]:

def freq(x, prob=True):
    
    if sparse.issparse(x):
        x=pd.DataFrame.sparse.from_spmatrix(x.transpose())
         
    tmp=x.value_counts(normalize=prob)
    
    return tmp


freq(df['feathers'])
freq(sparse.coo_array(df['feathers']))

0       0.80198
True    0.19802
dtype: float64

# Zad 2

In [4]:
def freq2(x, y, prob=True):
    if sparse.issparse(x) and sparse.issparse(y):  
        df=pd.DataFrame.sparse.from_spmatrix(sparse.hstack([x.transpose(),y.transpose()]),columns=['col1','col2'])
        x=pd.DataFrame.sparse.from_spmatrix(x.transpose())
        y=pd.DataFrame.sparse.from_spmatrix(y.transpose())
    else:
        #input is Series
        df=pd.DataFrame({'col1':x,'col2':y})
        
    index = pd.MultiIndex.from_product([np.unique(x),np.unique(y)],names=['col1','col2'])
    tmp=pd.Series(index=index,dtype=np.float64)
    
    for val in index:
        tmp[val]=sum(df.eq(val).all(axis=1))

    if prob:
        tmp=tmp/sum(tmp)
    
    
    return tmp

freq2(df['feathers'],df['legs'])
freq2(sparse.coo_array(df['feathers']),sparse.coo_array(df['legs']))


col1  col2
0     0       0.227723
      2       0.069307
      4       0.376238
      5       0.009901
      6       0.099010
      8       0.019802
1     0       0.000000
      2       0.198020
      4       0.000000
      5       0.000000
      6       0.000000
      8       0.000000
dtype: float64

# Zad 3

In [5]:
def entropy(p):
    return np.sum(p*np.log2(1/p))


def infogain(x,y):
    xp=freq(x)
    yp=freq(y)
    xyp=freq2(x,y)
    
    return entropy(xp) + entropy(yp) - entropy(xyp)

    
import scipy.stats._entropy

#TEST
print('Numpy entropy:\t',scipy.stats._entropy.entropy(freq(df['feathers']),base=2))
print('My entropy:\t',entropy( freq(df['feathers'])) )

print('Infogain:\t',infogain(df['feathers'],df['tail']))

Numpy entropy:	 0.7179499765002912
My entropy:	 0.7179499765002912
Infogain:	 0.09668209882120649


# Zad 4

In [6]:
indexes=pd.MultiIndex.from_product([df.columns[1:-1],['type']])

infos=pd.Series(index=indexes,dtype=np.float64,name='info gains')

for cols in indexes:
    infos.loc[cols]=infogain(df[cols[0]],df[cols[1]])

infos.sort_values(ascending=False,inplace=True)
infos

    

legs      type    1.363047
milk      type    0.974320
toothed   type    0.865694
eggs      type    0.830138
hair      type    0.790675
feathers  type    0.717950
backbone  type    0.676163
breathes  type    0.614494
tail      type    0.500460
airborne  type    0.469703
fins      type    0.466614
aquatic   type    0.389487
catsize   type    0.308490
venomous  type    0.133090
predator  type    0.093447
domestic  type    0.050669
Name: info gains, dtype: float64

# Zad 5

In [7]:
from sklearn.datasets import fetch_rcv1
rcv1=fetch_rcv1(subset='train')

### maska wierszy

In [8]:
atrybut='GSPO'
tindex=(rcv1.target_names==atrybut).nonzero()[0]
rows=rcv1.target[:,tindex].nonzero()[0]

### binaryzacja

In [9]:
data_bin=rcv1.data
mask=data_bin.nonzero()
data_bin[mask[0],mask[1]]=1

### Infogain

In [10]:
reference=0

indexes_to_calc=list(product( np.arange(data_bin.shape[0]), [reference] ))

infos=pd.Series(index=pd.MultiIndex.from_tuples(indexes_to_calc),dtype=np.float64,name='info gains')

for index in indexes_to_calc[:100]:
    infos.loc[index]=infogain(data_bin[index[0],:],data_bin[index[1],:])
    

In [11]:
infos.sort_values(ascending=False).head(50)

0   0    0.028805
6   0    0.005132
11  0    0.004746
63  0    0.004495
28  0    0.004058
12  0    0.003878
82  0    0.003838
49  0    0.003294
48  0    0.003294
90  0    0.003203
22  0    0.003170
1   0    0.002982
98  0    0.002961
75  0    0.002899
36  0    0.002875
88  0    0.002828
38  0    0.002809
65  0    0.002772
91  0    0.002742
52  0    0.002702
50  0    0.002702
54  0    0.002697
53  0    0.002697
5   0    0.002646
10  0    0.002599
51  0    0.002546
7   0    0.002340
33  0    0.002252
17  0    0.002121
78  0    0.002087
39  0    0.002003
86  0    0.001926
68  0    0.001862
3   0    0.001862
58  0    0.001862
2   0    0.001862
9   0    0.001852
96  0    0.001709
40  0    0.001615
67  0    0.001608
21  0    0.001606
74  0    0.001601
30  0    0.001585
32  0    0.001572
31  0    0.001572
57  0    0.001508
26  0    0.001444
84  0    0.001413
79  0    0.001380
25  0    0.001369
Name: info gains, dtype: float64

### Porownanie

In [12]:
from time import time
import sys

sample1=data_bin[0,:]
sample2=data_bin[0,:]

pd_object1=pd.Series(sample1.toarray().squeeze())
pd_object2=pd.Series(sample2.toarray().squeeze())

sp_object1=sample1
sp_object2=sample2

t=time()
i1=infogain(pd_object1,pd_object2)
t1=time()-t

t=time()
i2=infogain(sp_object1,sp_object2)
t2=time()-t

print(f'pd size:\t{sys.getsizeof(pd_object1)+sys.getsizeof(pd_object2)} Bytes,\ttime:\t{t1},\tvalue:\t{i1}')
print(f'sp size:\t{sys.getsizeof(sp_object1)+sys.getsizeof(sp_object2)} Bytes,\ttime:\t{t2},\tvalue:\t{i2}')


pd size:	756064 Bytes,	time:	0.01892232894897461,	value:	0.02880500297011144
sp size:	96 Bytes,	time:	0.024849414825439453,	value:	0.02880500297011144
