In [109]:
import numpy as np
import pandas as pd
import sklearn as sk
from scipy import sparse
from itertools import product

# Read CSV

In [110]:
df=pd.read_csv('zoo.csv')

# Zad 1

In [111]:

def freq(x, prob=True):
    
    if sparse.issparse(x):
        #input is scipy.sparse

        index = pd.Index([0,1],name='col1')
        tmp=pd.Series(index=index,dtype=np.float64)

        for val in index:
            mask=(x==val).toarray().squeeze()
            tmp[val]=sum(mask)

        if prob:
            tmp=tmp/sum(tmp)
         
    else:
        #input is Series
        tmp=x.value_counts(normalize=prob)
    
    
    return tmp


freq(df['feathers'])
freq(sparse.coo_array(df['feathers']))

  mask=(x==val).toarray().squeeze()


col1
0    0.80198
1    0.19802
dtype: float64

# Zad 2

In [112]:
def freq2(x, y, prob=True):
    if sparse.issparse(x) and sparse.issparse(y):  
        #input is scipy.sparse
        
        index = pd.MultiIndex.from_product([np.unique(np.append(x.data,0)),np.unique(np.append(y.data,0))],names=['col1','col2'])
        tmp=pd.Series(index=index,dtype=np.float64)

        for val in index:
            mask=np.logical_and((x==val[0]).toarray(),(y==val[1]).toarray()).squeeze()
            tmp[val]=sum(mask)

        if prob:
            tmp=tmp/sum(tmp)
            
        return tmp
        
    else:
        #input is Series
        
        df=pd.DataFrame({'col1':x,'col2':y})
        
        index = pd.MultiIndex.from_product([np.unique(x),np.unique(y)],names=['col1','col2'])
        tmp=pd.Series(index=index,dtype=np.float64)
        
        for val in index:
            tmp[val]=sum(df.eq(val).all(axis=1))

        if prob:
            tmp=tmp/sum(tmp)
    
    
        
    return tmp

freq2(df['feathers'],df['legs'])
freq2(sparse.coo_array(df['feathers']),sparse.coo_array(df['legs']))


  mask=np.logical_and((x==val[0]).toarray(),(y==val[1]).toarray()).squeeze()


col1  col2
0     0       0.227723
      2       0.069307
      4       0.376238
      5       0.009901
      6       0.099010
      8       0.019802
1     0       0.000000
      2       0.198020
      4       0.000000
      5       0.000000
      6       0.000000
      8       0.000000
dtype: float64

# Zad 3

In [113]:
def entropy(p):
    return np.sum(p*np.log2(1/p))


def infogain(x,y):
    xp=freq(x)
    yp=freq(y)
    xyp=freq2(x,y)
    
    return entropy(xp) + entropy(yp) - entropy(xyp)

    
import scipy.stats._entropy

#TEST
print('Numpy entropy:\t',scipy.stats._entropy.entropy(freq(df['feathers']),base=2))
print('My entropy:\t',entropy( freq(df['feathers'])) )

print('Infogain:\t',infogain(df['feathers'],df['tail']))

Numpy entropy:	 0.7179499765002912
My entropy:	 0.7179499765002912
Infogain:	 0.09668209882120649


# Zad 4

In [114]:
indexes=pd.MultiIndex.from_product([df.columns[1:-1],['type']])

infos=pd.Series(index=indexes,dtype=np.float64,name='info gains')

for cols in indexes:
    infos.loc[cols]=infogain(df[cols[0]],df[cols[1]])

infos.sort_values(ascending=False,inplace=True)
infos

    

legs      type    1.363047
milk      type    0.974320
toothed   type    0.865694
eggs      type    0.830138
hair      type    0.790675
feathers  type    0.717950
backbone  type    0.676163
breathes  type    0.614494
tail      type    0.500460
airborne  type    0.469703
fins      type    0.466614
aquatic   type    0.389487
catsize   type    0.308490
venomous  type    0.133090
predator  type    0.093447
domestic  type    0.050669
Name: info gains, dtype: float64

# Zad 5

In [115]:
from sklearn.datasets import fetch_rcv1
rcv1=fetch_rcv1(subset='train')

### maska wierszy

In [116]:
atrybut='GSPO'
tindex=(rcv1.target_names==atrybut).nonzero()[0]
rows=rcv1.target[:,tindex].nonzero()[0]

### binaryzacja

In [117]:
data_bin=rcv1.data
mask=data_bin.nonzero()
data_bin[mask[0],mask[1]]=1

### Infogain

In [118]:
reference=0

indexes_to_calc=list(product( np.arange(data_bin.shape[0]), [reference] ))

infos=pd.Series(index=pd.MultiIndex.from_tuples(indexes_to_calc),dtype=np.float64,name='info gains')

for index in indexes_to_calc[:10]:
    infos.loc[index]=infogain(data_bin[index[0],:],data_bin[index[1],:])
    

  xp=freq(x)
  yp=freq(y)
  xyp=freq2(x,y)


In [119]:
print(infos.sort_values(ascending=False))

0      0    0.028805
6      0    0.005132
1      0    0.002982
5      0    0.002646
7      0    0.002340
              ...   
23144  0         NaN
23145  0         NaN
23146  0         NaN
23147  0         NaN
23148  0         NaN
Name: info gains, Length: 23149, dtype: float64


### Porownanie

In [120]:
from time import time
import sys

sample1=data_bin[0,:]
sample2=data_bin[0,:]

pd_object1=pd.Series(sample1.toarray().squeeze())
pd_object2=pd.Series(sample2.toarray().squeeze())

sp_object1=sample1
sp_object2=sample2

t=time()
i1=infogain(pd_object1,pd_object2)
t1=time()-t

t=time()
i2=infogain(sp_object1,sp_object2)
t2=time()-t

print(f'pd size:\t{sys.getsizeof(pd_object1)+sys.getsizeof(pd_object2)} Bytes,\ttime:\t{t1},\tvalue:\t{i1}')
print(f'sp size:\t{sys.getsizeof(sp_object1)+sys.getsizeof(sp_object2)} Bytes,\ttime:\t{t2},\tvalue:\t{i2}')


  xp=freq(x)
  yp=freq(y)
  xyp=freq2(x,y)


pd size:	756064 Bytes,	time:	0.019976377487182617,	value:	0.02880500297011144
sp size:	96 Bytes,	time:	0.7714474201202393,	value:	0.02880500297011144
