In [1]:
import numpy as np
import pandas as pd
import sklearn as sk
from scipy import sparse
from itertools import product
import warnings
warnings.filterwarnings('ignore')

# Read CSV

In [2]:
df=pd.read_csv('zoo.csv')

# Zad 1

In [3]:
def freq(x, prob=True):
    if sparse.issparse(x):  
        x=x.transpose()
        index = pd.Index(np.unique(x.toarray()),name='col1')
        df=pd.DataFrame.sparse.from_spmatrix(x)
    else:
        #input is Series
        df=pd.DataFrame(x)
        index = pd.Index(np.unique(x),name='col1')
        
    
    tmp=pd.Series(index=index,dtype=np.float64)
    
    for val in index:
        tmp[val]=sum(df.iloc[:,0]==val)

    if prob:
        tmp=tmp/sum(tmp)
    
    tmp.name='freq2'
    return tmp


#freq(df['feathers'])
#freq(sparse.coo_array(df['feathers']))

# Zad 2

In [4]:
def freq2(x, y, prob=True):
    if sparse.issparse(x) and sparse.issparse(y):  
        x=x.transpose()
        y=y.transpose()
        data=sparse.hstack([x,y])
        df=pd.DataFrame.sparse.from_spmatrix(data=data,columns=['col1','col2'])
        x=pd.DataFrame.sparse.from_spmatrix(x)
        y=pd.DataFrame.sparse.from_spmatrix(y)
    else:
        #input is Series
        df=pd.DataFrame({'col1':x,'col2':y})
    
    index=[np.unique(x),np.unique(y)]
    index = pd.MultiIndex.from_product(index,names=['col1','col2'])
    tmp=pd.Series(index=index,dtype=np.float64)
    
    for val in index:
        tmp[val]=sum(df.eq(val).all(axis=1))

    if prob:
        tmp=tmp/sum(tmp)
    
    tmp.name='freq2'
    return tmp

freq2(df['feathers'],df['legs'])
freq2(sparse.coo_array(df['feathers']),sparse.coo_array(df['legs']))


col1  col2
0     0       0.227723
      2       0.069307
      4       0.376238
      5       0.009901
      6       0.099010
      8       0.019802
1     0       0.000000
      2       0.198020
      4       0.000000
      5       0.000000
      6       0.000000
      8       0.000000
Name: freq2, dtype: float64

# Zad 3

In [5]:
def entropy(p):
    return np.sum(p*np.log2(1/p))


def infogain(x,y):
    xp=freq(x)
    yp=freq(y)
    xyp=freq2(x,y)
    
    return entropy(xp) + entropy(yp) - entropy(xyp)

    
import scipy.stats._entropy

#TEST
print('Numpy entropy:\t',scipy.stats._entropy.entropy(freq(df['feathers']),base=2))
print('My entropy:\t',entropy( freq(df['feathers'])) )

print('Infogain:\t',infogain(df['feathers'],df['feathers']))

Numpy entropy:	 0.7179499765002912
My entropy:	 0.7179499765002912
Infogain:	 0.7179499765002912


# Zad 4

In [6]:
indexes=pd.MultiIndex.from_product([df.columns[1:-1],['type']])

infos=pd.Series(index=indexes,dtype=np.float64,name='info gains')

for cols in indexes:
    infos.loc[cols]=infogain(df[cols[0]],df[cols[1]])

infos.sort_values(ascending=False,inplace=True)
infos

    

legs      type    1.363047
milk      type    0.974320
toothed   type    0.865694
eggs      type    0.830138
hair      type    0.790675
feathers  type    0.717950
backbone  type    0.676163
breathes  type    0.614494
tail      type    0.500460
airborne  type    0.469703
fins      type    0.466614
aquatic   type    0.389487
catsize   type    0.308490
venomous  type    0.133090
predator  type    0.093447
domestic  type    0.050669
Name: info gains, dtype: float64

# Zad 5

In [7]:
from sklearn.datasets import fetch_rcv1
rcv1=fetch_rcv1(subset='train')

### maska wierszy

In [8]:
atrybut='GSPO'
tindex=(rcv1.target_names==atrybut).nonzero()[0]
rows=rcv1.target[:,tindex].nonzero()[0]

### binaryzacja

In [9]:
data_bin=rcv1.data
mask=data_bin.nonzero()
data_bin[mask[0],mask[1]]=1

### Infogain

In [10]:

reference=8

indexes_to_calc=list(product( np.arange(data_bin.shape[0]), [reference] ))

infos=pd.Series(index=pd.MultiIndex.from_tuples(indexes_to_calc),dtype=np.float64,name='info gains')
to_calc=data_bin[rows,:]


for index in indexes_to_calc[:10]:
    x=to_calc[:,index[0]].transpose()
    y=to_calc[:,index[1]].transpose()
    i=infogain(x,y)
    infos.loc[index]=i
    

In [11]:
infos.sort_values(ascending=False,inplace=True)
#infos.to_csv('infos.csv')

### Porownanie

In [14]:
from time import time
import sys

sample1=data_bin[0,:]
sample2=data_bin[0,:]

pd_object1=pd.Series(sample1.toarray().squeeze())
pd_object2=pd.Series(sample2.toarray().squeeze())

sp_object1=sample1
sp_object2=sample2

t=time()
i1=infogain(pd_object1,pd_object2)
t1=time()-t

t=time()
i2=infogain(sp_object1,sp_object2)
t2=time()-t

print(f'pd size:\t{sys.getsizeof(pd_object1)+sys.getsizeof(pd_object2)} Bytes,\ttime:\t{t1},\tvalue:\t{i1}')
print(f'sp size:\t{sys.getsizeof(sp_object1)+sys.getsizeof(sp_object2)} Bytes,\ttime:\t{t2},\tvalue:\t{i2}')


pd size:	756064 Bytes,	time:	0.04800057411193848,	value:	0.02880500297011144
sp size:	96 Bytes,	time:	1.0898172855377197,	value:	0.02880500297011144
