## Initialise

In [1]:
# Inject a CSS rule that sets the `.container` element to 100% width.
# The HTML object is the cell's last expression, so the notebook renders it.
from IPython.core.display import HTML
HTML("""
<style>
.container {
    width: 100%;
}
</style>
""")

In [2]:
import graphlab as gl
# Select 'ipynb' as the GraphLab Canvas target.
# NOTE(review): presumably this makes Canvas visualisations render inline in the notebook -- confirm against the GraphLab docs.
gl.canvas.set_target('ipynb')
# Project-local helper modules used by the cells below.
import tools 
import initialise

2016-03-15 01:29:52,214 [INFO] graphlab.cython.cy_server, 176: GraphLab Create v1.8.4 started. Logging: /tmp/graphlab_server_1458005381.log


This non-commercial license of GraphLab Create is assigned to kevin.mcisaac@gmail.com and will expire on November 06, 2016. For commercial licensing options, visit https://dato.com/buy/.


# Detecting constant columns

In [3]:
# Load the train and test sets through the project's initialise helper,
# with the redundant / categorical / clean options all switched off.
train_data, test_data = initialise.prepare_data(redundant=False, categorical=False, clean=False)

Loading raw data from CSV files
train: 76020    test: 75818
370 raw features
366 Columns translated to English
370 features in total


In [82]:
# Candidate: flag a column as constant when its variance is exactly zero.
# NOTE(review): names assigned inside a timeit magic are presumably not kept in
# the notebook namespace, so drop_col is likely unavailable afterwards -- confirm.
%timeit drop_col = [ col for col in train_data.column_names() if train_data[col].var() == 0]

1 loops, best of 3: 1.03 s per loop


In [5]:
%%timeit
# Candidate: flag a column as constant when the summed absolute difference
# between every element and the column's first element is zero.
drop_col = [ col for col in train_data.column_names() if (abs(train_data[col] - train_data[col][0]).sum() == 0)]
 

1 loops, best of 3: 4.21 s per loop


In [6]:
%%timeit
# Candidate: compare every element with the column's first element and reduce
# the element-wise result with all().
drop_col = [ col for col in train_data.column_names() if all(train_data[col] - train_data[col][0] == 0)]

1 loops, best of 3: 6.61 s per loop


In [81]:
# Candidate: flag a column as constant when no element differs from the
# column's first element, reduced with the column's own any() method.
%timeit drop_col = [ col for col in train_data.column_names() if not (train_data[col] - train_data[col][0] != 0).any()]

1 loops, best of 3: 4.32 s per loop


In [14]:
# Number of elements in each synthetic benchmark array.
N_ELEMENTS = 100000000

# Build the arrays with the SArray factory methods rather than materialising
# two 100-million-element Python lists first and then copying them.
# sa_const: N_ELEMENTS copies of the integer 1 (a constant column).
sa_const = gl.SArray.from_const(1, N_ELEMENTS)
# sa_rand: the increasing integers 0 .. N_ELEMENTS-1.
# NOTE: despite the name this is a deterministic sequence, not random data;
# the name is kept because later cells refer to it.
sa_rand = gl.SArray.from_sequence(N_ELEMENTS)

In [95]:
# Sanity check on sa_const: print the outcome of the variance test and of the
# "no element differs from the first" test.
print sa_const.var() == 0
print not ((sa_const - sa_const[0]) != 0).any()

# Same two tests applied to sa_rand.
print sa_rand.var() == 0
print not ((sa_rand - sa_rand[0]) != 0).any()

True
True
True
False
False
False


In [15]:
# Time how long each test takes to determine that sa_const is constant.
%timeit sa_const.var() == 0
%timeit not ((sa_const - sa_const[0]) != 0).any()
print

# Time how long each test takes to determine that sa_rand is NOT constant.
# If any() does early termination and lazy evaluation this will be much quicker.
%timeit sa_rand.var() == 0
%timeit not ((sa_rand - sa_rand[0]) != 0).any()

1 loops, best of 3: 441 ms per loop
1 loops, best of 3: 1.23 s per loop

1 loops, best of 3: 478 ms per loop
1 loops, best of 3: 1.26 s per loop


# Correlated columns

In [9]:
# Import pearsonr from the public scipy.stats namespace; scipy.stats.stats is a
# private module path and is not a supported import location.
from scipy.stats import pearsonr
import itertools
# Columns selected by the project's tools helper for the float type
# (presumably the float-valued feature columns -- verify against tools.features_type).
float_cols = tools.features_type(float(), train_data)
# Lazy iterator over every unordered pair of those columns.
pairs = itertools.combinations(float_cols, 2)

In [18]:
# Keep only the first 100 column pairs so the benchmark below stays quick.
# islice stops after 100 items, so the full set of combinations is never
# materialised just to be sliced and thrown away.
pairs = list(itertools.islice(itertools.combinations(float_cols, 2), 100))

In [19]:
%%timeit 
# Collect the column pairs whose absolute Pearson coefficient (first element of
# pearsonr's result) exceeds 0.99; each hit is stored as (col2, col1).
correlated_cols = [(col2,col1) for col1, col2, in pairs if abs(pearsonr(train_data[col1], train_data[col2])[0]) > 0.99]
# This print sits inside the timed body, so it runs on every timeit repetition
# and the same list appears several times in the output.
print correlated_cols 

[('var41 option amount ultima1', 'var39 option amount ultima1')]
[('var41 option amount ultima1', 'var39 option amount ultima1')]
[('var41 option amount ultima1', 'var39 option amount ultima1')]
[('var41 option amount ultima1', 'var39 option amount ultima1')]
1 loops, best of 3: 5.65 s per loop


In [22]:
%%timeit
train_data['var15']*train_data['var15'].sum()

100 loops, best of 3: 3.04 ms per loop


In [23]:
%%timeit
# Sum of squares of var15, squaring each element through a Python lambda
# passed to apply() and then summing the result.
train_data['var15'].apply(lambda x: x**2).sum()

The slowest run took 54.10 times longer than the fastest. This could mean that an intermediate result is being cached 
1 loops, best of 3: 74.7 ms per loop


In [43]:
def cov_sa(sa1, sa2):
    '''Population covariance of two columns via the sum-of-products shortcut.

    Evaluates (sum(x*y) - sum(x)*sum(y)/n) / n, where n is len(sa1); the
    divisor is n rather than n - 1.

    sa1, sa2 -- columns supporting len(), element-wise `*` and .sum();
                sa2 is taken to be the same length as sa1 (not checked).
    '''
    count = float(len(sa1))

    total_of_products = (sa1 * sa2).sum()
    product_of_totals = sa1.sum() * sa2.sum()

    return (total_of_products - product_of_totals / count) / count

def cov_sa2(sa1, sa2):
    '''Population covariance of two columns via mean-centred products.

    Each column has its own mean subtracted, the centred columns are
    multiplied element-wise, and the summed products are divided by
    len(sa1) (divisor n, not n - 1).
    '''
    centred1 = sa1 - sa1.mean()
    centred2 = sa2 - sa2.mean()
    count = float(len(sa1))

    return (centred1 * centred2).sum() / count

# NOTE: cov_sa2 runs slightly slower than cov_sa!

def pearsonr_sa(sa1, sa2):
    '''Pearson correlation coefficient of two columns.

    Computed as cov_sa(sa1, sa2) / (sa1.std() * sa2.std()).

    sa1, sa2 -- columns exposing .std() plus whatever cov_sa requires; they
                are assumed to have the same length (this is not checked).

    Returns a float in [-1.0, 1.0]. When either column has a standard
    deviation of exactly zero the coefficient is undefined and 0.0 is
    returned, so constant columns never register as correlated.
    '''
    std1 = sa1.std()
    std2 = sa2.std()
    if std1 == 0 or std2 == 0:
        # Always hand back a float, matching the type of the normal path.
        return 0.0

    r = cov_sa(sa1, sa2) / (std1 * std2)

    # Floating-point rounding can push |r| marginally past 1; clamp it back
    # into the valid range. Explicit comparisons leave a NaN result untouched.
    if r > 1.0:
        return 1.0
    if r < -1.0:
        return -1.0
    return r

In [67]:
# Correlation of var15 with itself: time scipy's pearsonr against pearsonr_sa,
# with a bare std() call timed alongside for reference.
%timeit   pearsonr(train_data['var15'], train_data['var15'])
%timeit   pearsonr_sa(train_data['var15'], train_data['var15'])
%timeit   train_data['var15'].std()

10 loops, best of 3: 56.8 ms per loop
100 loops, best of 3: 11.9 ms per loop
100 loops, best of 3: 2.5 ms per loop


In [70]:
# Correlation of var15 with var38: time scipy's pearsonr against pearsonr_sa,
# with a bare std() call on var38 timed alongside for reference.
%timeit   pearsonr(train_data['var15'], train_data['var38'])
%timeit   pearsonr_sa(train_data['var15'], train_data['var38'])
%timeit   train_data['var38'].std()

10 loops, best of 3: 58.1 ms per loop
100 loops, best of 3: 12 ms per loop
100 loops, best of 3: 2.18 ms per loop


In [74]:
# Print both coefficients for var15 vs var38 side by side: scipy's first, then pearsonr_sa's.
print pearsonr(train_data['var15'], train_data['var38'])[0], pearsonr_sa(train_data['var15'], train_data['var38'])

0.006496590103 0.00649659010295


In [73]:
# Print the difference between scipy's coefficient and pearsonr_sa's for var15 vs var38.
print pearsonr(train_data['var15'], train_data['var38'])[0] - pearsonr_sa(train_data['var15'], train_data['var38'])

5.39464306559e-14


In [75]:
# Same difference for var15 against itself, where the exact coefficient is 1.
print pearsonr(train_data['var15'], train_data['var15'])[0] - pearsonr_sa(train_data['var15'], train_data['var15'])

1.22124532709e-15
