In [1]:
import sys, os
# Put the parent directory at the front of the import path
# (presumably so the local `opencrimedata` package can be imported -- confirm).
sys.path.insert(0, os.path.abspath(os.pardir))

# Pyproj issues

I am trying to work out why the hashes of my "redistributed" data differ between my home PC running Windows 10 and an (old) office machine running Debian Linux.

These tests use the same code and input data, running Anaconda Python 3.6.0 on Windows 10 and Anaconda Python 3.6.1 on Ubuntu in a virtual machine.

- Same version of `pyproj`
- The hashes below confirm that the same data is loaded
- However, we see that the projected points are different.
- After saving and reloading, we see that this is because of tiny differences (of the order of 1e-9) in the projected coordinates.
- Also replicated on a native Ubuntu install of Python 3.5 (same version of `pyproj`).

In [2]:
import os, bz2, hashlib, io, pickle
import numpy as np
import opencrimedata.san_francisco as sf
import pyproj
# Display the installed pyproj version so it can be compared across machines.
pyproj.__version__

'1.9.5.1'

In [5]:
# Directory holding the downloaded input file.  The commented-out lines are
# presumably the equivalent locations on the other machines used for this
# comparison -- enable whichever matches the current machine.
#datadir = os.path.join("/media", "disk", "Data")
#datadir = os.path.join("..", "..", "..", "..", "Data")
#datadir = os.path.join("e:\\", "Data")
datadir = os.path.join("..", "..", "Downloads")
def gen():
    """Yield every row that `sf.load` produces from the incidents file.

    The bz2-compressed CSV is looked up inside `datadir` and opened as
    UTF-8 text; it stays open for as long as the generator is being consumed.
    """
    csv_path = os.path.join(datadir, "SF_Police_Department_Incidents.csv.bz2")
    with bz2.open(csv_path, mode="rt", encoding="UTF8") as text_stream:
        yield from sf.load(text_stream)

# Peek at the first parsed row as a quick check that the data loads.
next(gen())

Row(category='NON-CRIMINAL', description='LOST PROPERTY', datetime=datetime.datetime(2015, 1, 19, 14, 0), block='18TH ST / VALENCIA ST', point=(-122.42158168137, 37.7617007179518), idd='15006027571000', incident='150060275')

In [6]:
# Projector supplied by opencrimedata's San Francisco module; it is called
# below on the loaded points to obtain the projected coordinates.
# NOTE(review): presumably wraps a pyproj projection -- confirm in opencrimedata.
proj = sf.projector()

In [7]:
# Only the first 10000 rows are sampled; zip() stops once the range runs out.
first_rows = zip(gen(), range(10000))
points = np.asarray([row.point for row, _ in first_rows])
# The two columns of `points` are handed to the projector as separate arguments.
xcs, ycs = proj(*points.T)
# Keep the stack-then-transpose form: hash_array() hashes np.save output, and
# np.save records the array's memory order, so a different layout would change
# the hash even for identical values.
ppoints = np.asarray([xcs, ycs]).T

In [8]:
def hash_array(ar):
    """Return the SHA-256 hex digest of `ar` serialised in NumPy's ``.npy`` format.

    The array is written with ``np.save`` (pickling disallowed) into an
    in-memory buffer, and the digest is taken over every byte written, so it
    covers the ``.npy`` header as well as the element data.
    """
    with io.BytesIO() as buffer:
        np.save(buffer, ar, allow_pickle=False)
        return hashlib.sha256(buffer.getvalue()).hexdigest()

In [15]:
# Hash of the raw (unprojected) input points from the Windows run.
hash_array(points)

'39a2e5dc8eeb2e5d3fab8aaad5ae8fdfb52fdbb5f2de45b1d8584ca5a095eef2'

In [16]:
# Hash of the projected points from the Windows run.
hash_array(ppoints)

'3faf28b8ac71960285499e82f35e2ab3ccadd1278689659fea351589f37a2b17'

In [None]:
# Persist the projected points from the Windows run for the later comparison.
with bz2.open("sf_proj_win.pic.bz2", mode="wb") as pickle_file:
    pickle.dump(ppoints, pickle_file)

# And on Linux

In [9]:
# Hash of the raw input points from the Linux run; it equals the Windows value,
# so both machines loaded the same data.
hash_array(points)

'39a2e5dc8eeb2e5d3fab8aaad5ae8fdfb52fdbb5f2de45b1d8584ca5a095eef2'

In [10]:
# Hash of the projected points from the Linux run; it differs from the Windows value.
hash_array(ppoints)

'ac46917d55a7defb0db5986536d188810bd48541d954d9a685d7b0ec5fa2edbe'

In [11]:
# Persist the projected points from the Linux run for the later comparison.
with bz2.open("sf_proj_linux.pic.bz2", mode="wb") as pickle_file:
    pickle.dump(ppoints, pickle_file)

# Compare

In [3]:
# Reload the projected points saved on each platform.  These pickles were
# written by this notebook; never unpickle files from an untrusted source.
with bz2.open("sf_proj_win.pic.bz2", mode="rb") as win_file:
    win_pp = pickle.load(win_file)
with bz2.open("sf_proj_linux.pic.bz2", mode="rb") as linux_file:
    linux_pp = pickle.load(linux_file)

In [5]:
# Element-wise difference between the two platforms' projected coordinates;
# any non-zero entry marks a point where the projections disagree.
win_pp - linux_pp

array([[  0.00000000e+00,   1.39698386e-09],
       [  0.00000000e+00,   1.39698386e-09],
       [  0.00000000e+00,   1.39698386e-09],
       ..., 
       [  0.00000000e+00,   0.00000000e+00],
       [  0.00000000e+00,   0.00000000e+00],
       [  0.00000000e+00,   0.00000000e+00]])