# Checksum for EEG data

In [1]:
# import libraries 
import glob
import os

import pandas as pd
import numpy as np

#import mne
import hashlib
import h5py
#import fnmatch
import warnings
import re

warnings.filterwarnings('ignore')

The following notebook demonstrates the use of a hash as a checksum on EEG data. However we do not assume the data is uncorrupted, or has proper EEG format. We just read in files with the raw  data read as a string. 

Step 1: Let's pair each of our files (by the content inside) with a hash, and throw the info in a dataframe

In [2]:
def hash_it_up_right(origin_folder1):
    hash_list = []
    file_names = []
    non_suspects1 = glob.glob(os.path.join(origin_folder1, '*.cnt'))
    BUF_SIZE = 65536 
    sha256 = hashlib.sha256()
    for file in non_suspects1:
        with open(file, 'rb') as f:
            while True:
                data = f.read(BUF_SIZE)
                if not data:
                    break
                sha256.update(data)

        result = sha256.hexdigest()
        hash_list.append(result)
        file_names.append(file)
        
    df = pd.DataFrame(hash_list, file_names)
    df.columns = ["hash"]
    df = df.reset_index() 
    df = df.rename(columns = {'index':'file_name'})
    
    return df

Step 2: compare with premade checksum list. The hashes for each file must match or the file has been altered.

Let's do an example: 

In [3]:
mother_right = hash_it_up_right('C:/Projects/mother-eeg-data/*/Data/')

In [4]:
mother_right

Unnamed: 0,file_name,hash
0,C:/Projects/mother-eeg-data\1646832910497-Earl...,0d854854def2f7aac80df64027885d28c698ac753f664f...
1,C:/Projects/mother-eeg-data\1646832910497-Earl...,b9e53cd9807eeece442018ecbd45854e0da19f13922988...
2,C:/Projects/mother-eeg-data\1646832910497-Earl...,263789094e3078e970daafa44edfa8ded6e351a2c75f7a...
3,C:/Projects/mother-eeg-data\1646833329075-Earl...,e69f34a77510079aaa4763d93c3d50ab08b4430d839a9c...
4,C:/Projects/mother-eeg-data\1646833329075-Earl...,05973fa3a972a3b22a4aabca66ef3bdb69a3952cf26b3b...
...,...,...
303,C:/Projects/mother-eeg-data\1646857503973-Earl...,04b9f2317aa321750751471650247909db4d0af625990b...
304,C:/Projects/mother-eeg-data\1646857661935-Earl...,cade53c907f6897f23caa787385a40cd3a17cca276d337...
305,C:/Projects/mother-eeg-data\1646857661935-Earl...,df70c07e3e067c2dbefc6f120f41333f2d38e4c6fa2ee3...
306,C:/Projects/mother-eeg-data\1646857661935-Earl...,448a9cbc36da934efccce6ef6d184a2f3bf8e270aafa88...


Now let's say we have a file of the known hashes called mother_df1.csv:

In [5]:
mother_df1 = pd.read_csv('mother_df1.csv')
mother_df1

Unnamed: 0.1,Unnamed: 0,file_name,hash
0,0,C:/Projects/mother-eeg-data\1646832910497-Earl...,0d854854def2f7aac80df64027885d28c698ac753f664f...
1,1,C:/Projects/mother-eeg-data\1646832910497-Earl...,b9e53cd9807eeece442018ecbd45854e0da19f13922988...
2,2,C:/Projects/mother-eeg-data\1646832910497-Earl...,263789094e3078e970daafa44edfa8ded6e351a2c75f7a...
3,3,C:/Projects/mother-eeg-data\1646833329075-Earl...,e69f34a77510079aaa4763d93c3d50ab08b4430d839a9c...
4,4,C:/Projects/mother-eeg-data\1646833329075-Earl...,05973fa3a972a3b22a4aabca66ef3bdb69a3952cf26b3b...
...,...,...,...
303,303,C:/Projects/mother-eeg-data\1646857503973-Earl...,04b9f2317aa321750751471650247909db4d0af625990b...
304,304,C:/Projects/mother-eeg-data\1646857661935-Earl...,cade53c907f6897f23caa787385a40cd3a17cca276d337...
305,305,C:/Projects/mother-eeg-data\1646857661935-Earl...,df70c07e3e067c2dbefc6f120f41333f2d38e4c6fa2ee3...
306,306,C:/Projects/mother-eeg-data\1646857661935-Earl...,448a9cbc36da934efccce6ef6d184a2f3bf8e270aafa88...


In [6]:
results = mother_right.merge(mother_df1, on='file_name', how='outer',)
results

Unnamed: 0.1,file_name,hash_x,Unnamed: 0,hash_y
0,C:/Projects/mother-eeg-data\1646832910497-Earl...,0d854854def2f7aac80df64027885d28c698ac753f664f...,0,0d854854def2f7aac80df64027885d28c698ac753f664f...
1,C:/Projects/mother-eeg-data\1646832910497-Earl...,b9e53cd9807eeece442018ecbd45854e0da19f13922988...,1,b9e53cd9807eeece442018ecbd45854e0da19f13922988...
2,C:/Projects/mother-eeg-data\1646832910497-Earl...,263789094e3078e970daafa44edfa8ded6e351a2c75f7a...,2,263789094e3078e970daafa44edfa8ded6e351a2c75f7a...
3,C:/Projects/mother-eeg-data\1646833329075-Earl...,e69f34a77510079aaa4763d93c3d50ab08b4430d839a9c...,3,e69f34a77510079aaa4763d93c3d50ab08b4430d839a9c...
4,C:/Projects/mother-eeg-data\1646833329075-Earl...,05973fa3a972a3b22a4aabca66ef3bdb69a3952cf26b3b...,4,05973fa3a972a3b22a4aabca66ef3bdb69a3952cf26b3b...
...,...,...,...,...
303,C:/Projects/mother-eeg-data\1646857503973-Earl...,04b9f2317aa321750751471650247909db4d0af625990b...,303,04b9f2317aa321750751471650247909db4d0af625990b...
304,C:/Projects/mother-eeg-data\1646857661935-Earl...,cade53c907f6897f23caa787385a40cd3a17cca276d337...,304,cade53c907f6897f23caa787385a40cd3a17cca276d337...
305,C:/Projects/mother-eeg-data\1646857661935-Earl...,df70c07e3e067c2dbefc6f120f41333f2d38e4c6fa2ee3...,305,df70c07e3e067c2dbefc6f120f41333f2d38e4c6fa2ee3...
306,C:/Projects/mother-eeg-data\1646857661935-Earl...,448a9cbc36da934efccce6ef6d184a2f3bf8e270aafa88...,306,448a9cbc36da934efccce6ef6d184a2f3bf8e270aafa88...


In [7]:

results['compare'] = np.where(results.hash_x == results.hash_y, 'True', 'False')
results


Unnamed: 0.1,file_name,hash_x,Unnamed: 0,hash_y,compare
0,C:/Projects/mother-eeg-data\1646832910497-Earl...,0d854854def2f7aac80df64027885d28c698ac753f664f...,0,0d854854def2f7aac80df64027885d28c698ac753f664f...,True
1,C:/Projects/mother-eeg-data\1646832910497-Earl...,b9e53cd9807eeece442018ecbd45854e0da19f13922988...,1,b9e53cd9807eeece442018ecbd45854e0da19f13922988...,True
2,C:/Projects/mother-eeg-data\1646832910497-Earl...,263789094e3078e970daafa44edfa8ded6e351a2c75f7a...,2,263789094e3078e970daafa44edfa8ded6e351a2c75f7a...,True
3,C:/Projects/mother-eeg-data\1646833329075-Earl...,e69f34a77510079aaa4763d93c3d50ab08b4430d839a9c...,3,e69f34a77510079aaa4763d93c3d50ab08b4430d839a9c...,True
4,C:/Projects/mother-eeg-data\1646833329075-Earl...,05973fa3a972a3b22a4aabca66ef3bdb69a3952cf26b3b...,4,05973fa3a972a3b22a4aabca66ef3bdb69a3952cf26b3b...,True
...,...,...,...,...,...
303,C:/Projects/mother-eeg-data\1646857503973-Earl...,04b9f2317aa321750751471650247909db4d0af625990b...,303,04b9f2317aa321750751471650247909db4d0af625990b...,True
304,C:/Projects/mother-eeg-data\1646857661935-Earl...,cade53c907f6897f23caa787385a40cd3a17cca276d337...,304,cade53c907f6897f23caa787385a40cd3a17cca276d337...,True
305,C:/Projects/mother-eeg-data\1646857661935-Earl...,df70c07e3e067c2dbefc6f120f41333f2d38e4c6fa2ee3...,305,df70c07e3e067c2dbefc6f120f41333f2d38e4c6fa2ee3...,True
306,C:/Projects/mother-eeg-data\1646857661935-Earl...,448a9cbc36da934efccce6ef6d184a2f3bf8e270aafa88...,306,448a9cbc36da934efccce6ef6d184a2f3bf8e270aafa88...,True


Ok, that worked out fine, but what if our check-sum table had been different?

In [8]:
altered_check_sum = mother_df1
altered_check_sum['hash'] = altered_check_sum['hash']+'1'

In [9]:
results = mother_right.merge(altered_check_sum, on='file_name', how='outer',)

results

Unnamed: 0.1,file_name,hash_x,Unnamed: 0,hash_y
0,C:/Projects/mother-eeg-data\1646832910497-Earl...,0d854854def2f7aac80df64027885d28c698ac753f664f...,0,0d854854def2f7aac80df64027885d28c698ac753f664f...
1,C:/Projects/mother-eeg-data\1646832910497-Earl...,b9e53cd9807eeece442018ecbd45854e0da19f13922988...,1,b9e53cd9807eeece442018ecbd45854e0da19f13922988...
2,C:/Projects/mother-eeg-data\1646832910497-Earl...,263789094e3078e970daafa44edfa8ded6e351a2c75f7a...,2,263789094e3078e970daafa44edfa8ded6e351a2c75f7a...
3,C:/Projects/mother-eeg-data\1646833329075-Earl...,e69f34a77510079aaa4763d93c3d50ab08b4430d839a9c...,3,e69f34a77510079aaa4763d93c3d50ab08b4430d839a9c...
4,C:/Projects/mother-eeg-data\1646833329075-Earl...,05973fa3a972a3b22a4aabca66ef3bdb69a3952cf26b3b...,4,05973fa3a972a3b22a4aabca66ef3bdb69a3952cf26b3b...
...,...,...,...,...
303,C:/Projects/mother-eeg-data\1646857503973-Earl...,04b9f2317aa321750751471650247909db4d0af625990b...,303,04b9f2317aa321750751471650247909db4d0af625990b...
304,C:/Projects/mother-eeg-data\1646857661935-Earl...,cade53c907f6897f23caa787385a40cd3a17cca276d337...,304,cade53c907f6897f23caa787385a40cd3a17cca276d337...
305,C:/Projects/mother-eeg-data\1646857661935-Earl...,df70c07e3e067c2dbefc6f120f41333f2d38e4c6fa2ee3...,305,df70c07e3e067c2dbefc6f120f41333f2d38e4c6fa2ee3...
306,C:/Projects/mother-eeg-data\1646857661935-Earl...,448a9cbc36da934efccce6ef6d184a2f3bf8e270aafa88...,306,448a9cbc36da934efccce6ef6d184a2f3bf8e270aafa88...


In [10]:
results['compare'] = np.where(results.hash_x == results.hash_y, 'True', 'False')
results

Unnamed: 0.1,file_name,hash_x,Unnamed: 0,hash_y,compare
0,C:/Projects/mother-eeg-data\1646832910497-Earl...,0d854854def2f7aac80df64027885d28c698ac753f664f...,0,0d854854def2f7aac80df64027885d28c698ac753f664f...,False
1,C:/Projects/mother-eeg-data\1646832910497-Earl...,b9e53cd9807eeece442018ecbd45854e0da19f13922988...,1,b9e53cd9807eeece442018ecbd45854e0da19f13922988...,False
2,C:/Projects/mother-eeg-data\1646832910497-Earl...,263789094e3078e970daafa44edfa8ded6e351a2c75f7a...,2,263789094e3078e970daafa44edfa8ded6e351a2c75f7a...,False
3,C:/Projects/mother-eeg-data\1646833329075-Earl...,e69f34a77510079aaa4763d93c3d50ab08b4430d839a9c...,3,e69f34a77510079aaa4763d93c3d50ab08b4430d839a9c...,False
4,C:/Projects/mother-eeg-data\1646833329075-Earl...,05973fa3a972a3b22a4aabca66ef3bdb69a3952cf26b3b...,4,05973fa3a972a3b22a4aabca66ef3bdb69a3952cf26b3b...,False
...,...,...,...,...,...
303,C:/Projects/mother-eeg-data\1646857503973-Earl...,04b9f2317aa321750751471650247909db4d0af625990b...,303,04b9f2317aa321750751471650247909db4d0af625990b...,False
304,C:/Projects/mother-eeg-data\1646857661935-Earl...,cade53c907f6897f23caa787385a40cd3a17cca276d337...,304,cade53c907f6897f23caa787385a40cd3a17cca276d337...,False
305,C:/Projects/mother-eeg-data\1646857661935-Earl...,df70c07e3e067c2dbefc6f120f41333f2d38e4c6fa2ee3...,305,df70c07e3e067c2dbefc6f120f41333f2d38e4c6fa2ee3...,False
306,C:/Projects/mother-eeg-data\1646857661935-Earl...,448a9cbc36da934efccce6ef6d184a2f3bf8e270aafa88...,306,448a9cbc36da934efccce6ef6d184a2f3bf8e270aafa88...,False


## Now you understand how we will run a check_sum algorithm on based on a hash function

### But notice we assumed that the files would have the same name. 

To see how to run this to check if files named differently check out the sorting_out_files notebook