##### General steps performed:
- Load bit strings from files
- Create DataFrame with MultiIndex
- Transform DataFrame to create string pairings for all files
- Create DataFrames with string comparison values

---

---

In [1]:
import os
import sys
from difflib import SequenceMatcher
import collections
import xml.etree.ElementTree as ET
#import itertools

#import json
#import requests

import numpy as np
import pandas as pd

from bitstring import Bits, BitArray, ConstBitStream, BitStream

import jellyfish as jf

#import matplotlib.pyplot as plt
#%matplotlib inline

#---------------------------------------------------------------------------------------
# Record the interpreter version next to the captured outputs.
print ('python version: ', sys.version_info, '\n')
#---------------------------------------------------------------------------------------

# The media files are kept in a local directory because of their size (didn't want to
# clog my Drive), so move from the notebook folder to ~/projects before listing them.
print (os.getcwd())   # e.g. C:\Users\Reid\Google Drive\projects\bitstrings\notebooks
os.chdir(os.path.join(os.path.expanduser('~'),'projects'))
print (os.getcwd())   # e.g. C:\Users\Reid\projects
print ()

#---------------------------------------------------------------------------------------

# set media_dir and create list of files in directory
media_dir = './bitstrings/media/'

# Keep regular files only and leave out Thumbs.db while building the list.
# Filtering here (instead of calling list.remove afterwards) means a folder
# that has no Thumbs.db no longer aborts the cell with a ValueError.
file_name_list = [f for f in os.listdir(media_dir)
                  if f != 'Thumbs.db' and os.path.isfile(os.path.join(media_dir, f))]
#---------------------------------------------------------------------------------------
# Containers filled by the loading cell:
file_bitarrays = {}    # maps a file extension to a list of [file_name, payload] entries
file_extensions = []   # one extension per loaded file

python version:  sys.version_info(major=3, minor=5, micro=2, releaselevel='final', serial=0) 

C:\Users\Reid\Google Drive\projects\bitstrings\notebooks
C:\Users\Reid\projects



---
##### Load bit strings from files

In [2]:
load_approach = 0   # 0: one BitArray per file; 1: one [file_name, character] entry per header byte
nbytes = 32*8 # NOTE: despite its name this is a BIT count -- 256 bits aka 32 bytes
header_len = (nbytes + 7) // 8   # whole bytes needed to cover `nbytes` bits

# Rebuild the containers here so that re-running this cell cannot append
# duplicate entries to whatever a previous run left behind.
file_bitarrays = {}    # {extension: [[file_name, payload], ...]}
file_extensions = []   # one normalised extension per file, used for the summary below

for file in file_name_list:
    # os.path.splitext copes with names that contain no '.', which the previous
    # rsplit-and-unpack did not (it raised ValueError). Such files are grouped
    # under the empty extension ''.
    file_name, file_ext = os.path.splitext(file)
    file_ext = file_ext[1:].lower()   # drop the leading '.', compare case-insensitively
    if file_ext == 'jpeg':
        file_ext = 'jpg'              # count .jpeg and .jpg as the same file type
    file_extensions.append(file_ext)

    # Only the header is compared, so read just that many bytes instead of
    # pulling each (potentially very large) media file fully into memory.
    with open(os.path.join(media_dir, file), 'rb') as f:
        header = f.read(header_len)

    # A single code path for "first file of this type" and "later files":
    # every extension always maps to a list of [file_name, payload] entries.
    entries = file_bitarrays.setdefault(file_ext, [])
    if load_approach == 1:
        # One [file_name, character] pair per header byte. Previously the first
        # file of each type overwrote its entry on every byte with a flat list.
        entries.extend([file_name, chr(byte)] for byte in header)
    else:
        # Clamp the bit length so a file shorter than the header does not raise.
        n_bits = min(nbytes, len(header) * 8)
        entries.append([file_name, BitArray(bytes=header, length=n_bits)])

print ('Loaded first ', nbytes,' ',\
       'for ',len(file_extensions),' files ',\
       'from ',media_dir,' directory, ',\
       'representing ',len(file_bitarrays),' file types\n',sep='')

print ('File types and counts:')
print (pd.Series(np.array(file_extensions)).value_counts())

Loaded first 256 for 30 files from ./bitstrings/media/ directory, representing 15 file types

File types and counts:
jpg    3
mov    2
wmv    2
mod    2
pdf    2
ino    2
svg    2
3mf    2
png    2
mp4    2
avi    2
pak    2
m4a    2
nef    2
3gp    1
dtype: int64


---
##### Create DataFrame with MultiIndex

In [3]:
# Flatten {extension: [[file_name, bits], ...]} into {(file_name, extension): text},
# where text is the string form of the stored bits.
test_dict = {
    (name, ext): str(bits)
    for ext, entries in file_bitarrays.items()
    for name, bits in entries
}

df = pd.DataFrame.from_dict(test_dict, orient='index')

# Every index label is a (file_name, extension) pair; swap the order so that the
# file type becomes the outer level of the MultiIndex.
fe = [label[1] for label in df.index]  # file extensions
fn = [label[0] for label in df.index]  # file names
array = [fe, fn]

tuples = list(zip(fe, fn))
index = pd.MultiIndex.from_tuples(tuples, names=['file_type', 'file_name'])

# Re-create the frame with the same single column (0) on the new index.
df = pd.DataFrame(df[0].tolist(), index=index)
print (df.shape)
print ('Preview of df:')
df.head(5)

(30, 1)
Preview of df:


Unnamed: 0_level_0,Unnamed: 1_level_0,0
file_type,file_name,Unnamed: 2_level_1
svg,logo_64x64,0x3c3f786d6c2076657273696f6e3d22312e302220656e...
jpg,SUNP0001,0xffd8ffe1187f45786966000049492a00080000000a00...
mod,MOV065,0x000001ba4400040004010189c3f8000001bb001280c4...
ino,Final_Sketch,0x2f2f2044485420636f64652066726f6d3a0a2f2f2045...
avi,SUNP0002,0x5249464660045500415649204c495354ec7f00006864...


---
##### Transform DataFrame to create string pairings for all files
_consider transforming in place if memory is an issue_

In [4]:
# Square frame labelled by file on both axes: every row repeats the full list of
# strings, so column j always holds file j's string.
bit_strings = df[0].tolist()
dfx = pd.DataFrame([bit_strings for _ in range(len(df))], index=df.index, columns=df.index)

# Wrap each cell in a one-element list so that adding two frames builds pairs
# instead of concatenating the strings themselves.
dfx = dfx.applymap(lambda cell: [cell])

# Cell (row i, column j) ends up as [string of file j, string of file i].
dfp = dfx + dfx.T
#dfp.mask(np.triu(np.ones(dfp.shape)).astype(bool))  # If desired, cut off upper triangle of df / array.

---
##### Create DataFrames with string comparison values
- jf.levenshtein_distance [link](https://jellyfish.readthedocs.io/en/latest/comparison.html#levenshtein-distance) (other methods available)
- SequenceMatcher [link](https://docs.python.org/3.5/library/difflib.html#difflib.SequenceMatcher.ratio)

_*some models might require that comparison values are normalized or otherwise adjusted prior to modeling_

In [5]:
# Each cell of dfp is a two-item list of strings; unpack it straight into the
# distance function to get one Levenshtein distance per file pairing.
lev_distance_df = dfp.applymap(lambda pair: jf.levenshtein_distance(*pair))
print ('Preview of :')
lev_distance_df

Preview of :


Unnamed: 0_level_0,file_type,svg,jpg,mod,ino,avi,mod,3mf,jpg,m4a,pdf,...,mp4,pdf,mp4,svg,png,nef,3gp,mov,pak,mov
Unnamed: 0_level_1,file_name,logo_64x64,SUNP0001,MOV065,Final_Sketch,SUNP0002,MOV06C,Left Curve Track,WP_20150307_22_46_02_Pro,mss_v110_theme4,Webinar_Kelloggs-NCS_Handout_12-9-14,...,download,DoubleVerify_2H2010TrustIndexFinal,Iguazu Falls d,icon-exoticpets,mem7,DSC_1180,apt_scouting,Video,1,IMG_1153
file_type,file_name,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2
svg,logo_64x64,0,56,58,49,51,58,59,53,51,52,...,56,52,54,2,61,61,55,52,60,53
jpg,SUNP0001,56,0,48,54,49,48,43,38,49,57,...,51,57,48,56,40,36,52,46,43,45
mod,MOV065,58,48,0,57,48,0,45,46,51,56,...,48,56,50,58,47,40,48,46,46,45
ino,Final_Sketch,49,54,57,0,52,57,53,52,49,48,...,51,48,50,48,57,58,53,51,58,51
avi,SUNP0002,51,49,48,52,0,48,45,49,49,51,...,53,51,55,51,50,48,53,50,51,47
mod,MOV06C,58,48,0,57,48,0,45,46,51,56,...,48,56,50,58,47,40,48,46,46,45
3mf,Left Curve Track,59,43,45,53,45,45,0,49,53,54,...,53,54,52,59,34,32,54,48,44,49
jpg,WP_20150307_22_46_02_Pro,53,38,46,52,49,46,49,0,46,58,...,50,58,51,53,47,41,49,43,46,44
m4a,mss_v110_theme4,51,49,51,49,49,51,53,46,0,55,...,31,55,25,51,52,48,32,37,44,34
pdf,Webinar_Kelloggs-NCS_Handout_12-9-14,52,57,56,48,51,56,54,58,55,0,...,54,0,55,51,48,53,58,55,53,55


In [6]:
# difflib similarity ratio for every pairing; the first string of the pair is
# sequence `a`, the second is sequence `b`, and no junk filter is supplied.
seq_match_df = dfp.applymap(
    lambda pair: SequenceMatcher(isjunk=None, a=pair[0], b=pair[1]).ratio()
)
print ('Preview of :')
seq_match_df

Preview of :


Unnamed: 0_level_0,file_type,svg,jpg,mod,ino,avi,mod,3mf,jpg,m4a,pdf,...,mp4,pdf,mp4,svg,png,nef,3gp,mov,pak,mov
Unnamed: 0_level_1,file_name,logo_64x64,SUNP0001,MOV065,Final_Sketch,SUNP0002,MOV06C,Left Curve Track,WP_20150307_22_46_02_Pro,mss_v110_theme4,Webinar_Kelloggs-NCS_Handout_12-9-14,...,download,DoubleVerify_2H2010TrustIndexFinal,Iguazu Falls d,icon-exoticpets,mem7,DSC_1180,apt_scouting,Video,1,IMG_1153
file_type,file_name,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2
svg,logo_64x64,1.0,0.242424,0.19697,0.318182,0.19697,0.19697,0.181818,0.30303,0.318182,0.348485,...,0.227273,0.348485,0.272727,0.969697,0.242424,0.151515,0.227273,0.212121,0.151515,0.19697
jpg,SUNP0001,0.181818,1.0,0.393939,0.272727,0.363636,0.393939,0.409091,0.393939,0.30303,0.257576,...,0.348485,0.257576,0.257576,0.181818,0.439394,0.454545,0.30303,0.5,0.348485,0.5
mod,MOV065,0.151515,0.393939,1.0,0.090909,0.242424,1.0,0.424242,0.318182,0.424242,0.242424,...,0.318182,0.242424,0.30303,0.151515,0.393939,0.469697,0.378788,0.363636,0.424242,0.363636
ino,Final_Sketch,0.333333,0.257576,0.19697,1.0,0.333333,0.19697,0.257576,0.318182,0.363636,0.318182,...,0.363636,0.318182,0.363636,0.363636,0.242424,0.181818,0.242424,0.272727,0.181818,0.287879
avi,SUNP0002,0.257576,0.363636,0.242424,0.318182,1.0,0.242424,0.242424,0.242424,0.121212,0.333333,...,0.121212,0.333333,0.121212,0.257576,0.227273,0.19697,0.121212,0.121212,0.121212,0.121212
mod,MOV06C,0.151515,0.393939,1.0,0.090909,0.242424,1.0,0.424242,0.318182,0.424242,0.242424,...,0.318182,0.242424,0.30303,0.151515,0.393939,0.469697,0.378788,0.363636,0.424242,0.363636
3mf,Left Curve Track,0.212121,0.393939,0.363636,0.257576,0.242424,0.363636,1.0,0.348485,0.242424,0.227273,...,0.378788,0.227273,0.409091,0.212121,0.530303,0.560606,0.378788,0.333333,0.393939,0.348485
jpg,WP_20150307_22_46_02_Pro,0.181818,0.393939,0.212121,0.333333,0.257576,0.212121,0.348485,1.0,0.30303,0.212121,...,0.363636,0.212121,0.348485,0.181818,0.409091,0.454545,0.348485,0.333333,0.30303,0.333333
m4a,mss_v110_theme4,0.30303,0.30303,0.424242,0.378788,0.121212,0.424242,0.242424,0.348485,1.0,0.151515,...,0.363636,0.151515,0.681818,0.318182,0.257576,0.227273,0.621212,0.530303,0.333333,0.530303
pdf,Webinar_Kelloggs-NCS_Handout_12-9-14,0.333333,0.287879,0.227273,0.363636,0.242424,0.227273,0.212121,0.30303,0.212121,1.0,...,0.257576,1.0,0.227273,0.30303,0.318182,0.181818,0.212121,0.166667,0.272727,0.166667


---
### Next steps:  refine similarity matrices, perform clustering...

- scipy.cluster [link](https://docs.scipy.org/doc/scipy/reference/cluster.html) (scipy.cluster.hierarchy in particular)
- scikit-learn [link](http://scikit-learn.org/stable/modules/clustering.html#clustering)
- NLTK [link](http://www.nltk.org/py-modindex.html) might not apply here, but it is worth documenting; [this post](https://nbviewer.jupyter.org/github/brandomr/document_cluster/blob/master/cluster_analysis_web.ipynb) was particularly illuminating


---
Other References:
- Carnegie Mellon Statistical Machine Learning course [page](http://www.stat.cmu.edu/~larry/=sml/) | [notes](http://www.stat.cmu.edu/~larry/=sml/clustering.pdf)
- [ELKI](https://github.com/elki-project/elki) kept coming up during research, but it is only available in Java