In [1]:
import lemur.datasets as lds
import lemur.metrics as lms
import lemur.plotters as lpl
import lemur.embedders as leb
import boto3
import io
import glob
import pandas as pd

# Current method (EEG)

In [2]:
class BIDSParser:
    def __init__(self, base_path):
        dataset_name = os.path.basename(os.path.normpath(base_path))
        dataset = {}
        subjects = [os.path.basename(x) for x in glob.glob(base_path + "/*")]
        if "chanlocs.csv" in subjects:
            subjects.remove("chanlocs.csv")
        if "metadata.json" in subjects:
            subjects.remove("metadata.json")
#         print('base_path')
#         print (base_path)
#         print('subjects')
#         print (subjects)
        for s in subjects:
            dataset.update({s:{}})
        for s in subjects:
            modalities = [os.path.basename(x) for x in glob.glob(os.path.join(base_path, s) + "/*")]
#             print('modalities')
#             print(modalities)
            for m in modalities:
                dataset[s].update({m:{}})
                files = [os.path.basename(x) for x in glob.glob(os.path.join(base_path, s, m) + "/*")]
#                 print(files)
                for f in files:
                    t = "".join(f.split("_")[1:]).split(".")[0]
                    dataset[s][m].update({t:f})
        self.dataset = dataset
        self.base_path = base_path
#         print(self.dataset)

    def getModalityFrame(self, modality, extension):
        files = []
        subjects = []
        tasks = []
        for s in self.dataset.keys():
            for t in self.dataset[s][modality].keys():
                f = self.dataset[s][modality][t]
                if f.endswith(extension):
                    files.append(os.path.join(self.base_path, s, modality, f))
                    subjects.append(s)
                    tasks.append(t)
#         print (files)
#         print (subjects)
#         print (tasks)
        d = {
            "resource_path": files,
            "subjects": subjects,
            "tasks": tasks        
        }
        return pd.DataFrame(d)

In [9]:
import os
BASE = "/Users/YujiaLiu/Desktop/download_test/data"
DATASET = "eeg"
root = os.path.join(BASE, DATASET)
# root = '/eeg'
# print (root)
bp = BIDSParser(root)
dataset_descriptor = bp.getModalityFrame("preprocessed", ".pkl").iloc[:6]
out_base = os.path.join(BASE, "eeg_derivatives")
out_emb_base = os.path.join(BASE, "eeg_embedded_deriatives")
os.makedirs(out_base + "/agg", exist_ok=True)
os.makedirs(out_emb_base + "/agg", exist_ok=True)
print (dataset_descriptor)

                                       resource_path          subjects  \
0  /Users/YujiaLiu/Desktop/download_test/data/eeg...  sub-NDARAC904DMU   
1  /Users/YujiaLiu/Desktop/download_test/data/eeg...  sub-NDARAA117NEJ   

                  tasks  
0  task-RestingStateeeg  
1  task-RestingStateeeg  


In [16]:
chanlocs = pd.read_csv(root+"/chanlocs.csv")
spatial = lds.DataSet(chanlocs[["X", "Y", "Z"]], "Spatial")
spatialDM = lds.DistanceMatrix(spatial, lms.VectorDifferenceNorm)

In [17]:
eds = lds.EEGDataSet(dataset_descriptor)
# Create a lemur distance matrix based on the EEG data
DM = lds.DistanceMatrix(eds, lms.FroCorr)
DM.name = "eeg-DistanceMatrix"

In [18]:
# Create an embedded distance matrix object under MDS
MDSEmbedder = leb.MDSEmbedder(num_components=10)
EEG_Embedded = MDSEmbedder.embed(DM)
for i in range(eds.n):
    single_ds = eds.getResourceDS(i)
    lpl.SparkLinePlotter(single_ds, mode="savediv", base_path=out_base).plot(sample_freq=500)

for i in range(eds.n):
    single_ds = eds.getResourceDS(i)
    single_DM = lds.DataSet(single_ds.D.corr(), single_ds.name)
    lpl.SpatialConnectivity(single_DM, mode="savediv",
                            base_path=out_base).plot(spatial)
for i in range(eds.n):
    single_ds = eds.getResourceDS(i)
    single_DM = lds.DataSet(single_ds.D.corr(), single_ds.name)
    lpl.ConnectedScatterplot(single_DM,
                             mode="savediv",
                             base_path=out_base).plot(spatialDM)

lpl.SquareHeatmap(DM, mode="savediv", base_path=out_base).plot()
lpl.Heatmap(EEG_Embedded, mode="savediv", base_path=out_emb_base).plot()

lpl.EigenvectorHeatmap(DM, mode="savediv", base_path=out_base).plot()
lpl.EigenvectorHeatmap(EEG_Embedded, mode="savediv",
                       base_path=out_emb_base).plot()


# Read files directly from S3

In [22]:
import os
BASE = '/Users/YujiaLiu/Desktop/test'
DATASET = 'eeg'
root = os.path.join(BASE, DATASET)
out_base = os.path.join(BASE, "eeg_derivatives")
out_emb_base = os.path.join(BASE, "eeg_embedded_deriatives")
os.makedirs(out_base + "/agg", exist_ok=True)
os.makedirs(out_emb_base + "/agg", exist_ok=True)

In [23]:
f = []
s = ['sub-NDARAC904DMU', 'sub-NDARAA117NEJ']
t = ['task-RestingStateeeg', 'task-RestingStateeeg']

s3 = boto3.resource('s3')
bucket = s3.Bucket('redlemurtest')
# Directly read through S3 bucket and pass into pandas dataframe
for obj in bucket.objects.all():
    key = obj.key
    if key.endswith('.pkl'):
        body = obj.get()['Body'].read()
        pkl = pd.read_pickle(io.BytesIO(body))
        f.append(pkl)
    if key.endswith('chanlocs.csv'):
        body = obj.get()['Body'].read()
        chanlocs = pd.read_csv(io.BytesIO(body))
spatial = lds.DataSet(chanlocs[["X", "Y", "Z"]], "Spatial")
spatialDM = lds.DistanceMatrix(spatial, lms.VectorDifferenceNorm)

In [24]:
d = {
            "resource_path": f,
            "subjects": s,
            "tasks": t        
    }
descriptor = pd.DataFrame(d)
print (descriptor)

                                       resource_path          subjects  \
0  [[0.0, 2.4754, 52.105, 37.534, 52.535, 76.26, ...  sub-NDARAC904DMU   
1  [[0.0, 261.64, 375.68, 345.7, 97.827, 381.74, ...  sub-NDARAA117NEJ   

                  tasks  
0  task-RestingStateeeg  
1  task-RestingStateeeg  


# Since we read in the pickle directly, we need to modify several functions.

In [25]:
class DistanceMatrix:
    """A distance matrix computed from a DataSet object.

    Parameters
    ----------
    dataset : :obj:`DiskDataSet`
        A dataset on which to compute the distance matrix
    metric : function
        A distance used to compute the distance matrix.

    Attributes
    ----------
    dataset : :obj:`DiskDataSet`
        A dataset on which to compute the distance matrix
    metric : function
        A distance used to compute the distance matrix.
    N : int
        Number of data points in the dataset.
    matrix : :obj:`ndarray`
        The distance matrix.

    """

    def __init__(self, dataset, metric):
        self.DS = dataset
        self.name = self.DS.name
        self.labels = self.DS.D.index.values
        self.label_name = self.DS.D.index.name
        self.metric = metric
        self.metric_name = metric.__name__
        self.n = self.DS.n
        parameterization = parameterize(self.DS)
        self.D = np.zeros([self.n, self.n])
        for i in range(self.n):
            I = parameterization[i]
            for j in range(i + 1):
                J = parameterization[j]
                self.D[i, j] = self.metric.compare(I, J)
                self.D[j, i] = self.D[i, j]
        self.D = pd.DataFrame(self.D)
        self.D.index = self.DS.D.index
        self.D.index.name = self.DS.D.index.name

    def getMatrix(self):
        """Get the distance matrix.

        Returns
        -------
        :obj:`ndarray`
            The distance matrix.

        """
        return self.D

In [26]:
def parameterize(D):
        """Compute the correlation matrix of a single data point.

        Parameters
        ----------
        D : :obj:`DataSet`
            The lemur data set object to parameterize.

        Returns
        -------
        :obj:`list` of :obj:`ndarray`
            The correlation matrix of each object in the dataset.

        """
        with np.errstate(divide = 'ignore', invalid = 'ignore'):
            return list(map(lambda j: np.nan_to_num(np.corrcoef(D.getMatrix(j))), range(D.n)))


In [27]:
class DataSet:
    def __init__(self, D, name="default"):
        self.D = D
        self.n, self.d = self.D.shape
        self.name = name

    def getResource(self, index):
        return self.D.iloc[index, :]

    def saveMetaData(self, filepath):
        metadata = dict(d=self.d, n=self.n, name=self.name)
        string = json.dumps(metadata, indent=2)
        with open(filepath, 'w') as f:
            f.write(string)
        return string

    def getMatrix(self):
        return self.D.as_matrix()

In [28]:
class EEGDataSet:

    def __init__(self, dataframe_descriptor, name="fmri"):
        self.D = dataframe_descriptor
        self.D.index = self.D["subjects"].astype(str) + "-" + self.D["tasks"].astype(str)
        self.D.index.name = "index"
        self.name = name
        self.n = self.D.shape[0]

    def getResource(self, index):
        resource = self.D.ix[index]
        return resource

    def getMatrix(self, index):
        resource_path = self.D.ix[index][0]
        return resource_path.T
#         with open(resource_path, "rb") as f:
#             return pkl.load(f).T

    def getResourceDS(self, index):
        resource = self.getResource(index)
        matrix = self.getMatrix(index)
        D = pd.DataFrame(matrix.T)
        name = "%s/%s"%(resource[1], resource[2])
        DS = DataSet(D, name)
        return DS

In [29]:
# Use modified functions to read in new dataframe
import numpy as np
eds = EEGDataSet(descriptor)
# Create a lemur distance matrix based on the EEG data
DM = DistanceMatrix(eds, lms.FroCorr)
DM.name = "eeg-DistanceMatrix"



.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated



In [30]:
# Create an embedded distance matrix object under MDS
MDSEmbedder = leb.MDSEmbedder(num_components=10)
EEG_Embedded = MDSEmbedder.embed(DM)
for i in range(eds.n):
    single_ds = eds.getResourceDS(i)
    lpl.SparkLinePlotter(single_ds, mode="savediv", base_path=out_base).plot(sample_freq=500)
for i in range(eds.n):
    single_ds = eds.getResourceDS(i)
    single_DM = lds.DataSet(single_ds.D.corr(), single_ds.name)
    lpl.SpatialConnectivity(single_DM, mode="savediv",
                            base_path=out_base).plot(spatial)
for i in range(eds.n):
    single_ds = eds.getResourceDS(i)
    single_DM = lds.DataSet(single_ds.D.corr(), single_ds.name)
    lpl.ConnectedScatterplot(single_DM,
                             mode="savediv",
                             base_path=out_base).plot(spatialDM)

lpl.SquareHeatmap(DM, mode="savediv", base_path=out_base).plot()
lpl.Heatmap(EEG_Embedded, mode="savediv", base_path=out_emb_base).plot()

lpl.EigenvectorHeatmap(DM, mode="savediv", base_path=out_base).plot()
lpl.EigenvectorHeatmap(EEG_Embedded, mode="savediv",
                       base_path=out_emb_base).plot()

In [36]:
for dirname, dirnames, filenames in os.walk('/Users/YujiaLiu/Desktop/test'):
    for filename in filenames:
        if not filename.endswith('DS_Store'):
            print(os.path.join(dirname, filename))

/Users/YujiaLiu/Desktop/test/eeg_derivatives/agg/squareheat.html
/Users/YujiaLiu/Desktop/test/eeg_derivatives/agg/evheat.html
/Users/YujiaLiu/Desktop/test/eeg_derivatives/sub-NDARAC904DMU/task-RestingStateeeg/spatialconn.html
/Users/YujiaLiu/Desktop/test/eeg_derivatives/sub-NDARAC904DMU/task-RestingStateeeg/connectedscatter.html
/Users/YujiaLiu/Desktop/test/eeg_derivatives/sub-NDARAC904DMU/task-RestingStateeeg/sparkline.html
/Users/YujiaLiu/Desktop/test/eeg_derivatives/sub-NDARAA117NEJ/task-RestingStateeeg/spatialconn.html
/Users/YujiaLiu/Desktop/test/eeg_derivatives/sub-NDARAA117NEJ/task-RestingStateeeg/connectedscatter.html
/Users/YujiaLiu/Desktop/test/eeg_derivatives/sub-NDARAA117NEJ/task-RestingStateeeg/sparkline.html
/Users/YujiaLiu/Desktop/test/eeg_embedded_deriatives/agg/evheat.html
/Users/YujiaLiu/Desktop/test/eeg_embedded_deriatives/agg/heatmap.html


In [41]:
for dirname, dirnames, filenames in os.walk('/Users/YujiaLiu/Desktop/download_test/data/'):
    for filename in filenames:
        if filename.endswith('.html'):
            print(os.path.join(dirname, filename))

/Users/YujiaLiu/Desktop/download_test/data/eeg_derivatives/agg/squareheat.html
/Users/YujiaLiu/Desktop/download_test/data/eeg_derivatives/agg/evheat.html
/Users/YujiaLiu/Desktop/download_test/data/eeg_derivatives/sub-NDARAC904DMU/task-RestingStateeeg/spatialconn.html
/Users/YujiaLiu/Desktop/download_test/data/eeg_derivatives/sub-NDARAC904DMU/task-RestingStateeeg/connectedscatter.html
/Users/YujiaLiu/Desktop/download_test/data/eeg_derivatives/sub-NDARAC904DMU/task-RestingStateeeg/sparkline.html
/Users/YujiaLiu/Desktop/download_test/data/eeg_derivatives/sub-NDARAA117NEJ/task-RestingStateeeg/spatialconn.html
/Users/YujiaLiu/Desktop/download_test/data/eeg_derivatives/sub-NDARAA117NEJ/task-RestingStateeeg/connectedscatter.html
/Users/YujiaLiu/Desktop/download_test/data/eeg_derivatives/sub-NDARAA117NEJ/task-RestingStateeeg/sparkline.html
/Users/YujiaLiu/Desktop/download_test/data/eeg_embedded_deriatives/agg/evheat.html
/Users/YujiaLiu/Desktop/download_test/data/eeg_embedded_deriatives/agg/he