In [1]:
import scipy.io as sio
import pandas as pd
import numpy as np

# Explore the sophus dataset

## Let's load the data for each dataset from MAT-files

In [2]:
# Read the raw MAT-file of each of the three datasets; every result is the
# mapping returned by loadmat, which later cells index by variable name.
X1_data, X2_data, X3_data = map(
    sio.loadmat,
    (
        "../data/data-sophus/X1_data.mat",
        "../data/data-sophus/X2_data.mat",
        "../data/data-sophus/X3_data.mat",
    ),
)

Then we create a DataFrame for each dataset

In [3]:
# Wrap the array stored under each file's own variable name in a DataFrame.
X1, X2, X3 = (
    pd.DataFrame(mat_contents[variable_name])
    for mat_contents, variable_name in (
        (X1_data, "X1_data"),
        (X2_data, "X2_data"),
        (X3_data, "X3_data"),
    )
)

## Let's load the labels

In [4]:
# Read the label MAT-file that accompanies each dataset.
X1_labels, X2_labels, X3_labels = map(
    sio.loadmat,
    (
        "../data/data-sophus/X1_label.mat",
        "../data/data-sophus/X2_label.mat",
        "../data/data-sophus/X3_label.mat",
    ),
)

### Fixing the formatting of the sample IDs so that all the datasets have the same format

For now we are only looking at the "Sample ID" labels. We want to compare these to make sure that the samples match up between datasets.
The "Sample ID" labels are formatted differently for X1 than for X2 and X3, so we have to fix this before we can compare. The X1 IDs are converted to zero-padded seven-digit strings with dashes inserted in the pattern `ddd-d-dd-d`:

In [25]:
# Rewrite every X1 sample ID as text: zero-pad the integer value to seven
# digits, then split the digits into groups of 3-1-2-1 joined by dashes.
X1_sample_ID = [
    f"{digits[:3]}-{digits[3]}-{digits[4:6]}-{digits[6]}"
    for digits in (f"{int(raw_id):07d}" for raw_id in X1_labels["X1_label"][0,0])
]

### Comparing the sample IDs to make sure the samples match up between datasets

In [26]:
# Sample IDs of the three datasets, in dataset order (X1, X2, X3).
sample_ids = (X1_sample_ID, X2_labels["X2_label"][0,1], X3_labels["X3_label"][0,1])
sample_counts = [len(ids) for ids in sample_ids]

# zip() stops at the shortest input, so on its own it would silently skip any
# surplus samples and could report a match for datasets of different sizes.
# A difference in sample count is therefore counted as mismatching samples.
error_count = max(sample_counts) - min(sample_counts)
if error_count > 0:
    print(f"The datasets have different numbers of samples: {sample_counts}")

# Compare the IDs position by position; a sample is counted once if its ID
# differs anywhere across the three datasets.
for id_1, id_2, id_3 in zip(*sample_ids):
    if (id_1 != id_2) or (id_2 != id_3):
        print(f"Sample IDs {id_1}, {id_2} and {id_3} does not match")
        error_count += 1

if error_count > 0:
    print(f"{error_count} samples did not have maching IDs")
else:
    print("All samples have matching IDs")

All samples have matching IDs


We can also make sure that all the sample IDs are unique:

In [7]:
# All IDs are unique exactly when removing duplicates leaves the count unchanged.
len(set(X1_sample_ID)) == len(X1_sample_ID)

True

### Using the sample IDs as row labels for the dataframe
Because the sample IDs are a unique identifier for each sample, we can use them as labels for the sample axis in the dataframe

In [8]:
# Use the sample IDs as row labels of each DataFrame.
# `set_axis(..., inplace=True)` cannot be used here any more: the `inplace`
# argument was deprecated in pandas 1.5 and removed in pandas 2.0, where the
# call raises a TypeError. Assigning to `.index` relabels the rows in place,
# as the original call did, and works on every pandas version.
X1.index = X1_sample_ID
X2.index = X2_labels["X2_label"][0,1]
X3.index = X3_labels["X3_label"][0,1]

## Let's load the classes

In [9]:
# Read the class MAT-file that accompanies each dataset.
X1_classes, X2_classes, X3_classes = map(
    sio.loadmat,
    (
        "../data/data-sophus/X1_class.mat",
        "../data/data-sophus/X2_class.mat",
        "../data/data-sophus/X3_class.mat",
    ),
)

In [20]:
# Show the shape of each dataset's class array, one per line.
print(
    X1_classes["X1_class"].shape,
    X2_classes["X2_class"].shape,
    X3_classes["X3_class"].shape,
    sep="\n",
)

(2, 9)
(2, 7)
(2, 7)


For now, we will only look at the "Diet" class, which is class 7 in all datasets (index 6, because Python indexing is zero-based)

In [21]:
# Pick entry [0, 6] of each class array: the seventh class, which the
# notebook text identifies as "Diet".
X1_diet, X2_diet, X3_diet = (
    X1_classes["X1_class"][0,6],
    X2_classes["X2_class"][0,6],
    X3_classes["X3_class"][0,6],
)

### Comparing the sample diet classes to make sure the samples match up between datasets

In [62]:
# A sample is inconsistent when its X1 diet differs from its X2 diet or from
# its X3 diet. The two comparisons are combined with a logical OR before
# summing, so a sample that disagrees with both other datasets is counted
# once; adding the two sums separately would count it twice and overstate
# the number of samples reported below.
diet_mismatch = (X1_diet != X2_diet) | (X1_diet != X3_diet)
error_count = np.sum(diet_mismatch)
if error_count > 0:
    print(f"{error_count} samples did not have maching diets")
else:
    print("All samples have matching diets")

All samples have matching diets
