# Visualize real data

Get to know the real data

In [1]:
import pandas as pd
import os
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

E:\Users\MichaelHopwood\miniconda3\lib\site-packages\numpy\.libs\libopenblas.EL2C6PLE4ZYW3ECEVIV3OXXGRN2NRFM2.gfortran-win_amd64.dll
E:\Users\MichaelHopwood\miniconda3\lib\site-packages\numpy\.libs\libopenblas.NOIJJG62EMASZI6NYURL6JBKM4EVBGM7.gfortran-win_amd64.dll


In [2]:
# Input / output locations (relative to the notebook's working directory).
folder = 'data'
savefolder = 'figures_real_data'

# header=2: the third spreadsheet row (0-indexed row 2) holds the column names.
data_path = os.path.join(folder, '20211128 - Full DART Data (Model & Test).xlsx')
df = pd.read_excel(data_path, header=2)
df.head()

Unnamed: 0,Sample,Class,Sample Types,SRN,59.00498,72.00792,74.00967,75.00427,77.00585,77.00646,...,534.22864,545.23129,548.2292,564.22822,578.23102,589.23387,592.23132,608.23108,622.23371,636.23384
0,1,c1,Model,SRN00001_DART_NHM_1,0.0,0.0,0.0,0.0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,c1,Model,SRN00001_DART_NHM_2,0.0,0.0,0.0,0.0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,c1,Model,SRN00001_DART_NHM_3,0.0,0.0,0.0,0.0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4,c2,Model,SRN00002_DART_NHM_1,0.0,0.0,0.0,0.0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5,c2,Model,SRN00002_DART_NHM_2,0.0,0.0,0.0,5.83,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [4]:
# Save one spectrum figure per labeled ('Model') sample, coloured by class.
labeled_df = df[df['Sample Types'] == 'Model']

# Numeric x-axis values parsed from the intensity-column headers (columns 4+).
# Headers that are already numeric are kept; string headers keep only their
# first two dot-separated parts (e.g. '59.00498.1' -> 59.00498).
recol = [(float('.'.join(col.split('.')[:2])) if isinstance(col, str) else col) for col in df.columns[4:]]

unique_classes = labeled_df['Class'].unique()

# One evenly spaced colour per class.
NUM_COLORS = len(unique_classes)
cm = plt.get_cmap('gist_rainbow')

# Intensities below this value are zeroed before plotting.
MIN_INTENSITY = 25

# savefig does not create missing directories, so make sure the target exists.
os.makedirs(savefolder, exist_ok=True)

intensity_cols = df.columns[4:]
for class_idx, label in enumerate(unique_classes):
    color = cm(1. * class_idx / NUM_COLORS)
    class_rows = labeled_df.loc[labeled_df['Class'] == label, intensity_cols]
    for ind, row in class_rows.iterrows():
        # Work on a private float copy so thresholding can never touch `df`.
        data = row.to_numpy(dtype=float, copy=True)
        data[data < MIN_INTENSITY] = 0

        # A fresh figure per sample: every saved image gets the same size
        # (previously only the first sample of each class used 12x8).
        fig, ax = plt.subplots(figsize=(12, 8))
        ax.plot(recol, data, color=color)
        ax.set_ylabel('%')
        ax.set_xlabel('Mass')
        fig.savefig(os.path.join(savefolder, f'sample_class_{label}_ind_{ind}.png'))
        plt.close(fig)

In [None]:
# Parse the numeric mass value out of each intensity-column header.
# Non-string headers are kept unchanged; string headers are cut down to their
# first two dot-separated parts before conversion ('59.00498.1' -> 59.00498).
# NOTE(review): the string headers presumably come from pandas de-duplicating
# repeated column names with a '.N' suffix -- confirm against the spreadsheet.
recol = [
    header if not isinstance(header, str) else float('.'.join(header.split('.')[:2]))
    for header in df.columns[4:]
]
plt.plot(recol)

In [None]:
# Display the parsed mass values so they can be inspected directly.
recol

**TODO:** It appears there are more than 3 measurement iterations. Why?

In [None]:
# Replace the intensity-column headers with positional names Col0, Col1, ...
# (one per entry of `recol`); the first four columns keep their names.
# Note: this overwrites `df.columns` in place.
Xcols = [f'Col{i}' for i, _ in enumerate(recol)]
df.columns = [*df.columns[:4], *Xcols]
df.head(10)

In [None]:
# Intensity columns (position 4 onward) of the first three rows.
df.iloc[0:3, 4:]

## Data formatting

In [None]:
def build_data_structure(df, labels, column_name_mass_locations):
    """
    Convert a wide intensity table into per-sample peak lists, grouped by peak count.

    A "peak" is any strictly positive intensity. Each sample (row) becomes an
    array of (mass, intensity) pairs, and samples with the same number of peaks
    are stacked together so each group is a regular array.

    Input:
    df: dataframe whose columns are mass locations and whose values are intensities
        Col0  Col1  Col2 ...
    0   0.0   0.0   0.0

    labels: array-like with one label per row of `df`, in row order
        (list, numpy array or pandas Series).

    column_name_mass_locations: mass location of each column of `df`, by position
    [59.00498,
     72.00792,
     74.00967,
     ...
    ]
    The masses are passed separately from the column names so that duplicate
    mass locations can be present (multiple scans can be represented).
    Entries beyond the number of columns in `df` are ignored.

    Output:
    X, Y

    X:
    [ [samples with smallest number of peaks],
      [samples with 2nd smallest number peaks],
      ...,
      [samples with largest number of peaks]
    ]
    where each group is a float array of shape (n_samples, n_peaks, 2) and each
    sample is array([[peak_location1, peak_intensity1], [peak_location2, peak_intensity2], ...]).
    Samples without any peak form a group of shape (n_samples, 0, 2).

    Y: list parallel to X; Y[k] is an array with the labels of the samples in X[k],
    in their original row order.

    Raises:
    ValueError: if `labels` does not have one entry per row, or if fewer mass
        locations than columns are given.
    """
    intensities = df.to_numpy(dtype=float)
    n_rows, n_cols = intensities.shape

    # np.asarray lets callers pass lists or Series, not just ndarrays.
    labels = np.asarray(labels)
    if labels.shape[0] != n_rows:
        raise ValueError(
            f"labels has {labels.shape[0]} entries but df has {n_rows} rows"
        )

    masses = np.asarray(column_name_mass_locations, dtype=float)
    if masses.shape[0] < n_cols:
        raise ValueError(
            f"{masses.shape[0]} mass locations given for {n_cols} columns"
        )
    masses = masses[:n_cols]

    # Nonzero values (aka "peaks"); NaN compares False and is not a peak.
    is_peak = intensities > 0
    peak_counts = is_peak.sum(axis=1)

    X = []
    Y = []
    # Group so we have groups of the same number of peaks (ascending).
    for count in np.unique(peak_counts):
        rows = np.flatnonzero(peak_counts == count)
        group_mask = is_peak[rows]
        # Boolean indexing walks row by row, and every row in this group has
        # exactly `count` peaks, so the flat result reshapes to (n, count).
        group_masses = np.broadcast_to(masses, group_mask.shape)[group_mask].reshape(rows.size, count)
        group_values = intensities[rows][group_mask].reshape(rows.size, count)
        X.append(np.stack([group_masses, group_values], axis=-1))
        Y.append(labels[rows])
    return X, Y

# Keep only the labeled ('Model') rows.
labeled_df = df[df['Sample Types'] == 'Model']
from sklearn import preprocessing

# Fit the label binarizer on the distinct class labels.
lb = preprocessing.LabelBinarizer()
lb.fit(labeled_df['Class'].unique())

# Group the labeled samples by peak count, then binarize each group's labels.
X, Y = build_data_structure(labeled_df.iloc[:, 4:], labeled_df['Class'].values, recol)
y = [lb.transform(group_labels) for group_labels in Y]
len(X), X[0].shape, len(y), y[0].shape

## Investigate test-train partition

In [None]:
# Number of rows per value of the 'Sample Types' column.
df['Sample Types'].value_counts()

In [None]:
# Number of rows per class label (value_counts leaves out missing labels by default).
df['Class'].value_counts()

In [None]:
# Class distribution among the 'Model' rows.
df.loc[df['Sample Types'] == 'Model', 'Class'].value_counts()

In [None]:
# Class distribution among the 'Test' rows (missing labels are not counted).
df.loc[df['Sample Types'] == 'Test', 'Class'].value_counts()

## Since the Test split has no labels, we must split the labeled (Model) data for a supervised evaluation

In [None]:
# Only the 'Model' rows carry class labels, so the supervised split uses them.
labeled_df = df[df['Sample Types'] == 'Model']

# Peak lists for every sample (labeled or not), grouped by peak count.
# build_data_structure needs the labels and mass locations and returns (X, Y);
# only the X part is kept here.
X = build_data_structure(df[df.columns[4:]], df['Class'].values, recol)[0]

from sklearn import model_selection
from sklearn import preprocessing
lb = preprocessing.LabelBinarizer()
lb.fit(labeled_df['Class'])

# Fixed seed so the stratified split is reproducible across kernel restarts.
SPLIT_SEED = 0
train_labeled_df, test_labeled_df = model_selection.train_test_split(
    labeled_df,
    train_size=None,
    shuffle=True,
    stratify=labeled_df['Class'],
    random_state=SPLIT_SEED,
)

# build_data_structure regroups samples by peak count, so the labels must be
# taken from its second return value to stay aligned with the grouped samples.
X_train, Y_train = build_data_structure(
    train_labeled_df[train_labeled_df.columns[4:]], train_labeled_df['Class'].values, recol
)
X_test, Y_test = build_data_structure(
    test_labeled_df[test_labeled_df.columns[4:]], test_labeled_df['Class'].values, recol
)

# One binarized label array per peak-count group, parallel to X_train / X_test.
y_train = [lb.transform(group_labels) for group_labels in Y_train]
y_test = [lb.transform(group_labels) for group_labels in Y_test]

In [None]:
# Number of entries in the train and test structures.
group_counts = (len(X_train), len(X_test))
print(*group_counts)

# Shapes of selected entries.
# NOTE(review): index 12 assumes X_train and X_test each have at least 13 entries -- confirm.
example_shapes = (X_train[12][0].shape, y_train[0].shape, X_test[12][0].shape, y_test[0].shape)
print(*example_shapes)

In [None]:
from core import RealDataGenerator

# Positional arguments: the train pair followed by the test pair twice.
# NOTE(review): the test split fills both the second and third slot --
# presumably validation and test; confirm against core.RealDataGenerator.
generator_args = [X_train, y_train, X_test, y_test, X_test, y_test]
gen = RealDataGenerator(*generator_args)

