In [None]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Load the autoreload extension and use mode 2 (reload modules before running
# each cell) so edits to local modules are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2

In [None]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

import smelly_rats

# Loading the data

In [None]:
# Open the Excel workbook once; later cells read individual sheets from `xl`.
workbook_path = '../data/onions.xls'
xl = pd.ExcelFile(workbook_path)

## HNMR spectra

In [None]:
# Read the first sheet with its first column as the index, then remove every
# apostrophe from the column names.
hnmr_spectra = pd.read_excel(xl, sheet_name=0, index_col=0).rename(
    columns=lambda name: name.replace("'", "")
)

print('HNMR spectra shape:', hnmr_spectra.shape)
hnmr_spectra.head()

**Spectra range**

In [None]:
# Show the smallest and largest value of the index (the ppm axis).
print('Spectra is ranging from (ppm)')
ppm_axis = hnmr_spectra.index
ppm_axis.min(), ppm_axis.max()

As stated in the paper, the spectral values between 5.00 and 4.50 ppm are removed. In this dataset the removed region actually spans approximately `5.027118` to `4.598711` ppm.

In [None]:
# Label-based row slice from 5.03 down to 5 ppm, to inspect the upper edge of
# the removed region (assumes the ppm index is numeric and sorted descending).
hnmr_spectra.loc[5.03:5]

In [None]:
# Label-based row slice from 5 down to 4.5 ppm, i.e. the region the paper
# removes (assumes the ppm index is numeric and sorted descending).
hnmr_spectra.loc[5:4.5]

**Columns**

In [None]:
# Report how many columns the spectra table has, then list their names.
n_columns = len(hnmr_spectra.columns)
print(f'There are {n_columns} columns')
hnmr_spectra.columns

In [None]:
# Plot one column of the spectra table against the index.
example_column = 'H2-12-'
hnmr_spectra.plot(y=example_column)

**The following columns have duplicate names, but the values in those columns are not duplicates.**

In [None]:
# Collect every column whose name begins with "H4-28" and display those
# columns side by side so their values can be compared.
double_columns = list(
    filter(lambda name: name.startswith("H4-28"), hnmr_spectra.columns)
)
print(double_columns)
hnmr_spectra[double_columns]

**Column groups**

In [None]:
from pprint import pprint

# Group numbers; a column belongs to group N when its name starts with "HN".
groups = (1, 2, 3, 4)

# Map each group number to the list of column names carrying its prefix.
group_cols = {
    group: hnmr_spectra.columns[
        hnmr_spectra.columns.str.startswith(f'H{group}')
    ].tolist()
    for group in groups
}
pprint(group_cols)

Reproducing Fig. 2 of the paper.

In [None]:
# One row per group: the left panel shows the mean spectrum over 9-0 ppm, the
# right panel zooms in on 9.5-6 ppm with a tighter y-range.
fig, axes = plt.subplots(len(groups), 2, figsize=(10, 8))

# (x-limits, y-limits) for the left and right panel of every row. The x-limits
# are given high-to-low so the ppm axis is drawn reversed.
panel_limits = (
    ([9, 0], [0, 50_000]),
    ([9.5, 6], [0, 7500]),
)

for idx, group in enumerate(groups):
    # Mean over the group's columns, computed once and reused for both panels.
    group_mean = hnmr_spectra[group_cols[group]].mean(axis=1)
    for ax, (x_limits, y_limits) in zip(axes[idx], panel_limits):
        group_mean.plot(ax=ax, label='')
        ax.set_title(f'Group {group}')
        ax.set_xlim(x_limits)
        ax.set_ylim(y_limits)

# No figure legend: every line is drawn with an empty label, so a legend would
# have no entries (matplotlib only warns about it). The titles name the group.
fig.tight_layout()

## Loading the target (i.e. onion% groups)

In [None]:
# Read the target table from the second sheet, keeping only the columns at
# zero-based positions 10 to 14; its `y` column is used as the label later on.
target = pd.read_excel(
    xl,
    usecols=range(10, 15),
    sheet_name=1,
)
print(target.shape)
target.head()

# Pareto scaling

In [None]:
from smelly_rats import preproccessing

In [None]:
# Apply the project's Pareto scaling to the spectra table, one column at a
# time (DataFrame.apply passes each column to the function by default).
hnmr_spectra_scaled = hnmr_spectra.apply(preproccessing.pareto_scaling)

In [None]:
# Same layout as the figure for the raw spectra, but for the Pareto-scaled
# data: one row per group, 9-0 ppm on the left and a 9.5-6 ppm zoom on the
# right, both with the same y-range.
fig, axes = plt.subplots(len(groups), 2, figsize=(10, 8))

# (x-limits, y-limits) for the left and right panel of every row. The x-limits
# are given high-to-low so the ppm axis is drawn reversed.
panel_limits = (
    ([9, 0], [0, 500]),
    ([9.5, 6], [0, 500]),
)

for idx, group in enumerate(groups):
    # Mean over the group's columns, computed once and reused for both panels.
    group_mean = hnmr_spectra_scaled[group_cols[group]].mean(axis=1)
    for ax, (x_limits, y_limits) in zip(axes[idx], panel_limits):
        group_mean.plot(ax=ax, label='')
        ax.set_title(f'Group {group}')
        ax.set_xlim(x_limits)
        ax.set_ylim(y_limits)

# No figure legend: every line is drawn with an empty label, so a legend would
# have no entries (matplotlib only warns about it). The titles name the group.
fig.tight_layout()

# Applying different dimensionality reduction techniques

In [None]:
import numpy as np
from sklearn.cross_decomposition import PLSCanonical, PLSRegression, CCA
from sklearn.decomposition import PCA

In [None]:
# Project the scaled spectra (transposed, so each original column becomes a
# row) onto two components with each reducer and draw one scatter plot per
# reducer, with one marker style per value of target['y'].
symbols = ['s', 'o', '*', '<']
categories = sorted(target['y'].unique())

for reducer in PCA, PLSCanonical, PLSRegression, CCA:
    red = reducer(n_components=2)
    transformed = red.fit_transform(hnmr_spectra_scaled.T, target['y'])
    # The cross-decomposition estimators hand back an (x_scores, y_scores)
    # tuple when y is passed, while PCA returns the score array directly.
    # Checking the type is safer than relying on a failed unpacking, which
    # would silently split a plain array that happens to have two rows.
    if isinstance(transformed, tuple):
        hnmr_spectra_reduced = transformed[0]
    else:
        hnmr_spectra_reduced = transformed

    fig, ax = plt.subplots(figsize=(10, 8))

    for idx, cat in enumerate(categories):
        # Plain boolean array so the numpy score matrix can be indexed with it.
        mask = (target['y'] == cat).values
        ax.plot(
            hnmr_spectra_reduced[mask, 0],
            hnmr_spectra_reduced[mask, 1],
            # Wrap around so more categories than markers cannot raise.
            symbols[idx % len(symbols)],
            label=cat,
            ms=10,
        )

    # Zero lines are drawn once per figure rather than once per category.
    ax.axhline(0, linestyle='--', color='gray', alpha=0.2)
    ax.axvline(0, linestyle='--', color='gray', alpha=0.2)

    # Symmetric limits derived from ALL points (not just the last category),
    # with a 5% margin so markers at the extremes are not cut off.
    x_max = 1.05 * np.abs(hnmr_spectra_reduced[:, 0]).max()
    y_max = 1.05 * np.abs(hnmr_spectra_reduced[:, 1]).max()
    ax.set_xlim([-x_max, x_max])
    ax.set_ylim([-y_max, y_max])

    ax.set_title(reducer.__name__)
    fig.legend(loc="center right", borderaxespad=0.1)
    fig.tight_layout()

# Classification on reduced data

Only the two components are used here. (Judging from the figures above, this will probably not be accurate.)

In [None]:
# Two-component PCA of the scaled spectra, transposed so that each original
# column becomes one row. The target is passed along as the second argument.
red = PCA(n_components=2)
hnmr_spectra_reduced = red.fit_transform(
    hnmr_spectra_scaled.T,
    target['y'],
)

In [None]:
from sklearn import linear_model
from sklearn.model_selection import cross_val_score

# Lasso regression with alpha=0.1, scored with 5-fold cross-validation on the
# two reduced components.
# NOTE(review): Lasso is a regressor, so these scores come from its default
# regression scorer rather than a classification accuracy - confirm intended.
reg = linear_model.Lasso(alpha=0.1)

scores = cross_val_score(
    estimator=reg,
    X=hnmr_spectra_reduced,
    y=target['y'],
    cv=5,
)
scores

In [None]:
from sklearn.linear_model import LogisticRegression

# L1-penalised multinomial logistic regression fitted with the SAGA solver.
# SAGA shuffles the data, so the seed is fixed to make re-runs reproducible.
# NOTE: max_iter is left at its default; raising it (e.g. max_iter=1000) may
# be needed to reach convergence for larger values of C.
reg = LogisticRegression(
    penalty='l1',
    solver='saga',
    multi_class='multinomial',
    C=1e-10,
    random_state=0,
)

# 5-fold cross-validated scores on the two reduced components.
scores = cross_val_score(
    reg,
    hnmr_spectra_reduced,
    target['y'],
    cv=5
)
scores

**The solver only converges for small values of `C`.**

In [None]:
from sklearn.model_selection import GridSearchCV

# Candidate values for C: five log-spaced values from 1e-10 to 1e-6.
param_grid = {
    'C': np.logspace(-10, -6, num=5)
}

# Grid search over C for the estimator held in `reg`, with 5-fold CV.
# The former `iid=False` argument is dropped: it was deprecated in
# scikit-learn 0.22 (where False became the default) and removed in 0.24, so
# passing it raises a TypeError on current releases.
mod = GridSearchCV(
    reg,
    param_grid,
    cv=5,
    return_train_score=True,
)

mod.fit(hnmr_spectra_reduced, target['y'])