# Project Hyrcania: Automated Olive Oil Quality Control

**Objective:** To develop a machine learning system that classifies the quality and age of extra virgin olive oil using fluorescence and UV absorption spectroscopy.

This notebook documents the process of loading the data, preprocessing it using the `RamanSPy` library, and training a deep learning model with `PyTorch` to perform the classification. This project serves as a practical demonstration of building an end-to-end ML pipeline for spectroscopic analysis, mirroring the challenges faced in industrial quality control.

**Workflow:**
1.  **Setup:** Install and import necessary libraries.
2.  **Data Loading:** Load the olive oil spectroscopy dataset.
3.  **Preprocessing:** Adapt the data for `RamanSPy` and apply a cleaning pipeline.
4.  **Visualization:** Plot the spectra to observe the effects of preprocessing.
5.  **Model Training:** Build and train a 1D Convolutional Neural Network (CNN) with PyTorch.
6.  **Next Steps:** Outline the path to productionizing the model with FastAPI and Docker.

In [None]:
# Installing necessary packages using pip
!pip install pandas numpy matplotlib scikit-learn torch torcheval
!pip install ramanspy --pre

In [None]:
# Importing libraries
import os
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from torcheval.metrics import MulticlassAccuracy
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

# Importing RamanSPy
import ramanspy as rp

print("All libraries imported successfully.")

In [None]:
# Root directory of the dataset; it is expected to contain one sub-folder per aging step.
data_path = './data/EEMs_UV_data/'

# Keep only sub-directories. Class labels below are positions in this list, so a stray
# file in the data directory (README, .DS_Store, ...) must not occupy a slot in it.
# NOTE(review): sorted() is lexicographic, so un-padded names such as 'step_10' sort
# before 'step_2' -- confirm the folder names sort in true aging order.
aging_steps = sorted(
    entry for entry in os.listdir(data_path)
    if os.path.isdir(os.path.join(data_path, entry))
)

# Let's inspect the available aging steps
print(f"Available aging folders: {aging_steps}")

# We will load the UV absorption data for this classification task.
# The files of interest are those whose name starts with 'abs1.'.
all_data = []
labels = []
# Spelling of `wavelenghts` is kept as-is because later cells reference this name.
wavelenghts = None

for i, step_folder in enumerate(aging_steps):
    folder_path = os.path.join(data_path, step_folder)
    # os.listdir returns entries in arbitrary order; sorting makes the row order of X
    # (and therefore any seeded train/test split) reproducible across machines.
    for file_name in sorted(os.listdir(folder_path)):
        if not file_name.startswith('abs1.'):
            continue
        file_path = os.path.join(folder_path, file_name)
        # Load the UV spectrum data (tab-separated, no header row)
        spectrum_df = pd.read_csv(file_path, sep='\t', header=None)
        if wavelenghts is None:
            # The first column holds the wavelengths, which are our x-axis. It is taken
            # from the first spectrum actually loaded instead of a hard-coded file name.
            # Assumes every file shares the same wavelength axis -- TODO confirm.
            wavelenghts = spectrum_df.iloc[:, 0].values
        # The second column contains the absorption values
        all_data.append(spectrum_df.iloc[:, 1].values)
        labels.append(i)  # Use the folder index as the class label

if not all_data:
    raise FileNotFoundError(
        f"No 'abs1.*' spectrum files found in the sub-folders of {data_path!r}"
    )

# A 2-D array can only be built from spectra of identical length; fail with a clear
# message instead of a ragged-array error.
spectrum_lengths = sorted({len(spectrum) for spectrum in all_data})
if len(spectrum_lengths) != 1:
    raise ValueError(
        f"Spectra have inconsistent lengths {spectrum_lengths}; cannot stack them into one array"
    )

# Convert to numpy arrays
X = np.array(all_data)
y = np.array(labels)

print("Successfully loaded data.")
print(f"Shape of spectral data (X): {X.shape}")
print(f"Shape of labels (y): {y.shape}")
print(f"Number of classes (aging steps): {len(np.unique(y))}")

In [None]:
plt.style.use('seaborn-v0_8-whitegrid')
fig, ax = plt.subplots(figsize=(12, 7))

# Plot one sample from the first, middle, and last aging step.
# Samples are selected by class label rather than by row position in X, so the legend
# stays truthful when classes have different sample counts or rows are not grouped
# by class.
y_list = y.tolist()
classes_present = sorted(set(y_list))
classes_to_plot = [
    classes_present[0],
    classes_present[len(classes_present) // 2],
    classes_present[-1],
]
# Row index of the first sample belonging to each selected class.
# `indices_to_plot` / `labels_to_plot` are reused by the preprocessed-spectra plot.
indices_to_plot = [y_list.index(cls) for cls in classes_to_plot]
labels_to_plot = ['Fresh (Class 0)', 'Mid-Aged', 'Most Aged']

for i, label_str in zip(indices_to_plot, labels_to_plot):
    ax.plot(wavelenghts, X[i, :], label=label_str)

ax.set_title('Raw UV Absorption Spectra of Olive Oil at Different Aging Stages', fontsize=16)
ax.set_xlabel('Wavelength (nm)', fontsize=12)
ax.set_ylabel('Absorption', fontsize=12)
ax.legend()
ax.grid(True)
plt.show()

In [None]:
# 1. Adapt the numpy array into RamanSPy data containers.
# Each row of X becomes its own Spectrum; the spectral axis passed in is the
# wavelength column read from the data files.
raman_spectra = [rp.Spectrum(X[i, :], wavelenghts) for i in range(X.shape[0])]

# 2. Define the preprocessing pipeline (steps are applied in the order listed)
pipeline = rp.preprocessing.Pipeline([
    # ASPLS baseline correction
    rp.preprocessing.baseline.ASPLS(),
    # MinMax normalisation scales the data between 0 and 1, which is good for NN models
    rp.preprocessing.normalise.MinMax()
])

# 3. Apply the pipeline to our data (a list in, a list of processed spectra out)
preprocessed_spectra = pipeline.apply(raman_spectra)

# 4. Convert the preprocessed data back to a numpy array for PyTorch.
# RamanSPy containers expose their values as `spectral_data` (and the axis as
# `spectral_axis`); they have no `intensity` attribute.
X_preprocessed = np.array([spec.spectral_data for spec in preprocessed_spectra])

# Preprocessing must not drop samples or change the number of spectral points.
assert X_preprocessed.shape == X.shape, (
    f"Preprocessing changed the data shape: {X.shape} -> {X_preprocessed.shape}"
)

print("Shape of preprocessed data:", X_preprocessed.shape)


In [None]:
# Re-draw the representative spectra chosen for the raw plot, this time using the
# preprocessed values, so the two figures can be compared directly.
fig, ax = plt.subplots(figsize=(12, 7))

for sample_idx, stage_name in zip(indices_to_plot, labels_to_plot):
    legend_entry = f'{stage_name} (Preprocessed)'
    processed_spectrum = X_preprocessed[sample_idx, :]
    ax.plot(wavelenghts, processed_spectrum, label=legend_entry)

# Axis annotations
ax.set_xlabel('Wavelength (nm)', fontsize=12)
ax.set_ylabel('Normalized Absorption', fontsize=12)
ax.set_title('Preprocessed UV Absorption Spectra', fontsize=16)

ax.grid(True)
ax.legend()
plt.show()