In [None]:
import numpy as np
import matplotlib.pyplot as plt

# (Getting Started) Working with Data

## Example: Peak finding in Raman Spectroscopy (Kitchin 14.1)

Raman spectroscopy is a technique typically used to determine the vibrational modes of a molecule or material, and it provides a structural fingerprint by which molecules may be identified. The data reported by the instrument is typically stored as Intensity vs. Wavenumber, and is discrete rather than continuous. In order to characterize a molecule, one must determine the location of the peaks in this data. Here, we will use data interpolation and fitting via spline smoothing to construct a continuous representation of the data, and then use that representation to locate the peaks.

### Step 1: Downloading the data

In [None]:
import os
import urllib.request

RAMAN_URL = "https://raw.githubusercontent.com/jkitchin/pycse/master/data/raman.txt"
RAMAN_FILE = "raman.txt"

# Fetch the data file with the standard library rather than shelling out to
# `wget`: this works on systems without wget and raises an exception on a
# failed download instead of silently returning a non-zero exit status.
# The existence check makes the cell safe to re-run (no raman.txt.1 copies).
if not os.path.exists(RAMAN_FILE):
    with urllib.request.urlopen(RAMAN_URL, timeout=30) as response:
        payload = response.read()
    # Write only after the whole body has been received, so an interrupted
    # transfer does not leave a truncated file that the guard would then keep.
    with open(RAMAN_FILE, "wb") as fh:
        fh.write(payload)

In [None]:
import io
import requests

# Alternative to saving a file: pull the text over HTTP and wrap it in an
# in-memory file-like object that np.loadtxt can read directly.
r = requests.get('https://raw.githubusercontent.com/jkitchin/pycse/master/data/raman.txt',
                 timeout=30)
# Stop here on a 4xx/5xx response instead of handing an error page to the parser.
r.raise_for_status()
raman = io.StringIO(r.text)

# Preview the start of the payload; printing the StringIO object itself only
# shows its repr. Slicing r.text leaves the read position of `raman` untouched.
print(r.text[:300])

### Step 2: Loading the data

In [None]:
# Parse the whitespace-delimited text file on disk into a NumPy array,
# then report its dimensions followed by its contents.
data = np.loadtxt('raman.txt')
print(data.shape, data, sep='\n')

In [None]:
# Rewind first: np.loadtxt reads the in-memory buffer to its end, so without
# this a re-run of the cell would start at end-of-stream and find no data.
raman.seek(0)
rdata = np.loadtxt(raman)
print(rdata.shape)
print(rdata)

In [None]:
# Column 0 is used as the wavenumber axis and column 1 as the intensity.
wn, intens = data[:, 0], data[:, 1]

# Overview of the complete spectrum.
plt.plot(wn, intens)
plt.xlabel("Wavenumber (cm$^{-1}$)")
plt.ylabel("Intensity (counts)")

### Step 3: Select region of interest

In [None]:
# Boolean mask keeping only the samples strictly inside 1200-1300 cm^-1.
ind = (1200 < wn) & (wn < 1300)
w1, i1 = wn[ind], intens[ind]

# Show the selected window as individual points (no connecting line).
plt.figure()
plt.plot(w1, i1, 'b. ')
plt.xlabel('Wavenumber (cm$^{-1}$)')
plt.ylabel('Intensity (counts)')

### Step 4: Fit spline function

In [None]:
from scipy.interpolate import UnivariateSpline

# Degree-4 smoothing spline through the windowed data.
# `s` is the smoothing factor passed to the fit.
sp = UnivariateSpline(x=w1, y=i1, k=4, s=2000)

# Raw points in blue, fitted spline evaluated at the same abscissae in red.
plt.plot(w1, i1, 'b. ', w1, sp(w1), 'r-')
plt.xlabel('Raman shift (cm$^{-1}$)')
plt.ylabel('Intensity (counts)')

In [None]:
# First derivative of the fitted spline, evaluated at all the data points.
# `sp` was built with k=4, so its derivative is a cubic spline.
d1s = sp.derivative()
d1 = d1s(w1)

# Zeros of the derivative are the minima and maxima of the fitted curve.
# (scipy's roots() supports cubic splines only, hence the quartic fit.)
minmax = d1s.roots()

# Figure 1: the derivative with its zero crossings marked on the same axes.
plt.figure()
plt.plot(w1, d1, label='first derivative')
plt.plot(minmax, d1s(minmax), 'ro ', label='zeros')
plt.xlabel('Raman shift (cm$^{-1}$)')
plt.ylabel('First derivative')
plt.grid()
plt.legend(loc='best')

# Figure 2: data, spline fit, and the extrema located on the fitted curve.
# A new figure is opened here (not before the zeros are drawn) so the
# derivative zeros at y ~ 0 do not end up on the intensity axes.
plt.figure()
plt.plot(w1, i1, 'b. ')
plt.plot(w1, sp(w1), 'r-')
plt.plot(minmax, sp(minmax), 'ro ')
plt.xlabel('Raman shift (cm$^{-1}$)')
plt.ylabel('Intensity (counts)')

In [None]:
# Narrow the window to 1241-1259 cm^-1 and refit with much less smoothing
# (s=2 instead of 2000).
ind = (wn > 1241) & (wn < 1259)
w1 = wn[ind]
i1 = intens[ind]
sp = UnivariateSpline(w1, i1, k=4, s=2)

# Derivative of the quartic fit is a cubic spline; its roots are the extrema.
d1s = sp.derivative()
d1 = d1s(w1)
minmax = d1s.roots()
print('Roots = {}'.format(minmax))

# Figure 1: the derivative with its zero crossings marked on the same axes.
plt.figure()
plt.plot(w1, d1, label='first derivative')
plt.plot(minmax, d1s(minmax), 'ro ', label='zeros')
plt.xlabel('Raman shift (cm$^{-1}$)')
plt.ylabel('First derivative')
plt.grid()
plt.legend(loc='best')

# Figure 2: data, spline fit, and the extrema located on the fitted curve.
# A new figure is opened here (not before the zeros are drawn) so the
# derivative zeros at y ~ 0 do not end up on the intensity axes.
plt.figure()
plt.plot(w1, i1, 'b. ')
plt.plot(w1, sp(w1), 'r-')
plt.plot(minmax, sp(minmax), 'ro ')
plt.xlabel('Raman shift (cm$^{-1}$)')
plt.ylabel('Intensity (counts)')

## Exercise: Peak Finding in Raman Spectroscopy

Determine the position of the peak between 745 cm$^{-1}$ and 760 cm$^{-1}$.

## Example: Curve fitting in Gas Chromatography (Kitchin 14.2)

We have a text file that contains data from a gas chromatograph with two peaks that overlap. We want the area under each peak to estimate the gas composition. You will see how to read the text file in, parse it to get the data for plotting and analysis, and then how to fit it. 

In [None]:
import os
import urllib.request

GC_URL = "https://raw.githubusercontent.com/jkitchin/pycse/master/data/gc-data-21.txt"
GC_FILE = "gc-data-21.txt"

# Fetch the chromatogram with the standard library rather than shelling out to
# `wget`: this works on systems without wget and raises an exception on a
# failed download instead of silently returning a non-zero exit status.
# The existence check makes the cell safe to re-run.
if not os.path.exists(GC_FILE):
    with urllib.request.urlopen(GC_URL, timeout=30) as response:
        payload = response.read()
    # Write only after the whole body has been received, so an interrupted
    # transfer does not leave a truncated file that the guard would then keep.
    with open(GC_FILE, "wb") as fh:
        fh.write(payload)

# Keep the file as a list of raw lines; the header is parsed in a later cell.
with open(GC_FILE, 'r') as f:
    data = f.readlines()

# Show the first 100 lines so the header layout is visible.
print("".join(data[:100]))

### Step 2: Load the data

In [None]:
# Scan the header for the declared number of points and for the column-header
# line; the numeric rows start on the line right after that header.
npoints = None
start = None
for i, line in enumerate(data):
    if '# of Points' in line:
        npoints = int(line.split()[-1])
    elif 'R.Time\tIntensity' in line:
        start = i + 1
        break

# Fail with a clear message instead of a NameError (npoints never set) or a
# silently wrong start index (header never found).
if npoints is None or start is None:
    raise ValueError("could not locate '# of Points' and the "
                     "'R.Time<TAB>Intensity' header in the GC file")

# now get the data
rows = [line.split() for line in data[start:start + npoints]]
if len(rows) != npoints:
    raise ValueError('expected {} data rows, found {}'.format(npoints, len(rows)))

t = np.array([float(fields[0]) for fields in rows])
# Intensities are written as integers in the file; store them as floats so the
# baseline can be subtracted in place later. The builtin `float` is used
# because the `np.float` alias was removed from NumPy.
intensity = np.array([int(fields[1]) for fields in rows], dtype=float)

# now plot the data in the relevant time frame
plt.plot(t, intensity)
plt.xlim([4, 6])
plt.xlabel('Time (s)')
plt.ylabel('Intensity (arb. units)')

### Step 3: Baseline correction

In [None]:
# Take the mean signal inside the 4.0 < t < 4.4 window as the baseline and
# subtract it from the whole trace (in place).
baseline_window = (t > 4.0) & (t < 4.4)
baseline = np.mean(intensity[baseline_window])
intensity -= baseline

# Re-plot the corrected trace over the same time frame as before.
plt.figure()
plt.plot(t, intensity)
plt.xlim([4, 6])
plt.xlabel('Time (s)')
plt.ylabel('Intensity (arb. units)')

### Step 4: Define fit function

In [None]:
from scipy.special import erf

def asym_peak(t, pars):
    """Asymmetric peak model from Anal. Chem. 1994, 66, 1294-1301.

    Parameters
    ----------
    t : scalar or array
        Time value(s) at which the peak is evaluated.
    pars : indexable
        The first four entries are used, in order: peak area, elution time,
        width of the gaussian, exponential damping term.

    Returns
    -------
    The peak value at each ``t``.
    """
    area, elution, width, damping = pars[0], pars[1], pars[2], pars[3]

    prefactor = area / 2 / damping
    exponent = width**2 / 2.0 / damping**2 + (elution - t) / damping
    erf_arg = ((t - elution) / (np.sqrt(2.0) * width)
               - width / np.sqrt(2.0) / damping)

    return prefactor * np.exp(exponent) * (erf(erf_arg) + 1.0)

In [None]:


def two_peaks(t, *pars):
    """Sum of two overlapping asymmetric peaks.

    ``pars`` holds eight values: entries 0-3 describe the first peak and
    entries 4-7 the second, each in the order (peak area, elution time,
    width of gaussian, exponential damping term).
    """
    first_peak = asym_peak(t, pars[0:4])
    second_peak = asym_peak(t, pars[4:8])
    return first_peak + second_peak

### Step 5: Construct initial guess and call `curve_fit`

In [None]:
from scipy.optimize import curve_fit

# Starting point for the optimizer, laid out as two_peaks reads it:
# (area, elution time, gaussian width, damping) for peak 1, then for peak 2.
parguess = (1500, 4.85, 0.05, 0.05,
            5000, 5.1, 0.05, 0.1)
popt, pcov = curve_fit(two_peaks, t, intensity, p0=parguess)
print(popt)

# Overlay the fitted model (green) on the measured trace.
fitted = two_peaks(t, *popt)
plt.plot(t, intensity)
plt.plot(t, fitted, 'g-')
plt.xlim([4, 6])
plt.legend(['data', 'final fit'])

In [None]:
# Split the fitted parameter vector back into its two per-peak halves and
# evaluate each peak on its own.
pars1, pars2 = popt[0:4], popt[4:8]
peak1, peak2 = asym_peak(t, pars1), asym_peak(t, pars2)

# Measured trace with the two component peaks drawn over it.
plt.figure()
plt.plot(t, intensity)
for component, fmt in ((peak1, 'r-'), (peak2, 'g-')):
    plt.plot(t, component, fmt)
plt.xlim([4, 6])
plt.xlabel('Time (s)')
plt.ylabel('Intensity (arb. units)')
plt.legend(['data', 'peak 1', 'peak 2'])

## Exercise: Area Under the Curve

Find the area under peak 1 and the area under peak 2. Take the ratio of each peak's area to the total area under the curve to determine the composition of the gas.