In [2]:
import pandas as pd
import numpy as np
pd.options.display.float_format = '{:,.2f}'.format
pd.set_option('display.max_columns', 30)
pd.set_option('display.max_rows', 100)

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import os
import sys
import datetime
import re

import warnings; warnings.filterwarnings('ignore')

sys.path.append('..')
from src.data.process_data import process_data
from config import CFG
CFG = CFG()

In [5]:
date = datetime.datetime(2023, 2, 10).strftime('%Y-%m-%d')
current_dir = os.path.abspath(os.path.dirname(os.getcwd()))
data_dir = os.path.join(current_dir, "data_files")
raw_data = os.path.join(data_dir, "raw")
processed_data = os.path.join(data_dir, "processed")

CFG.RAW_DATA_PATH = raw_data
CFG.PROCESSED_DATA_PATH = processed_data
CFG.DATE = date

In [6]:
df = pd.read_csv(os.path.join(CFG.PROCESSED_DATA_PATH, f"df-processed-{CFG.DATE}.csv"))

In [7]:
# df = df.replace({None: np.nan}, regex=False)

opioid = df[(df['category'] == 'Opioid') | (df['expected drug'].isin([
    'Fentanyl and Methamphetamine', 'Down and Methamphetamine', 'Heroin and Methamphetamine', 'Heroin and Cocaine'
])) ].index
df['expected opioid'] = df.index.isin(opioid).astype(int)
df

Unnamed: 0,visit date,city,site,expected drug,category,colour,texture,fentanyl strip,benzo strip,ftir component 0,ftir component 1,ftir component 2,ftir component 3,ftir component 4,ftir component 5,expected opioid
0,2023-01-31,Penticton,Fairhaven,Down (Unknown Opioid),Opioid,Purple,Chunk,1,-1,Fentanyl,Erythritol,Caffeine,,,,1
1,2023-01-31,Vancouver,Get Your Drugs Tested,Unknown,Unknown,Brown (light),Chunk,1,-1,Uncertain match,Fentanyl,,,,,0
2,2023-01-31,Vancouver,Get Your Drugs Tested,Alprazolam,Depressant,Green (light),Chunk,-1,1,Flualprazolam,Microcrystalline cellulose,,,,,0
3,2023-01-31,Nanaimo,CMHA,Down (Unknown Opioid),Opioid,Pink,Powder,1,1,Fentanyl,Mannitol,Caffeine,Bromazolam,,,1
4,2023-01-31,Cranbrook,ANKORS (Cranbrook),Methamphetamine,Stimulant,Colourless,Crystal,-1,0,Methamphetamine,,,,,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59995,2018-12-03,Vancouver,Insite,MDMA,Psychedelic,Black,Granules,0,0,Uncertain match,Phenacetin,,,,,0
59996,2018-12-03,Vancouver,Insite,Fentanyl,Opioid,Purple,Granules,1,0,Caffeine,Inositol,Heroin hcl,Fentanyl,,,1
59997,2018-12-03,Vancouver,Getaway,Unknown,Unknown,Colourless,Crystal,-1,0,Methamphetamine,,,,,,0
59998,2018-12-03,Vancouver,Getaway,Fentanyl,Opioid,Green,Pebble,1,0,Fentanyl,Caffeine,Mannitol,Mannitol,Caffeine,Fentanyl,1


# Matching FTIR and Fentanyl Strip
- Now lets investigate if the results from the FTIR and fentanyl strip match

In [8]:
def opioid_present(col):
    """
    This function uses a list of regex strings to find if any opioids are present in a given column (ftir components).
    """
    opioid_list = ['(despropionyl\s)?(para-fluoro)?(carf|f)entan(i|y)l( base)?', 
                   'heroin.*', 
                   '(hydro|oxy).*one', 
                   'opium', 
                   'w-1(8|9)', 
                   '6-mam', 
                   '.*tazene', 
                   '(code|morph)ine',
                   '(furanyl\s)?uf-17']
    if col is None:
        return 0
    for item in opioid_list:
        return 1 if re.search(item, col, re.IGNORECASE) else 0

In [12]:
ftirs = [x for x in df.columns.tolist() if x.startswith('ftir')]
for col in ftirs:
    df[col] = df[col].astype(str)

opioids = df.loc[:, [f'ftir component {i}' for i in range(6)]].applymap(lambda x: opioid_present(x))
opioids = opioids.apply(sum, axis=1).astype(bool).astype(int)
df['contains_opioids'] = opioids

incorrect_strip = df[(df['contains_opioids'] == 1) & (df['fentanyl strip'] == -1)]
incorrect_ftir = df[(df['contains_opioids'] == -1) & (df['fentanyl strip'] == 1)]
len(incorrect_strip), len(incorrect_ftir)

(31, 0)

- this is very surprising!
- the strips use something called an immunoassay, which uses antibodies to detect the fentanyl.
- antibodies are extremely specific and sensitive, more so than the FTIR spec
- lets look a little closer

In [13]:
incorrect_strip.loc[:, [f'ftir component {i}' for i in range(6)]]

Unnamed: 0,ftir component 0,ftir component 1,ftir component 2,ftir component 3,ftir component 4,ftir component 5
2086,Erythritol,Fentanyl,,,,
2463,Caffeine,Fentanyl,Erythritol,,,
6794,Caffeine,Fentanyl,,,,
7934,Caffeine,Erythritol,Fentanyl,,,
17815,Mannitol,Fentanyl,Caffeine,,,
22214,Fentanyl,Uncertain match,,,,
27122,Fentanyl,,,,,
33625,Caffeine,Fentanyl,Uncertain carbohydrate,,,
38128,Propionanilide,Uncertain match,Fentanyl,,,
47286,Caffeine,Mannitol,Fentanyl,,,


- They are all the generic fentanyl, not even the more potent carfent.
- Possibly human error, lets see how spread out the sites are

In [14]:
ratio = len(df) / len(incorrect_strip)

test = incorrect_strip['site'].value_counts()

baseline = df['site'].value_counts() / ratio
baseline = baseline[baseline.index.isin(test.index)]

test / baseline

ASK Wellness (433 Tranquille)    4.19
Abbotsford Hub                  18.43
Get Your Drugs Tested            0.50
Insite                           4.72
Molson OPS                       3.22
Outreach Urban Health            8.20
POUNDS                           5.94
Name: site, dtype: float64

- We will only use Insite and Get Your Drugs Tested since the sample size is a bit more significant. 
- So, there is almost 5 times more samples from Insite with incorrect FTIR than there should be, according to the baseline
- We should also take a look at the dates to see if they are spread out or clumped (maybe new employee or batch of strips)

In [16]:
def format_site(row) -> list[str]:
    if row['site'] == 'ASK Wellness (433 Tranquille)':
        return ['background-color: #C3ACD0; color: white'] * len(row)
    elif row['site'] == 'Get Your Drugs Tested':
        return ['background-color: #7286D3; color: white'] * len(row)
    elif row['site'] == 'Insite':
        return ['background-color: #645CBB; color: white'] * len(row)
    else:
        return ['background-color: #8EA7E9; color: white'] * len(row)

def format_df(styler):
    styler.set_caption('Days Between Incorrect Strip Results per Site')
    styler.format(formatter={
        'log_per_site_diff': '{:.2f}',
        'per_site_diff': '{:.0f}',
        'visit date': '{:%Y-%m-%d}'
        })
    styler.apply(format_site, axis=1)  #
    styler.background_gradient(cmap='Reds', axis=0, subset=['log_per_site_diff'])
    return styler

In [17]:
incorrect_strip['per_site_diff'] = incorrect_strip.groupby(['site'])['visit date'].diff(-1)
incorrect_strip['per_site_diff'] = incorrect_strip['per_site_diff'].dt.days

incorrect_strip_df = incorrect_strip[['visit date', 'site', 'per_site_diff']].dropna()
incorrect_strip_df['log_per_site_diff'] = (incorrect_strip_df['per_site_diff'] + 1).apply(np.log)
incorrect_strip_df.style.pipe(format_df)

Unnamed: 0,visit date,site,per_site_diff,log_per_site_diff
2463,2023-01-25,ASK Wellness (433 Tranquille),1197,7.09
6794,2022-10-26,Get Your Drugs Tested,135,4.91
17815,2022-06-13,Get Your Drugs Tested,66,4.2
22214,2022-04-08,Get Your Drugs Tested,82,4.42
27122,2022-01-16,Get Your Drugs Tested,138,4.93
33625,2021-08-31,Get Your Drugs Tested,89,4.5
38128,2021-06-03,Get Your Drugs Tested,222,5.41
47286,2020-10-24,Get Your Drugs Tested,91,4.52
50152,2020-07-25,Get Your Drugs Tested,28,3.37
50878,2020-06-27,Get Your Drugs Tested,118,4.78


- It does look like there was human error due to the clumping nature
- Further, there is almost a weekly pattern, which could be when that employee was working and the ceasing of it could be them quitting
- Get Your Drugs Tested results are so spaced out and erratic if it was human error, it is more of a one-off