In [None]:
import os
import json
import csv
import re
from datetime import datetime
import ftfy
import pandas as pd
from tqdm import tqdm
tqdm.pandas()

from PyPDF2 import PdfReader


In [None]:
def extract_text_from_pdf(pdf_path):
    """Extract all text from a PDF file."""
    reader = PdfReader(pdf_path)
    text = ""
    for page in reader.pages:
        page_text = page.extract_text()
        if page_text:
            text += page_text + "\n"
    return text

def extract_drug_slang_section(text):
    """
    Extract the part of the text that contains the "Drug Slang Terms and Code Words" section.
    It assumes that the section starts with a header (e.g. "Drug Slang Terms and Code Words")
    and continues until a marker like "See Appendix A" appears.
    """
    pattern = r"Drug Slang Terms and Code Words.*?(?=Additional Slang Terms and Code Words)"
    match = re.search(pattern, text, re.DOTALL | re.IGNORECASE)
    if match:
        return match.group(0)
    else:
        print("Could not find the Drug Slang section.")
        return ""

def extract_appendix_section(text):
    """
    Extract the appendix section. This assumes that the text starting from the header
    "Appendix A.  Alphabetized List of Slang Terms and Code Words For Common Drugs" up to the end.
    """
    pattern = r"Appendix A\.\s*Alphabetized List of Slang Terms and Code Words For Common Drugs(.*)"
    match = re.search(pattern, text, re.DOTALL | re.IGNORECASE)
    if match:
        return match.group(1)
    else:
        print("Could not find the Appendix section.")
        return ""

def parse_csv1(section_text):
    """
    Parse the drug slang section to build records.
    Each record is expected to have:
         - A header line (the official drug name, which often includes parentheses)
         - Followed by one or more lines that list slang terms separated by semicolons.
    This version accounts for trailing terms where a slang term may span two lines.
    If a slang line does not end with a semicolon and the next non-blank line contains a semicolon,
    it will be merged into the current slang line. Otherwise, the line is considered complete.
    
    Returns a list of dicts with keys "index_term" and "alt_term".
    """
    lines = section_text.splitlines()
    records = []
    current_drug = None
    current_slangs = []
    i = 0

    while i < len(lines):
        line = lines[i].strip()
        if not line:
            i += 1
            continue  # skip blank lines

        # If the line does not contain a semicolon, assume it is a new header
        if ";" not in line:
            # Before starting a new record, save the previous one (if any)
            if current_drug is not None and current_slangs:
                record = {
                    "index_term": current_drug,
                    "alt_term": "; ".join(current_slangs)
                }
                records.append(record)
            # The new header becomes the current drug
            current_drug = line
            current_slangs = []
            i += 1
        else:
            # Process a line with slang terms.
            # We want to join the line with the following one if it doesn't end with ';'
            current_line = line
            # Lookahead to merge if the line seems to be broken
            while (not current_line.rstrip().endswith(";") and (i + 1) < len(lines)):
                # Peek at the next non-blank line
                j = i + 1
                next_line = ""
                while j < len(lines) and not next_line:
                    next_line = lines[j].strip()
                    if not next_line:
                        j += 1
                # If next_line exists and looks like a slang list (contains a semicolon)
                # then join it to the current_line.
                if next_line and ";" in next_line:
                    current_line += " " + next_line
                    i = j  # skip over the merged line(s)
                else:
                    break

            # Split the merged line by semicolons and add non-empty terms to the current slang list.
            parts = [part.strip() for part in current_line.split(";") if part.strip()]
            current_slangs.extend(parts)
            i += 1

    # Add the last record if available
    if current_drug is not None and current_slangs:
        record = {
            "index_term": current_drug,
            "alt_term": "; ".join(current_slangs)
        }
        records.append(record)
    return records

def parse_csv2(appendix_text):
    """
    Parse the Appendix section where each line is in the form:
         slang_or_code_word  Associated Drug(s)
    (e.g. "7 Cocaine" or "512s Acetaminophen and Oxycodone Combination (Percocet®); Oxycodone (Oxycontin®, Roxicodone®, Oxaydo®)")
    Splits the line at the first whitespace.
    Returns a list of dicts with keys "alt_term" and "index_term".
    """
    records = []
    # Split the appendix text into lines and ignore any header lines that do not conform.
    lines = appendix_text.splitlines()
    for line in lines:
        line = line.strip()
        if not line:
            continue
        # Use regex to split into two parts: the slang (first token) and the rest as associated drug(s)
        m = re.match(r"(\S+)\s+(.*)", line)
        if m:
            slang = m.group(1).strip()
            drug = m.group(2).strip()
            records.append({"alt_term": slang, "index_term": drug})
    return records

def write_csv(filename, records, fieldnames):
    """Write records (list of dictionaries) to a CSV file using the specified fieldnames."""
    with open(filename, "w", newline="", encoding="utf-8-sig") as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        for rec in records:
            writer.writerow(rec)

def normalize(text):
    """
    Normalize text for matching: lowercases, removes common trademark symbols and punctuation.
    This helps when comparing drug names and slang terms that might include extra characters.
    """
    import string
    text = text.lower()
    text = re.sub(r"[®\(\)]", "", text)  # remove trademark and parenthesis symbols
    text = "".join(ch for ch in text if ch.isalnum() or ch.isspace())
    return text.strip()

def consistency_check(csv1_records, csv2_records):
    """
    Perform two-way consistency checks:
      1. For every slang term in CSV #1 (for each drug) verify that CSV #2 contains that slang term
         with an associated drug matching (by a normalized substring match) the CSV #1 drug.
      2. For every CSV #2 entry, verify that its associated drug (or one of the drugs if there are multiple)
         corresponds to a drug from CSV #1.
    
    Returns a tuple (passed, issues) where `passed` is True if all checks are OK
    and issues is a list of strings describing any mismatches.
    """
    # Build a mapping for CSV2: slang -> set of normalized associated drug names.
    csv2_map = {}
    for rec in csv2_records:
        slang = rec["alt_term"].lower()
        drug_norm = normalize(rec["index_term"])
        csv2_map.setdefault(slang, set()).add(drug_norm)
    
    # Build a mapping from CSV1: normalized drug name -> list of normalized slang terms.
    csv1_map = {}
    for rec in csv1_records:
        drug_norm = normalize(rec["index_term"])
        slang_list = [normalize(s) for s in rec["alt_term"].split(";")]
        csv1_map[drug_norm] = slang_list

    consistency_passed = True
    issues = []
    # Check that each slang term in CSV1 is present in CSV2 with matching drug info.
    for drug_norm, slang_list in csv1_map.items():
        for slang in slang_list:
            if slang not in csv2_map:
                consistency_passed = False
                issues.append(f"Slang term '{slang}' for drug '{drug_norm}' not found in CSV2.")
            else:
                # Check if any of the drug names in CSV2 for this slang includes the CSV1 drug name.
                if not any(drug_norm in csv2_drug or csv2_drug in drug_norm for csv2_drug in csv2_map[slang]):
                    consistency_passed = False
                    issues.append(f"Slang term '{slang}' does not have matching index term for drug '{drug_norm}' in CSV2.")
    
    # Build a set of all normalized drug names from CSV1 for later checking.
    csv1_drugs = set(csv1_map.keys())
    # For each CSV2 record, verify that each associated drug is found in CSV1.
    for rec in csv2_records:
        slang = rec["alt_term"]
        drug_field_norm = normalize(rec["index_term"])
        # In some CSV2 entries, multiple drugs may be listed separated by semicolons.
        for d in drug_field_norm.split(";"):
            d = d.strip()
            if not any(d in csv1_drug or csv1_drug in d for csv1_drug in csv1_drugs):
                consistency_passed = False
                issues.append(f"CSV2 entry for slang '{slang}' with drug '{d}' does not match any CSV1 index term.")
    return consistency_passed, issues

    


In [3]:
pdf_path = r"C:\Users\James\OneDrive\Kansas State University\CIS 830\Project_SparKG\documentation\DIR-022-18 DEA Slang Terms and Code Words  A Reference for Law Enforcement Personnel.pdf"
text = extract_text_from_pdf(pdf_path)
print(text)

UNCLASSIFIED
UNCLASSIFIED1DEA-HOU-DIR-022-18Slang Terms and Code 
Words:  A Reference for Law 
Enforcement PersonnelDEA  
Intelligence  
BriefJuly 2018
DEA  
Intelligence  
Report

UNCLASSIFIED
UNCLASSIFIED2
DEA Intelligence ReportExecutive Summary
This Drug Enforcement Administration (DEA) Intelligence Report contains new and updated information on 
slang terms and code words from a variety of law enforcement and open sources, and serves as an updated 
version to the product entitled “Drug Slang Code Words” published by the DEA in May 2017.  It is designed 
as a ready reference for law enforcement personnel who are confronted with hundreds of slang terms and 
code words used to identify a wide variety of controlled substances, designer drugs, synthetic compounds, 
measurements, locations, weapons, and other miscellaneous terms relevant to the drug trade.  Although 
every effort was made to ensure the accuracy and completeness of the information presented, due to the 
dynamics of the e

In [None]:
drug_slang_text = extract_drug_slang_section(text)
print(drug_slang_text)

# with open('drug_slang_text.txt', 'w', encoding='utf-8') as file:
#     file.write(drug_slang_text)

"""
   I opened a text editor and deleted  some lines by hand
"""


Drug Slang Terms and Code Wordsa
Acetaminophen and Oxycodone Combination (Percocet®)
512s; Bananas; Blue; Blue Dynamite;  Blueberries; Buttons; Ercs; Greenies; Hillbilly Heroin; Kickers; M-30s; 
Paulas ; Percs; Rims; Tires; Wheels
Alprazolam (Xanax® )
Bars; Benzos; Bicycle Handle Bars; Bicycle Parts ; Bricks ; Footballs; Handlebars ; Hulk; L7; Ladders; Palitroque ; 
Planks; School Bus; Sticks; Upjohns ; White Boys ; White Girls ; Xanies; Yellow Boys ; Zanbars; Zannies; Z-Bars
Amphetamine
Acelerador ; Amy; Amps; Bam; B-Bombs; Beans; Bennies; Benz; Black and Whites; Black Beauties; Black 
Birds; Black Bombers; Black Mollies; Blacks; Blue Boys; Bombita; Brain Ticklers; Brownies; Bumblebees; 
Cartwheels; Chalk; Chicken Powder; Chochos; Chocolates ; Christina; Chunk; Co-Pilot; Coast-to-Coasts; 
Crisscross; Cross Roads; Cross Tops; Crosses ; Debs; Dexies; Diablos; Diamonds; Diet Pills; Dolls; Dominoes; 
Double Cross; Drivers; Dulces ; Fives; Flour ; Footballs; French Blues; Geeked Up ; Goofb

In [5]:
with open('drug_slang_text.txt', 'r', encoding='utf-8') as file:
    drug_slang_text = file.read()
print(drug_slang_text)


Acetaminophen and Oxycodone Combination (Percocet®)
512s; Bananas; Blue; Blue Dynamite;  Blueberries; Buttons; Ercs; Greenies; Hillbilly Heroin; Kickers; M-30s; 
Paulas ; Percs; Rims; Tires; Wheels
Alprazolam (Xanax®)
Bars; Benzos; Bicycle Handle Bars; Bicycle Parts ; Bricks ; Footballs; Handlebars ; Hulk; L7; Ladders; Palitroque ; 
Planks; School Bus; Sticks; Upjohns ; White Boys ; White Girls ; Xanies; Yellow Boys ; Zanbars; Zannies; Z-Bars
Amphetamine
Acelerador ; Amy; Amps; Bam; B-Bombs; Beans; Bennies; Benz; Black and Whites; Black Beauties; Black 
Birds; Black Bombers; Black Mollies; Blacks; Blue Boys; Bombita; Brain Ticklers; Brownies; Bumblebees; 
Cartwheels; Chalk; Chicken Powder; Chochos; Chocolates ; Christina; Chunk; Co-Pilot; Coast-to-Coasts; 
Crisscross; Cross Roads; Cross Tops; Crosses ; Debs; Dexies; Diablos; Diamonds; Diet Pills; Dolls; Dominoes; 
Double Cross; Drivers; Dulces ; Fives; Flour ; Footballs; French Blues; Geeked Up ; Goofballs; Greenies; Head 
Drugs; Hear

In [None]:
csv1_records = parse_csv1(drug_slang_text)
# write_csv("drug_slang.csv", csv1_records, ["index_term", "alt_term"])
print(csv1_records)

[{'index_term': 'Acetaminophen and Oxycodone Combination (Percocet®)', 'alt_term': '512s; Bananas; Blue; Blue Dynamite; Blueberries; Buttons; Ercs; Greenies; Hillbilly Heroin; Kickers; M-30s; Paulas; Percs; Rims; Tires; Wheels'}, {'index_term': 'Alprazolam (Xanax®)', 'alt_term': 'Bars; Benzos; Bicycle Handle Bars; Bicycle Parts; Bricks; Footballs; Handlebars; Hulk; L7; Ladders; Palitroque; Planks; School Bus; Sticks; Upjohns; White Boys; White Girls; Xanies; Yellow Boys; Zanbars; Zannies; Z-Bars'}, {'index_term': 'Amphetamine', 'alt_term': 'Acelerador; Amy; Amps; Bam; B-Bombs; Beans; Bennies; Benz; Black and Whites; Black Beauties; Black Birds; Black Bombers; Black Mollies; Blacks; Blue Boys; Bombita; Brain Ticklers; Brownies; Bumblebees; Cartwheels; Chalk; Chicken Powder; Chochos; Chocolates; Christina; Chunk; Co-Pilot; Coast-to-Coasts; Crisscross; Cross Roads; Cross Tops; Crosses; Debs; Dexies; Diablos; Diamonds; Diet Pills; Dolls; Dominoes; Double Cross; Drivers; Dulces; Fives; Flou

In [None]:
"""
    Appendix data didn't work
"""
appendix_text = extract_appendix_section(text)
print(appendix_text)



Slang/Code Word Associated Drug(s)
7 Cocaine
30s Oxycodone (Oxycontin®, Roxicodone®, Oxaydo®)
40s Oxycodone (Oxycontin®, Roxicodone®, Oxaydo®)
51s Crack Cocaine
62 Cocaine
77 Cocaine
151s Crack Cocaine
357s Hydrocodone (Norco®, Vicodin®, Lorcet®)
420 Marijuana; Synthetic Cannabinoids
501s Crack Cocaine
512sAcetaminophen and Oxycodone Combination (Percocet®); Oxycodone 
(Oxycontin®, Roxicodone®, Oxaydo®)
542 Flunitrazepam (Rohypnol®)
710 Marijuana Concentrates/Hash Oil
777 Cocaine
921 Cocaine
A-1 Cocaine
UNCLASSIFIED
UNCLASSIFIED14
DEA Intelligence ReportAbajo Heroin
Abby Amphetamine and Dextroamphetamine Combination (Adderall®)
A-Bomb Heroin mixed with Marijuana
Abyss Synthetic Cannabinoids
Abyssian Tea Khat
Acapulco Gold Marijuana
Acapulco Red Marijuana
Accordion Methamphetamine
Ace Marijuana; PCP (Phencyclidine)
Ace of Spades Synthetic Cannabinoids
Aceite LSD (Lysergic Acid Diethylamide)
Acelerador Amphetamine
Acelide LSD (Lysergic Acid Diethylamide)
Achivia Heroin
Acid LSD (Lysergi

In [None]:
csv2_records = parse_csv2(appendix_text)
write_csv("appendix_slang.csv", csv2_records, ["alt_term", "index_term"])
print(csv2_records)

In [8]:
 # Perform the consistency checks.
passed, issues = consistency_check(csv1_records, csv2_records)
if passed:
    print("Data Consistency Check Passed: All slang terms in CSV#1 are present in CSV#2 and vice versa.")
else:
    print("Data Consistency Check FAILED. Inconsistencies found:")
    for issue in issues:
        print(" -", issue)

Data Consistency Check FAILED. Inconsistencies found:
 - Slang term '512s' for drug 'acetaminophen and oxycodone combination percocet' not found in CSV2.
 - Slang term 'bananas' for drug 'acetaminophen and oxycodone combination percocet' not found in CSV2.
 - Slang term 'blue dynamite' for drug 'acetaminophen and oxycodone combination percocet' not found in CSV2.
 - Slang term 'buttons' for drug 'acetaminophen and oxycodone combination percocet' not found in CSV2.
 - Slang term 'ercs' for drug 'acetaminophen and oxycodone combination percocet' not found in CSV2.
 - Slang term 'hillbilly heroin' for drug 'acetaminophen and oxycodone combination percocet' not found in CSV2.
 - Slang term 'kickers' for drug 'acetaminophen and oxycodone combination percocet' not found in CSV2.
 - Slang term 'm30s' for drug 'acetaminophen and oxycodone combination percocet' not found in CSV2.
 - Slang term 'rims' for drug 'acetaminophen and oxycodone combination percocet' not found in CSV2.
 - Slang term 't