<a href="https://colab.research.google.com/github/MutonyiLewis/Mes-Project/blob/main/Extract_Phytochemical_Data_from_PDF.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [12]:
!pip install PyMuPDF unidecode fuzzywuzzy pdfplumber

Collecting pdfplumber
  Downloading pdfplumber-0.11.7-py3-none-any.whl.metadata (42 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pdfminer.six==20250506 (from pdfplumber)
  Downloading pdfminer_six-20250506-py3-none-any.whl.metadata (4.2 kB)
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Downloading pypdfium2-4.30.1-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (48 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.2/48.2 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
Downloading pdfplumber-0.11.7-py3-none-any.whl (60 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.0/60.0 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pdfminer_six-20250506-py3-none-any.whl (5.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.6/5.6 MB[0m [31m61.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pypdfium2-4.30.1-p

#### Extract raw data from PDF

In [14]:
import pdfplumber
import pandas as pd
from unidecode import unidecode


pdf_path = '/content/FF Components and their Benefits.pdf'

records = []

with pdfplumber.open(pdf_path) as pdf:
    for page in pdf.pages:
        tables = page.extract_tables()
        for table in tables:
            for row in table:
                if not any(row): continue
                if "bioactive" in str(row[0]).lower(): continue
                if len(row) < 2: continue  # skip malformed rows

                molecule = unidecode(str(row[0])).strip()
                source = unidecode(str(row[1])).strip() if len(row) > 1 else ""
                benefits = unidecode(str(row[2])).strip() if len(row) > 2 else ""
                intake = unidecode(str(row[3])).strip() if len(row) > 3 else ""

                records.append({
                    "name": molecule,
                    "category": None,
                    "food_sources": source,
                    "benefits": benefits,
                    "intake_example": intake
                })

## Backfill categories

In [None]:
category_map = {
    'Lycopene': 'Carotenoids',
    'Flavonoids': 'Carotenoids',
    'Luteolin': 'Bioactive Carbonyls',
    'Apigenin': 'Bioactive Carbonyls',
    'Tangeritin': 'Bioactive Carbonyls',
    'Kaempferol': 'Bioactive Carbonyls',
    'Galangin': 'Bioactive Carbonyls',
    'Hesperitin': 'Bioactive Carbonyls',
    'Silymarin': 'Bioactive Carbonyls',
    'Epigallocatechin-3-gallate Epicatechin': 'Bioactive Carbonyls',
    'Curcumin': 'Bioactive Carbonyls',
    'Resveratrol/Piceatannol': 'Bioactive Carbonyls',
    'Polyphenols Ellagin acid Ellagitannins': 'Bioactive Carbonyls',
    'Soy isoflavones, Genistein, Daidzein,glyceollins': 'Bioactive Carbonyls',
    'Omega-3 fatty acids': 'Bioactive Carbonyls',
    'Omega-3 fatty acids,Lignans': 'Bioactive Carbonyls',
    'Alicin,DAS,DADS,Ajoene': 'Bioactive Carbonyls',
    'Gingerols,Shoagols,Zerumbone': 'Bioactive Carbonyls',
    'Insoluble fiber': 'Dietary (functional and total) Fiber',
    'Soluble fiber': 'Dietary (functional and total) Fiber',
    'Whole grains': 'Dietary (functional and total) Fiber',
    'Selenium': 'Minerals'
}

# Apply map
for r in records:
  r['category'] = category_map.get(r['name'], 'UNKNOWN')

## Build DataFrame and Add ID

In [15]:
df = pd.DataFrame(records)
df["phyto_id"] = range(1, len(df) + 1)

# Optional: Clean up entries
df["food_sources"] = df["food_sources"].str.replace(r"\n", ", ", regex=True)
df["benefits"] = df["benefits"].str.replace(r"\n", " ", regex=True)


In [17]:
df.head(20)

Unnamed: 0,name,category,food_sources,benefits,intake_example,phyto_id
0,CancerPrevention,,,,,1
1,Carotenoids,,,,,2
2,Lycopene,,"tomatoes,, processedtomato, products,, waterme...",Prostatecancer -reducesoxidativestress through...,-consumptionof160gm/d\ntomatosaucedecreasesLDL...,3
3,"Flavonoids(quercetin,\nkaempferol,rutin),\nphe...",,"Stinging nettle, (Urticadioica)",Flavonoidshave antioxidantandanti- inflammator...,,4
4,Luteolin,,"Broccoli, green, pepper, parsley,, oregano, ca...",Anticarcinogenicactivity,,5
5,Apigenin,,"Many fruits and, vegetables, parsley,, celery,...",Anticarcinogenicactivity,,6
6,Tangeritin,,Citruspeels,Anticarcinogenicactivity,,7
7,Kaempferol,,"Apples, potatoes,, onions, broccoli,, brussels...",Anticarcinogenicactivity,,8
8,Galangin,,Proposis,Anticarcinogenicactivity,,9
9,Hesperitin,,Citrusfruits,Anticarcinogenicactivity,,10


##  Save the Final Table


In [18]:
df.to_csv("phytochemicals.csv", index=False)

## Split into Phytochemicl blocks

In [8]:
import re

full_text = re.sub(r'\n{2,}', '\n', full_text)

# Split where a new section likely starts with a title (capitalized)
blocks = re.split(r"\n(?=[A-Z][a-z]+(?:\s[A-Z][a-z]+)*\s*)", full_text)
blocks = [b.strip() for b in blocks if b.strip()]  # Remove empty blocks

In [11]:
print(blocks)

['1 | P a g e', 'Functional Foods Components and their Benefits', 'Cancer Prevention', 'Bioactive molecules', 'Source(s)', 'Potential Benefits(s)', 'Carotenoids', 'Lycopene\ntomatoes,\nprocessed tomato\nproducts,\nwatermelon,\nred/pink\ngrapefruit', 'Prostate cancer\n- reduces oxidative stress\nthrough modulation of\nantioxidant defence\nsystem', 'Skin cancer\n- increases nuclear NF-\nE2-related factor-2', 'Colon cancer\n- inhibit DNA damage\n- down regulation of\ncyclin D1, Bcl-2 and', 'Bcl-xL expression\n-consumption of 160 gm/d\ntomato sauce decreases LDL\ncholesterol', 'Flavonoids (quercetin,\nkaempferol, rutin),\nphenolic compounds,\norganic acids, vitamins,\nminerals', 'Stinging\nnettle\n(Urtica dioica)\n\uf06c', 'Flavonoids have\nantioxidant and anti-\ninflammatory\nproperties that may\nlimit oxidative\ndamage responsible\nfor some chronic\ndiseases such as\ncancer\n\uf06c\nhas anti-tumor\nactivity against\nprostate cancer\n\uf06c', 'Bioactive Carbonyls', 'Luteolin', 'Broccoli,\

##Parse each block

In [None]:
import pandas as pd

phyto_data = []

for block in blocks:
  lines = block.strip().split('\n')

  # First line prob the compound
  name = lines[0].strip().split()[0]

  #