In [3]:
import os, re, unicodedata, ftfy, pandas as pd
from langchain.document_loaders import PDFMinerLoader
from tqdm.auto import tqdm

def clean_metadata(text: str) -> str:
    text = ftfy.fix_text(text)
    text = unicodedata.normalize("NFC", text)
    text = "".join(ch for ch in text if unicodedata.category(ch)[0] != "C")
    return text

def clean_text(text: str) -> str:
    text = ftfy.fix_text(text)
    text = unicodedata.normalize("NFC", text)
    text = "".join(ch for ch in text if unicodedata.category(ch)[0] != "C")
    return re.sub(r"\s+", " ", text).strip()

def resolve_pdf_path(base_dir, filename):
    return os.path.join(base_dir, "pdf", filename)

df = pd.read_csv("metadata.csv")
# Clean every textual column in-place
for col in [c for c in df.columns if df[c].dtype == "object"]:
    df[col] = df[col].astype(str).map(clean_metadata)

i = 0
records = []
for _, row in tqdm(df.iterrows(), total=len(df), desc="PDF→pages"):
    pdf_path = resolve_pdf_path(row["source_dir"], row["PDF Filename"])
    if not os.path.isfile(pdf_path):
        continue

    fulltext = ""
    for p_idx, page in enumerate(PDFMinerLoader(pdf_path).load(), 1):
        fulltext += page.page_content
    records.append({
        **row.to_dict(),
        "fulltext": clean_text(fulltext)
    })

    i+=1

    if i == 10:
        break



PDF→pages:   0%|          | 9/2504 [00:49<3:47:14,  5.46s/it]


In [4]:

page_df = pd.DataFrame.from_records(records)

In [8]:
page_df = page_df.rename(columns= {'Unnamed: 0': "ID"})

In [9]:
page_df

Unnamed: 0,ID,Title,Summary,Publication Date,Publication Type,Article URL,Article Filename,PDF URL,PDF Filename,office,source_dir,fulltext
0,0,E-kerosene providers call upon the German gove...,"T&E, EDL, Norsk e-fuel, Arcadia e-fuels, Caphe...","Sep 16, 2024, 12:00:00 AM",Letter,https://www.transportenvironment.org/articles/...,www.transportenvironment.org_articles_e-kerose...,https://www.transportenvironment.org/uploads/f...,e_kerosene_providers_call_upon_the_german_gove...,Brussels,./scraping/brussels,Open letter: Retaining Germany's national e-ke...
1,1,E-kerosene providers call upon the German gove...,"T&E, EDL, Norsk e-fuel, Arcadia e-fuels, Caphe...","Sep 16, 2024, 12:00:00 AM",Letter,https://www.transportenvironment.org/articles/...,www.transportenvironment.org_articles_e-kerose...,https://www.transportenvironment.org/uploads/f...,e_kerosene_providers_call_upon_the_german_gove...,Brussels,./scraping/brussels,"Prof. Dr. E. Pache, Domerschulstraße 16, D-970..."
2,2,Industry and NGOs calls for swift adoption of ...,T&E and three industry associations call on th...,"Sep 11, 2024, 2:13:00 PM",Letter,https://www.transportenvironment.org/articles/...,www.transportenvironment.org_articles_industry...,https://www.transportenvironment.org/uploads/f...,industry_and_ngos_calls_for_swift_adoption_of_...,Brussels,./scraping/brussels,To: Hungarian Presidency of the Council of the...
3,3,What Strategic Projects to select,T&E recommendations on the implementation of t...,"Sep 9, 2024, 11:00:00 AM",Briefing,https://www.transportenvironment.org/articles/...,www.transportenvironment.org_articles_what-str...,https://www.transportenvironment.org/uploads/f...,what_strategic_projects_to_select_briefing_wha...,Brussels,./scraping/brussels,BRIEFING - September 2024What Strategic Projec...
4,4,The corporate sector continues to lag behind p...,The EU's corporate car market stagnation is ex...,"Sep 3, 2024, 10:23:00 AM",Press Release,https://www.transportenvironment.org/articles/...,www.transportenvironment.org_articles_the-corp...,https://www.transportenvironment.org/uploads/f...,the_corporate_sector_continues_to_lag_behind_p...,Brussels,./scraping/brussels,Unveiling Europe's corporatecar problemHow the...
5,5,Mobility policy needs systems thinking,System thinking is badly needed in mobility po...,"Sep 3, 2024, 12:00:00 AM",Opinion,https://www.transportenvironment.org/articles/...,www.transportenvironment.org_articles_mobility...,https://mahb.stanford.edu/wp-content/uploads/2...,mobility_policy_needs_systems_thinking_yale_pu...,Brussels,./scraping/brussels,DOI: 10.1111/jiec.13084R E S E A R C H A N D A...
6,6,Mobility policy needs systems thinking,System thinking is badly needed in mobility po...,"Sep 3, 2024, 12:00:00 AM",Opinion,https://www.transportenvironment.org/articles/...,www.transportenvironment.org_articles_mobility...,https://docs.iza.org/dp16855.pdf,mobility_policy_needs_systems_thinking_dp16855...,Brussels,./scraping/brussels,DISCUSSION PAPER SERIESIZA DP No. 16855Working...
7,7,Mobility policy needs systems thinking,System thinking is badly needed in mobility po...,"Sep 3, 2024, 12:00:00 AM",Opinion,https://www.transportenvironment.org/articles/...,www.transportenvironment.org_articles_mobility...,https://113.wpcdnnode.com/ariebleijenberg.nl/w...,mobility_policy_needs_systems_thinking_transpo...,Brussels,./scraping/brussels,Pagina 1 van 16 The transport-urbanisation dia...
8,8,How EU states can tackle unsustainable biofuel...,RED III implementation for Europe's member states,"Sep 2, 2024, 5:00:00 PM",Briefing,https://www.transportenvironment.org/articles/...,www.transportenvironment.org_articles_how-eu-s...,https://www.transportenvironment.org/uploads/f...,how_eu_states_can_tackle_unsustainable_biofuel...,Brussels,./scraping/brussels,BRIEFING - September 2024RED III implementatio...
9,9,Mining waste: time for the EU to clean up,A new legal analysis makes the case for a revi...,"Aug 26, 2024, 12:00:00 AM",Briefing,https://www.transportenvironment.org/articles/...,www.transportenvironment.org_articles_mining-w...,https://www.transportenvironment.org/uploads/f...,mining_waste_time_for_the_eu_to_clean_up_brief...,Brussels,./scraping/brussels,BRIEFING - AUGUST 2024Mining waste: time for t...


In [10]:

page_df.to_parquet("output/metadata_with_fulltext.parquet", index=False)
page_df.to_csv("output/metadata_with_fulltext.csv", index=False)

In [17]:
import hashlib, os, json, faiss, pandas as pd
from tqdm.auto import tqdm
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter

### CONFIG ###
PARQUET_PATH   = "output/metadata_with_fulltext.parquet"           # produced in the previous step
INDEX_OUT      = "output/docs.index"
MAP_OUT        = "output/faiss_mapping.parquet"
MODEL_NAME     = "sentence-transformers/all-MiniLM-L6-v2"
CHUNK_SIZE     = 1024                       # characters
CHUNK_OVERLAP  = 64
################

# 1. Load pages
pages = pd.read_parquet(PARQUET_PATH)

# 2. Prepare splitter & embedder
splitter  = RecursiveCharacterTextSplitter(chunk_size=CHUNK_SIZE,
                                           chunk_overlap=CHUNK_OVERLAP,
                                            add_start_index  = True)        # <-- NEW)
embedder  = HuggingFaceEmbeddings(model_name=MODEL_NAME)


In [18]:
# 3. Build FAISS index progressively
print(embedder.embed_query("test"))

[0.01157346274703741, 0.0251361932605505, -0.03670189157128334, 0.059324879199266434, -0.0071490490809082985, -0.04119426757097244, 0.07708740234375, 0.03744257986545563, 0.012449024245142937, -0.00611764844506979, 0.017034266144037247, -0.07701534777879715, -0.00039417133666574955, 0.027909060940146446, -0.015989137813448906, -0.06827525049448013, 0.008884693495929241, -0.020280746743083, -0.08035992830991745, -0.013074045069515705, -0.04109999164938927, -0.02589806169271469, -0.026538675650954247, 0.03305229917168617, -0.022079166024923325, 0.021046113222837448, -0.05792200192809105, 0.03294878453016281, 0.02970738708972931, -0.06224842742085457, 0.03878801316022873, 0.031990695744752884, 0.015330800786614418, 0.0453069694340229, 0.05314943194389343, 0.013360688462853432, 0.04122495651245117, 0.028142889961600304, 0.019398432224988937, -0.003252344438806176, -0.0036123283207416534, -0.1428602635860443, 0.03807120397686958, -0.010916150175035, 0.026094019412994385, 0.04136991873383522

In [None]:
from langchain.schema import Document   # if you need an empty shell
import numpy as np
dim    = len(embedder.embed_query("test"))   # vector length
index  = faiss.IndexFlatL2(dim)                  # no-frills L2 index
mapping_rows = []
seen_hashes  = set()

vec_id = 0
for _, row in tqdm(pages.iterrows(), total=len(pages), desc="chunk→embed"):
    page_text = Document(page_content=row["fulltext"], metadata={})
    # 3.1 Split page into overlapping chunks
    chunks = splitter.split_documents([page_text])

    for doc in chunks:

        chunk     = doc.page_content
        start     = doc.metadata["start_index"]
        end       = start + len(chunk)

        # 3.2 Deduplicate by content hash
        h = hashlib.sha256(chunk.encode("utf-8")).hexdigest()
        if h in seen_hashes:          # already embedded identical text
            continue
        seen_hashes.add(h)


        # 3.3 Embed & add to FAISS
        vec = embedder.embed_documents([chunk])
        index.add(np.array(vec))

        # 3.4 Remember how to find this chunk again
        mapping_rows.append({
            "vector_id"  : vec_id,
            "doc_id"     : row["ID"],   # or your own identifier
            "start_char" : start,
            "end_char"   : end
        })
        vec_id += 1

# 4. Persist
faiss.write_index(index, INDEX_OUT)
pd.DataFrame(mapping_rows).to_parquet(MAP_OUT, index=False)

print(f"✅  Index with {index.ntotal:,} unique chunks written to {INDEX_OUT}")
print(f"✅  Mapping file with {len(mapping_rows):,} rows written to {MAP_OUT}")


chunk→embed:   0%|          | 0/10 [00:00<?, ?it/s]

chunk→embed: 100%|██████████| 10/10 [00:40<00:00,  4.10s/it]

✅  Index with 340 unique chunks written to output/docs.index
✅  Mapping file with 340 rows written to output/faiss_mapping.parquet





In [None]:
# ── 0.  pip installs ──────────────────────────────────────────────
# pip install -qU faiss-cpu pandas pyarrow langchain sentence-transformers

# ── 1.  imports & paths ───────────────────────────────────────────
import pandas as pd
from pathlib import Path
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS               # wrapper


# ── 2.  load everything back ──────────────────────────────────────
embeddings = HuggingFaceEmbeddings(model_name=MODEL_NAME)

vectorstore  = FAISS.load_local(INDEX_OUT, embeddings)
index       = vectorstore.index                              # native faiss.Index

mapping = pd.read_parquet(MAP_OUT).set_index("vector_id")
pages   = (pd.read_parquet(PARQUET_PATH)
             .set_index("ID")                    # “ID” column in your mapping
           )

# ── 3.  helper: turn query → top-k chunks  ────────────────────────
def search(query: str, k: int = 5):
    q_vec = embeddings.embed_query(query)
    D, I  = index.search(q_vec, k)               # I = [vec_ids],  D = L2 distances

    # gather rows that tell us where each vector came from
    hits = mapping.loc[I[0]].copy()
    hits["score"] = D[0]

    # slice the original full-text back into the exact chunk
    def extract_chunk(row):
        text = pages.loc[row["doc_id"], "fulltext"]
        return text[row["start_char"] : row["end_char"]]

    hits["chunk_text"] = hits.apply(extract_chunk, axis=1)
    return hits[["score", "doc_id", "chunk_text"]]

# ── 4.  example run  ───────────────────────────────────────────────
df = search("hydrogen refuelling stations in Belgium", k=3)
print(df.to_markdown(index=False, tablefmt="github"))


AttributeError: 'list' object has no attribute 'shape'