## Imports

In [None]:
import os

import pandas as pd

from langchain_community.document_loaders import BSHTMLLoader
from langchain_community.vectorstores import FAISS
from langchain_text_splitters import RecursiveCharacterTextSplitter

from src.config import config

## Setup

In [None]:
# Notebook-wide pandas configuration: enable copy-on-write and cap how much
# of a DataFrame is rendered when it is displayed.
pd.options.mode.copy_on_write = True
pd.options.display.max_columns = 10
pd.options.display.max_rows = 5

## Load data

In [None]:
# Read both input tables from the CSV paths given by the project config.
ground_truth_df, search_engine_results_df = map(
    pd.read_csv,
    (
        config.GROUND_TRUTH_DATASET_PATH,
        config.SEARCH_ENGINE_RESULTS_DATASET_PATH,
    ),
)

## Clean data

Remove the statements that have no evidence in `search_engine_results_df` from `ground_truth_df`

In [None]:
# Keep only the statements that have at least one row of evidence, i.e. whose
# statement_id occurs as a fact_check_id in search_engine_results_df.
#
# A single vectorised membership test replaces the per-id scan of the whole
# results table (and the loop variable that shadowed the builtin `id`).
# dropna() preserves the `==` semantics of the per-id filter: a missing
# fact_check_id never counts as evidence for any statement.
evidence_ids = search_engine_results_df.fact_check_id.dropna()
has_evidence = ground_truth_df.statement_id.isin(evidence_ids)

# Statement ids without any evidence, kept as a plain list for inspection.
bad_ids = ground_truth_df.statement_id[~has_evidence].tolist()

clean_ground_truth_df = ground_truth_df[has_evidence]

## Build vector store

Collect the unique UUIDs of the cached URL files

In [None]:
# Collect the result uuids belonging to every remaining statement, in the
# order the statements appear in clean_ground_truth_df.
ids = clean_ground_truth_df.statement_id

# Group the result uuids by fact_check_id once, instead of filtering the whole
# results table again for every statement id.
uuids_by_id = {
    fact_check_id: group
    for fact_check_id, group
    in search_engine_results_df.groupby('fact_check_id', sort=False)['result_uuid']
}
# Empty series with the right dtype, used when an id has no results at all.
no_uuids = search_engine_results_df['result_uuid'].iloc[:0]

uuids = []
for statement_id in ids:
    # Get matching uuids
    matching_uuids = uuids_by_id.get(statement_id, no_uuids)
    if len(matching_uuids) < 10:
        print(f"WARN less than 10 urls for ID: {statement_id}")
    # Collect uuids
    uuids.append(matching_uuids)

# pd.concat raises ValueError on an empty list, so fall back to the empty
# series when there are no statements to process.
all_uuids = pd.concat(uuids, ignore_index=True) if uuids else no_uuids
uuids_series = all_uuids.drop_duplicates()
print(f"Unique uuids: {len(uuids_series)}")

Collect docs for the vector store

In [None]:
# Load every cached HTML page, split it into chunks and gather all chunks in
# result_docs.

# The splitter settings do not change between files, so build the splitter
# once instead of once per uuid.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=config.TEXT_SPLITTER_CHUNK_SIZE,
    chunk_overlap=config.TEXT_SPLITTER_CHUNK_OVERLAP
)

result_docs = []

# `file_uuid` rather than `uuid`, which is also the name of a stdlib module.
for file_uuid in uuids_series:
    # Load from HTML file
    html_path = os.path.join(config.CACHED_URLS_PATH, file_uuid + '.html')
    page_docs = BSHTMLLoader(html_path).load()

    # Split and add to result
    result_docs.extend(text_splitter.split_documents(page_docs))

Build the vector store

In [None]:
# Build the FAISS vector store from the collected chunks, using the
# embeddings object supplied by the project config.
embeddings = config.get_embeddings()
vector_store = FAISS.from_documents(result_docs, embeddings)

## Save clean data and vector store

Save vector store

In [None]:
# Persist the vector store into the folder configured for the
# all-evidence vector store.
vector_store.save_local(folder_path=config.ALL_EVIDENCE_VECTOR_STORE_PATH)

Save clean data

In [None]:
# Write the cleaned ground-truth table as ground_truth.csv into the same
# folder as the vector store.
output_folder = config.ALL_EVIDENCE_VECTOR_STORE_PATH
# Create the folder so this cell also works when it does not exist yet.
os.makedirs(output_folder, exist_ok=True)
file_path = os.path.join(output_folder, 'ground_truth.csv')

# errors='ignore' keeps the cell re-runnable: once the column has been
# dropped, a second run would otherwise fail with a KeyError.
clean_ground_truth_df = (
    clean_ground_truth_df
    .drop(columns=['Unnamed: 0'], errors='ignore')
    .reset_index(drop=True)
)

# NOTE(review): to_csv writes the index as an unnamed first column, which
# read_csv loads back as 'Unnamed: 0'. Left unchanged in case downstream
# readers expect that column; confirm before switching to index=False.
clean_ground_truth_df.to_csv(file_path)