# Laboratory work #5 (vector database search)

In [None]:
import pandas as pd
import re

from tqdm import tqdm
import nltk
nltk.download('wordnet')
tqdm.pandas()

from sentence_transformers import SentenceTransformer
from db_utils import ChromaDataBase

In [None]:
# Seed passed as `random_state` to the pandas sampling/shuffling calls in this notebook.
random_seed = 42

The dataset-loading code below is adapted from https://www.kaggle.com/code/therealsampat/fake-news-detection.

In [None]:
# Read both source files, then attach the target label: 0 for Fake.csv rows, 1 for True.csv rows.
df_fake, df_true = pd.read_csv('../data/Fake.csv'), pd.read_csv('../data/True.csv')

df_fake['class'], df_true['class'] = 0, 1

In [None]:
# Keep only the first row for every distinct article body, separately in each source.
df_fake, df_true = (frame.drop_duplicates(subset=['text']) for frame in (df_fake, df_true))

In [None]:
# Display the (rows, columns) shape of the fake and the true frame side by side.
df_fake.shape, df_true.shape

In [None]:
# Stack the fake rows on top of the true rows (row-wise concatenation; original
# index labels are kept), then preview ten reproducibly sampled rows.
df_merge = pd.concat((df_fake, df_true))
df_merge.sample(n=10, random_state=random_seed)

In [None]:
# Only the article body and the label are needed from here on; drop the rest
# and display the number of missing values per remaining column.
df = df_merge.drop(columns=['title', 'subject', 'date'])
df.isna().sum()

In [None]:
# Shuffle all rows reproducibly and renumber them 0..n-1, discarding the old index labels.
df = df.sample(frac=1, random_state=random_seed).reset_index(drop=True)

In [None]:
# Assign every row to a split by its index label: the first 80 % are 'train',
# the next 10 % 'val', the remainder 'test'. This relies on the index being
# 0..n-1, which holds because the index is reset right after the shuffle.
df['part'] = 'train'
n = len(df)
train_n = int(n * 0.8)
val_n = int(n * 0.1)
test_n = n - train_n - val_n
# Half-open ranges [train_n, train_n + val_n) and [train_n + val_n, n).
# The lower bound must be inclusive: with a strict `<`, row `train_n` stayed in
# 'train', giving train_n + 1 training rows and only val_n - 1 validation rows.
df.loc[(train_n <= df.index) & (df.index < train_n + val_n), 'part'] = 'val'
df.loc[train_n + val_n <= df.index, 'part'] = 'test'

In [None]:
# Shuffle once more (same seed) so rows are no longer ordered by index, then
# renumber them 0..n-1 and discard the previous index labels.
df = df.sample(frac=1, random_state=random_seed).reset_index(drop=True)

In [None]:
# Preview the first ten rows of the shuffled frame.
df.head(10)

In [None]:
# Number of rows in each split, displayed as a (train, val, test) tuple.
tuple(len(df[df['part'] == part]) for part in ('train', 'val', 'test'))

In [None]:
# Save the first ten rows as a small sample file, without writing the index column.
df.head(10).to_csv('../data/sample.csv', index=False)

In [None]:
# Print index, text and class for the leading rows; stop at the first row whose
# index label exceeds 10.
for index, row in df.iterrows():
    if index <= 10:
        print(index, row['text'], row['class'], '\n')
    else:
        break

In [None]:
# Pick one known article (it contains tweets, hashtags, handles and URLs) as the
# running example for the tokenization cells.
# The row is selected with a boolean mask and the column by name: feeding index
# labels to the positional `.iloc` is only correct while the index is exactly
# 0..n-1 and 'text' happens to be the first column.
example_text = df.loc[df['text'] == 'Boos and chants of  Lock her up!  were heard in the crowd assembled at the West Front of the U.S. Capitol Friday morning when defeated Democratic Party presidential nominee Hillary Clinton was introduced at the inaugural ceremony for President-elect Donald Trump.#InaugurationDay Lock her up pic.twitter.com/APVtyyYote  Bill Simms (@Mittens1245) January 20, 2017The crowd on the mall booed when the jumbotron showed a close-up shot of Hillary Clinton at #Inauguration https://t.co/1dvY5lxdKo  gpbnews (@gpbnews) January 20, 2017Some in crowd chanting LOCK HER UP as Hillary Clinton arrives  Jamie Dupree (@jamiedupree) January 20, 2017Via: Gateway Pundit ', 'text'].values[0]
print(example_text)

We need to take care of user handles (e.g. @jamiedupree should be kept as a single, separate token) and hashtags (#Inauguration is one token here). We also want to keep web addresses as single tokens (e.g. pic.twitter.com/APVtyyYote or https://t.co/1dvY5lxdKo).

In [None]:
def split_into_sentences(text):
    """Split raw text into a list of stripped, non-empty sentences.

    A boundary is a zero-width position right after ``.``, ``?`` or ``!`` that is
    followed by whitespace or ``#``. Positions after abbreviations such as
    ``U.S.`` / ``e.g.`` or after a capitalised two-letter title such as ``Mr.``
    are not boundaries. Dots inside URLs are never followed by whitespace or
    ``#``, so URLs stay in one piece.

    Args:
        text: the text to split.

    Returns:
        List of sentence strings with surrounding whitespace removed; pieces
        that are empty after stripping are dropped.
    """
    boundary = (
        r'(?<!\w\.\w.)'       # not directly after e.g. "U.S." or "i.e."
        r'(?<![A-Z][a-z]\.)'  # not directly after e.g. "Mr."
        r'(?<=\.|\?|!)'       # directly after sentence-ending punctuation
        r'(?=\s|[#])'         # directly before whitespace or a hashtag
    )
    stripped_pieces = (piece.strip() for piece in re.split(boundary, text))
    return [piece for piece in stripped_pieces if piece]

# Try the sentence splitter on the example article and print one sentence per line.
sentences = split_into_sentences(example_text)
for sentence in sentences:
    print(sentence)

In [None]:
def split_into_words(sentences):
    """Tokenize each sentence into a list of string tokens.

    Token kinds, in order of precedence: ``pic.twitter.com/...`` links,
    ``http(s)://`` URLs, ``www.`` URLs, hashtags, ``@handles``, words
    (optionally hyphenated and with at most one apostrophe), leftover runs of
    word/apostrophe/hyphen characters, and the standalone punctuation marks
    ``. , ! ? ;``. Characters that match none of these (whitespace, brackets,
    colons, ...) are skipped.

    Args:
        sentences: iterable of sentence strings.

    Returns:
        A list holding one list of tokens per input sentence, in input order.
    """
    word_pattern = re.compile(
        r'pic\.twitter\.com/\S+'       # dots escaped so only the literal host matches
        r'|https?://\S+'
        r'|www\.\S+'
        r'|\#\S+'                      # hashtag
        r'|\@\w+'                      # user handle
        # Word, with hyphen-joined parts kept together ("close-up") and an
        # optional apostrophe ("don't"). Without the hyphen group the word
        # alternative stopped at the hyphen and "-up" became its own token.
        r"|\b\w+(?:-\w+)*'?\w*"
        r"|[\w'-]+"                    # leftovers starting with ' or -
        r'|[.,!?;]'                    # standalone punctuation
    )
    return [word_pattern.findall(sentence) for sentence in sentences]

# Tokenize the example sentences and print the token list of each sentence.
tokenized = split_into_words(sentences)
for tokens in tokenized:
    print(tokens)

In [None]:
def process_text(text):
    """Run the full preprocessing pipeline on one document.

    The text is first split into sentences with ``split_into_sentences`` and
    every sentence is then tokenized with ``split_into_words``.

    Args:
        text: raw document text.

    Returns:
        List of token lists, one per sentence.
    """
    return split_into_words(split_into_sentences(text))

In [None]:
# Tokenize every document: each cell of 'sentences' holds the list of token lists
# returned by process_text for that row's text.
df['sentences'] = df['text'].apply(process_text)

In [None]:
# Display the frame with the new 'sentences' column.
df

In [None]:
# Load the sentence encoder. No device is forced: SentenceTransformer selects a
# GPU when one is available and otherwise falls back to the CPU, so this cell no
# longer fails on machines without CUDA (the previous `.to('cuda')` did).
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

In [None]:
def vectorize_sentences(sentences):
    """Embed the tokenized sentences of one document with the global ``model``.

    Args:
        sentences: list of token lists, one list per sentence.

    Returns:
        The result of ``model.encode`` on the space-joined sentences
        (presumably one embedding row per sentence -- verify against the
        sentence-transformers version in use).
    """
    sentence_strings = list(map(' '.join, sentences))
    return model.encode(sentence_strings)

In [None]:
# Continue with the first 5000 documents only (presumably to keep the encoding
# step affordable). `.copy()` makes the subset an independent frame, so adding
# columns to it afterwards does not raise pandas' SettingWithCopyWarning.
df = df.iloc[:5000, :].copy()

In [None]:
# Encode every document's sentences (with a tqdm progress bar); each cell of
# 'sentence_vectors' holds the vectorize_sentences result for that row.
df['sentence_vectors'] = df['sentences'].progress_apply(vectorize_sentences)

In [None]:
# First row: length of the second-to-last column value and shape of the last
# column value (the 'sentences' and 'sentence_vectors' columns added above).
len(df.iloc[0, -2]), df.iloc[0, -1].shape

In [None]:
# Preview the first rows including the new columns.
df.head()

In [None]:
# Flatten documents into sentences: one space-joined string per tokenized
# sentence, in document order.
texts = [
    ' '.join(tokens)
    for tokenized_document in df['sentences']
    for tokens in tokenized_document
]

In [None]:
# Flatten the per-document embedding arrays into one list of plain Python lists,
# one entry per sentence, in document order.
embeddings = [
    vector
    for document_vectors in df['sentence_vectors']
    for vector in document_vectors.tolist()
]

In [None]:
# Spot-check that flattening kept the order: the first component of the second
# vector in the first row's last column must equal the first component of the
# second flattened embedding.
# NOTE(review): assumes the first document has at least two sentences -- confirm.
assert df.iloc[0, -1][1][0] == embeddings[1][0]

In [None]:
# One metadata dict per sentence, carrying the class label of the document the
# sentence belongs to.
metadatas = [
    {'class': document['class']}
    for _, document in df.iterrows()
    for _sentence in document['sentences']
]

In [None]:
# One id per sentence, built from the document's index label and the sentence's
# position inside that document.
ids = [
    f'doc_{i}/sen_{j}'
    for i, document in df.iterrows()
    for j in range(len(document['sentences']))
]

In [None]:
# All four parallel lists must describe the same number of sentences.
assert len({len(texts), len(embeddings), len(metadatas), len(ids)}) == 1

In [None]:
# Total number of sentence ids built above.
len(ids)

In [None]:
# Create the project's database wrapper (imported from db_utils) with its defaults.
# NOTE(review): where the data is stored and which collection is used is decided
# inside ChromaDataBase and is not visible here.
db = ChromaDataBase()

In [None]:
# Hand the four parallel lists (embeddings, texts, metadata dicts, ids) to the wrapper.
# NOTE(review): whether re-running this cell duplicates or overwrites entries
# depends on ChromaDataBase.add -- confirm.
db.add(embeddings, texts, metadatas, ids)

In [None]:
# Number of entries in the wrapper's collection (presumably equal to len(ids)
# after a fresh run -- confirm).
db.collection.count()

In [None]:
# Query by raw text and ask for the five best matches.
# NOTE(review): how the query text is embedded is decided inside ChromaDataBase.query;
# confirm it uses the same model as the stored embeddings.
result = db.query(
    query_texts=['Catalan pro-independence party'],
    n_results=5
)
result

In [None]:
# Query with an explicitly computed, normalized embedding from `model`.
# NOTE(review): the stored sentence vectors were encoded without passing
# normalize_embeddings; confirm both sides end up on the same scale.
result = db.query(
    query_embeddings=[model.encode('Slovenian forests', normalize_embeddings=True).tolist()],
    n_results=5,
)
result

In [None]:
# Text query combined with two filters: a metadata filter on class 0 (the label
# given to rows from Fake.csv) and a document filter using '$contains': 'US'.
# NOTE(review): presumably a case-sensitive substring match on the stored text -- confirm.
result = db.query(
    query_texts=['elections'],
    n_results=5,
    where={'class': 0},
    where_document={'$contains': 'US'}
)
result