In [34]:
import faiss
import pandas as pd
import os
from tqdm import tqdm

In [2]:
df1 = pd.read_parquet("./wikipedia stem/0_to_25000.parquet")
df1.head()

Unnamed: 0,text,url,title
0,"Becurtovirus is a genus of viruses, in the fam...",https://en.wikipedia.org/wiki/Becurtovirus,Becurtovirus
1,Cyprinivirus is a genus of viruses in the orde...,https://en.wikipedia.org/wiki/Cyprinivirus,Cyprinivirus
2,"Glossinavirus is a genus of viruses, in the fa...",https://en.wikipedia.org/wiki/Glossinavirus,Glossinavirus
3,"Ichtadenovirus is a genus of viruses, in the f...",https://en.wikipedia.org/wiki/Ichtadenovirus,Ichtadenovirus
4,"Lambdatorquevirus is a genus of viruses, in th...",https://en.wikipedia.org/wiki/Lambdatorquevirus,Lambdatorquevirus


In [7]:
df1['text'].apply(lambda x: len(x.split(' '))).max()

20829

In [8]:
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer

model_name = 'sentence-transformers/all-MiniLM-L12-v2'
model = SentenceTransformer(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

Downloading (…)5dded/.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)4d81d5dded/README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Downloading (…)81d5dded/config.json:   0%|          | 0.00/573 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading (…)ded/data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/134M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)5dded/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/352 [00:00<?, ?B/s]

Downloading (…)dded/train_script.py:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

Downloading (…)4d81d5dded/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)1d5dded/modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/352 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [32]:
def split_text_into_segments(text, max_length, stride, category):
    tokens = tokenizer.tokenize(text)
    segments = []
    categories = []

    start = 0
    while start < len(tokens):
        if start + max_length > len(tokens):
            segment_tokens = tokens[start:]
            segment = tokenizer.convert_tokens_to_string(segment_tokens)
            segments.append(segment)
            categories.append(category)
            break
        else:
            end = start + max_length
            segment_tokens = tokens[start:end]
            segment = tokenizer.convert_tokens_to_string(segment_tokens)
            segments.append(segment)
            categories.append(category)
            start = end - stride

    return segments, categories

In [21]:
long_text = "I love United States. This is a great country."
print(tokenizer.tokenize(long_text))
print(tokenizer.convert_tokens_to_string(tokenizer.tokenize(long_text)))

['i', 'love', 'united', 'states', '.', 'this', 'is', 'a', 'great', 'country', '.']
i love united states. this is a great country.


In [29]:
long_text = df1.loc[0, "text"]
stride = 64 
max_length = 384 

segments, categories = split_text_into_segments(long_text, max_length, stride)
print(segments)
print(len(segments))

['becurtovirus is a genus of viruses, in the family geminiviridae. dicotyledonous plants serve as natural hosts. there are three species in this genus. taxonomy the following three species are assigned to the genus : beet curly top iran virus exomis microphylla latent virus spinach curly top arizona virus structure viruses in becurtovirus are non - enveloped, with icosahedral geometries, and t = 1 symmetry. genomes are circular and non - segmented, around 3. 0kb in length. life cycle viral replication is nuclear. entry into the host cell is achieved by penetration into the host cell. replication follows the ssdna rolling circle model. dna - templated transcription is the method of transcription. the virus exits the host cell by nuclear pore export, and tubule - guided viral movement. dicotyledonous plants serve as the natural host. references external links viralzone : becurtovirus ictv geminiviridae virus genera']
1


In [35]:
chunked_context = []
corresponding_categories = []

for filename in os.listdir("./wikipedia stem/"):
    filepath = os.path.join("./wikipedia stem/", filename)
    print(f"Splitting file {filepath}")
    df = pd.read_parquet(filepath)

    for index, row in tqdm(df.iterrows(), total=len(df)):
        context = row["text"]
        category = row["title"]
        max_length = 384
        stride = 128

        segments, categories = split_text_into_segments(context, max_length, stride, category)
        chunked_context.extend(segments)
        corresponding_categories.extend(categories)

print(len(chunked_context), len(corresponding_categories))

100%|██████████| 25000/25000 [02:06<00:00, 197.94it/s]
100%|██████████| 25000/25000 [02:27<00:00, 169.95it/s]
100%|██████████| 6049/6049 [00:36<00:00, 166.09it/s]
100%|██████████| 25000/25000 [02:14<00:00, 186.22it/s]
100%|██████████| 25000/25000 [03:07<00:00, 133.10it/s]
100%|██████████| 25000/25000 [02:28<00:00, 168.70it/s]

541776 541776





In [36]:
df = pd.DataFrame({"context": chunked_context, "title": corresponding_categories})
df.shape

(541776, 2)

In [37]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 541776 entries, 0 to 541775
Data columns (total 2 columns):
 #   Column   Non-Null Count   Dtype 
---  ------   --------------   ----- 
 0   context  541776 non-null  object
 1   title    541776 non-null  object
dtypes: object(2)
memory usage: 8.3+ MB
