In [1]:
import pandas as pd

# Show up to 200 characters per cell so long wiki text is readable in previews.
pd.options.display.max_colwidth = 200

# Statistics about the data

In [2]:
# Location of the cleaned CSV that this notebook loads with pd.read_csv.
cleaned_data_path = "data/parsed_cleaned_data.csv"

In [3]:
# Load the cleaned data CSV into a DataFrame.
df = pd.read_csv(cleaned_data_path)
# Fixed seed so the 10-row preview is the same on every Restart & Run All.
df.sample(10, random_state=42)

Unnamed: 0,id,title,ns,text
1375,747912,Ohmwrecker,0,{{Item info|automaticgv = true|goldvalue = {{gold value|Gold Value of Passive|nolink=}} * 20% [[movement speed]] = {{g|790}}** '''Total Gold Value''' = {{g|3340}}|goldefficiency = * {{ii|Ohmwrecke...
13581,1538107,Master Yi/LoR/PoC,0,{{PoC champion infobox|championname = Master Yi|campaign0 = {{#invoke:LoRPoCData|getChampCampaign|Master Yi|0|true}}|campaign1 = {{#invoke:LoRPoCData|getChampCampaign|Master Yi|1|true}}|camp...
4067,1331226,Category:Ornn skins,14,This category contains all skins related to {{ci|Ornn}}.[[de:Category:Ornn Splasharts]][[Category:Champion skins]][[Category:Ornn]]
3962,1327552,V7.12,0,"{{Infobox patch |Image = <gallery> Riven DawnbringerSkin.jpg|Riven Yasuo NightbringerSkin.jpg|Yasuo </gallery> |Caption = ''""Not all angels are good.""'' |Highlights = * New Legendary skins: ** ..."
11278,1490054,Yuumi/LoL/History,0,"{{Section top}} {{LoL navigation}} == Previous Abilities == {{Ability|I |name = Bop 'n' Block |icon = Bop 'n' Block.png |description = {{sbc|Innate:}} Periodically, '''Yuumi's'''..."
9915,1481382,Category:Unused Rakan skins,14,This category contains all unused skins related to {{ci|Rakan}}.[[Category:Rakan skins]][[Category:Unused champion skins|Rakan skins]]
14411,1540323,Category:WR Xayah circles,14,This category contains all {{tip|Wild Rift}} circle images related to {{WRc|Xayah}}.[[Category:Xayah]][[Category:WR champion circles|Xayah circles]]
1901,1061391,Category:Anivia skins,14,This category contains all skins related to {{ci|Anivia}}.[[de:Kategorie:Anivia Splasharts]][[fr:Catégorie:Anivia skins]][[Category:Champion skins]][[Category:Anivia]]
14108,1539990,Category:Jayce tiles,14,This category contains all tiles related to {{ci|Jayce}}.[[Category:Jayce]][[Category:Champion tiles]]
15979,1590334,Captain Rowain,0,#REDIRECT:[[Minor_Characters/Piltover#Rowain]]


In [4]:
# Force the two text-like columns to Python strings in a single cast.
# Note: under this cast pandas turns missing values into the literal string "nan".
df = df.astype({"text": str, "title": str})

In [5]:
# Summary statistics of the per-document character count.
char_counts = df["text"].str.len()
char_counts.describe()

count     17089.000000
mean       3827.348704
std       10497.537494
min           3.000000
25%         143.000000
50%         372.000000
75%        3098.000000
max      531124.000000
Name: text, dtype: float64

In [6]:
# Summary statistics of the per-document word count (whitespace-delimited words).
word_counts = df["text"].str.split().str.len()
word_counts.describe()

count    17089.000000
mean       476.883258
std       1399.983929
min          1.000000
25%         13.000000
50%         39.000000
75%        323.000000
max      82048.000000
Name: text, dtype: float64

In [7]:
# The 30 documents with the most whitespace-delimited words, longest first.
words_per_doc = df["text"].str.split().str.len()
words_per_doc.sort_values(ascending=False).iloc[:30]

462      82048
1044     43475
11418    23426
3614     20921
11845    20282
15937    18281
8470     17845
5140     15636
262      15614
190      15596
16674    15584
261      15276
191      15188
571      14861
5942     14816
3838     14330
12314    14001
16865    13911
3114     13889
12291    13367
6845     13233
16828    13157
2328     12982
13229    12706
13118    12176
543      12073
2852     12043
3850     11874
6255     11823
456      11772
Name: text, dtype: int64

In [8]:
def split_text_to_multiple_rows(df, text_column, id_column, word_limit, additional_columns):
    """Split each row's text into chunks of at most ``word_limit`` words.

    Words are whitespace-delimited (``str.split()``) and re-joined with a single
    space, so the original whitespace (newlines, runs of spaces) is not kept.
    Every chunk becomes its own output row whose id is
    ``"<original id>-<NNN>"``, where ``NNN`` is the 0-based chunk index
    zero-padded to at least three digits. A row whose text contains no words
    produces no output rows.

    Args:
        df: Input DataFrame; it is not modified.
        text_column: Name of the column holding the text to split. Values must
            be strings.
        id_column: Name of the column whose value prefixes the new chunk ids.
        word_limit: Maximum number of words per chunk; must be positive.
        additional_columns: Names of columns copied unchanged onto every chunk
            row derived from a source row.

    Returns:
        A new DataFrame with columns ``id_column``, ``text_column`` and the
        additional columns (in that order), one row per chunk. The columns are
        present even when no chunks are produced.

    Raises:
        ValueError: If ``word_limit`` is not positive.
    """
    # A zero step makes range() raise, but a negative one would silently yield
    # no chunks at all and drop every row, so reject both explicitly.
    if word_limit <= 0:
        raise ValueError(f"word_limit must be a positive integer, got {word_limit!r}")

    additional_columns = list(additional_columns)
    # dict.fromkeys de-duplicates while keeping first-seen order.
    output_columns = list(dict.fromkeys([id_column, text_column, *additional_columns]))

    new_rows = []
    # Zip only the columns that are needed instead of building a Series per row
    # with iterrows().
    row_values = zip(
        df[text_column],
        df[id_column],
        *(df[col] for col in additional_columns),
    )
    for text, id_prefix, *extras in row_values:
        words = text.split()
        for idx, start in enumerate(range(0, len(words), word_limit)):
            new_row = {
                id_column: f"{id_prefix}-{idx:03d}",
                text_column: ' '.join(words[start:start + word_limit]),
            }
            new_row.update(zip(additional_columns, extras))
            new_rows.append(new_row)

    # Passing the column list keeps the schema stable when new_rows is empty.
    return pd.DataFrame(new_rows, columns=output_columns)


# Embedding model
There are many embedding models to choose from. The Hugging Face MTEB leaderboard shows which ones perform well, so you can pick one that suits your needs:
https://huggingface.co/spaces/mteb/leaderboard


The model's maximum token count determines the word limit we can use for chunking; as a rule of thumb, one token is about 0.75 words.

In [9]:
# The right chunk size depends on the embedding model being used.
WORD_LIMIT = 5700

# NOTE: this rebinds `df` to its chunked version, so run the cell once per
# kernel session; running it again re-chunks the already-chunked frame and
# appends a second "-NNN" suffix to every id.
df = split_text_to_multiple_rows(
    df,
    text_column='text',
    id_column='id',
    word_limit=WORD_LIMIT,
    additional_columns=['title', 'ns'],
)
df

Unnamed: 0,id,text,title,ns
0,2030-000,"{{Portal/LOL}} {{Cleanup| * Game modes section links to [[Classic]], which is legacy terminology. * Game modes section includes TFT but describes TFT as a ""game available through the League Client...",League of Legends,0
1,2032-000,"<!--{{Stub|This page needs more on the characteristics champions have, or links to the relevant pages for such things.}}--> :''This is about the {{tip|League of Legends}} player-controlled units, ...",Champion,0
2,2040-000,"[[File:Urf OriginalSkin.jpg|thumb|300px|Urf, the Manatee]] : ''Not to be confused with {{tip|Ultra Rapid Fire}}'' : ''For other terms, visit [[Urf (disambiguation)]]'' {{SeeOther|species|[[Manatee...",Urf,0
3,2043-000,{{Section top}}{{Game banner|Nunu|disp_name=Nunu & Willump}}{{LoL navigation}}{{Infobox champion|Nunu|disp_name=Nunu & Willump}}{{LoL navigation mobile}}{{Infobox stats|Nunu}}{{clr}}The champion {...,Nunu/LoL,0
4,2044-000,{{Section top}}{{Game banner|Alistar}}{{LoL navigation}}{{Infobox champion|Alistar}}{{LoL navigation mobile}}{{Infobox stats|Alistar}}{{clr}}== Abilities =={{Data Alistar/I|Ability}}{{Data Alistar...,Alistar/LoL,0
...,...,...,...,...
17319,1631196-000,"<!--The temporary, non-official code for this relic is RX001-->{{PoC item infobox|custom|name=Succubus's Brand|image=Succubus's Brand LoR relic.png|imagesize=200px|type=Relic|desc=<nowiki>+1|+1</n...",Succubus's Brand (The Path of Champions),0
17320,1631197-000,"<!--The temporary, non-official code for this relic is RX002-->{{PoC item infobox|custom|name=Voidborne Carapace|image=Voidborne Carapace LoR relic.png|imagesize=200px|type=Relic|desc={{tipLoR|Evo...",Voidborne Carapace (The Path of Champions),0
17321,1631198-000,"<!--The temporary, non-official code for this item is IX005-->{{PoC item infobox|custom|name=Arcane Knowledge|image=06RU043-full.png|imagesize=200px|type=Item|desc=When I'm summoned, draw a spell....",Arcane Knowledge (The Path of Champions),0
17322,1631199-000,"<!--The temporary, non-official code for this item is IX006-->{{PoC item infobox|custom|name=Immortal|image=08NX028-full.png|imagesize=200px|type=Item|desc=<nowiki>+1|+0</nowiki> and {{tipLoR|Deat...",Immortal (The Path of Champions),0


In [10]:
# import torch.nn.functional as F
# from transformers import AutoModel, AutoTokenizer
# import torch
# from torch.utils.data import DataLoader, TensorDataset

# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# model_path = 'Alibaba-NLP/gte-large-en-v1.5'
# tokenizer = AutoTokenizer.from_pretrained(model_path)
# model = AutoModel.from_pretrained(model_path, trust_remote_code=True,
#                                   unpad_inputs=True,
#                                   use_memory_efficient_attention=True).to(device)

# # Tokenize the input texts
# inputs = tokenizer(df["text"].to_list(), max_length=8192, padding=True, truncation=True, return_tensors='pt')

# # Create a TensorDataset and DataLoader for batching
# dataset = TensorDataset(inputs['input_ids'], inputs['attention_mask'])
# batch_size = 1  # Specify your batch size here
# dataloader = DataLoader(dataset, batch_size=batch_size)

# all_embeddings = []

# with torch.autocast(device_type=device.type, dtype=torch.float16):  # or bfloat16
#     with torch.inference_mode():
#         for batch in dataloader:
#             batch = {k: v.to(device) for k, v in zip(inputs.keys(), batch)}
#             outputs = model(**batch)
#             embeddings = outputs.last_hidden_state[:, 0]
#             all_embeddings.append(embeddings)

# # Concatenate all embeddings
# all_embeddings = torch.cat(all_embeddings, dim=0)

# # (Optionally) normalize embeddings
# # all_embeddings = F.normalize(all_embeddings, p=2, dim=1)
# # scores = (all_embeddings[:1] @ all_embeddings[1:].T) * 100
# # print(scores.tolist())


In [11]:
# !pip install -U sentence-transformers

In [12]:
# "Title: " + df["title"] + ": "

In [13]:
# Preview each row's text with its page title prepended.
title_prefix = "Title: " + df["title"] + ": "
title_prefix + df["text"]

0        Title: League of Legends: {{Portal/LOL}} {{Cleanup| * Game modes section links to [[Classic]], which is legacy terminology. * Game modes section includes TFT but describes TFT as a "game available...
1        Title: Champion: <!--{{Stub|This page needs more on the characteristics champions have, or links to the relevant pages for such things.}}--> :''This is about the {{tip|League of Legends}} player-c...
2        Title: Urf: [[File:Urf OriginalSkin.jpg|thumb|300px|Urf, the Manatee]] : ''Not to be confused with {{tip|Ultra Rapid Fire}}'' : ''For other terms, visit [[Urf (disambiguation)]]'' {{SeeOther|speci...
3        Title: Nunu/LoL: {{Section top}}{{Game banner|Nunu|disp_name=Nunu & Willump}}{{LoL navigation}}{{Infobox champion|Nunu|disp_name=Nunu & Willump}}{{LoL navigation mobile}}{{Infobox stats|Nunu}}{{cl...
4        Title: Alistar/LoL: {{Section top}}{{Game banner|Alistar}}{{LoL navigation}}{{Infobox champion|Alistar}}{{LoL navigation mobile}}{{Infobox stats|Alistar}}{

In [14]:

from sentence_transformers import SentenceTransformer
from sentence_transformers.util import cos_sim
import torch
import torch.nn.functional as F


# Load the embedding model with bfloat16 weights.
# trust_remote_code=True lets the model repository's own code run locally.
model = SentenceTransformer('Alibaba-NLP/gte-base-en-v1.5', trust_remote_code=True, model_kwargs={"torch_dtype":torch.bfloat16})

# `encode` is documented to take a string or a list of strings. Hand it a plain
# list rather than a pandas Series so that item access inside the library never
# depends on the DataFrame's index labels.
texts_to_embed = ("Title: " + df["title"] + ": " + df["text"]).tolist()
embeddings = model.encode(texts_to_embed, batch_size=1, show_progress_bar=True)


  from tqdm.autonotebook import tqdm, trange
2024-06-21 16:24:00.894782: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-06-21 16:24:00.921221: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-06-21 16:24:00.921247: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-06-21 16:24:00.922033: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-06-21 16:24:00.926359

Batches:   0%|          | 0/17324 [00:00<?, ?it/s]

1.2265625