In [1]:
import pandas as pd
import gc
import re
import textwrap
pd.set_option("display.max_colwidth", 200)

# Statistics about the data

In [2]:
cleaned_data_path = "data/parsed_cleaned_data.csv"

In [3]:
df = pd.read_csv(cleaned_data_path)
df.sample(10)

Unnamed: 0,id,title,ns,text
16719,1620228,K/DA (Teamfight Tactics),0,{{SeeOther|gameplay element|[[Kill to Death Ratio]]}}{{SeeOther|series of alternate future/universe pop music skins|[[Riot Records/KDA|K/DA]]}}<tabber>Set 10={{TFT trait infobox|K/DA|set=10}}== No...
1528,872272,List of champions/Mana regeneration,0,"This list includes each champion's '''base''' mana regeneration value at levels 1 and 18, as well as their mana regeneration growth. Mana regeneration gained by means other than growth are exclude..."
15041,1551179,Patch (League of Legends)/Season Thirteen,0,"{{Outdated if new patch|V13.24}}<tabber>Season Thirteen={| class=""wikitable"" style=""text-align:center; border-collapse:collapse; border:1px solid #444; width:100%; color:white;"" border=""1"" cellspa..."
14771,1541779,Category:Unused Xin Zhao skins,14,This category contains all unused skins related to {{ci|Xin Zhao}}.[[Category:Xin Zhao skins]][[Category:Unused champion skins|Xin Zhao skins]]
3796,1312035,The Spirit Walker,0,"{{Lore header|The Spirit Walker}} ==Biography== Whenever the moon rises into a wintery sky, round and red as blood, the spirit walkers of the {{fi|Freljord}} know that another of their kind has be..."
13932,1539806,Category:Old Wukong centered skins,14,This category contains all old centered skin images related to {{ci|Wukong}}.[[Category:Wukong centered skins]][[Category:Old champion centered skins|Wukong skins]]
14936,1545010,War is Heck/Commando,0,{{Section top}} == Lore == * {{csl|Galio|Commando|size=30px|lore=true}} * {{csl|Gangplank|Special Forces|size=30px|lore=true}} * {{csl|Garen|Commando|size=30px|lore=true}} * {{csl|Jarvan IV|Comman...
13518,1536206,Kinetex Machina (Universe),0,"{{Infobox alternate universe|name = Kinetex Machina|image=<gallery>Blitzcrank ZenithGamesSkin.jpg</gallery>|related = Blitzcrank, Jayce, Lee Sin}}'''Kinetex Machina''' is a series of alternate fut..."
2378,1139623,Category:Urgot ability videos,14,[[Category:Urgot]][[Category:Ability videos]]
5834,1401450,World Rune Archives,0,{{Infobox place|name =|image = <gallery>Shurima Guardians Of Knowledge.jpg|Nasus and Ryze at the entrance of the archives.</gallery>|othername =|nickname =|alias = Ancient Archives|nation = {{fi|S...


In [4]:
df["text"] = df["text"].astype(str)
df["title"] = df["title"].astype(str)

In [5]:
# stats about letters count
df["text"].str.len().describe()

count     17089.000000
mean       3827.348704
std       10497.537494
min           3.000000
25%         143.000000
50%         372.000000
75%        3098.000000
max      531124.000000
Name: text, dtype: float64

In [6]:
# stats about word count
df["text"].str.split().str.len().describe()

count    17089.000000
mean       476.883258
std       1399.983929
min          1.000000
25%         13.000000
50%         39.000000
75%        323.000000
max      82048.000000
Name: text, dtype: float64

In [7]:
df["text"].str.split().str.len().sort_values(ascending=False).head(10)

462      82048
1044     43475
11418    23426
3614     20921
11845    20282
15937    18281
8470     17845
5140     15636
262      15614
190      15596
Name: text, dtype: int64

In [8]:
# clean text and remove html and xml tags

def clean_text(text):

    # get champ before remving {{}}
    # text = re.sub(r'\{\{[^|]*\|([^}]*)\}\}', r'\1', text)

    # Remove {{}} formatting
    text = re.sub(r"\{\{(.*?)\}\}", r"\1", text)
    
    # remove brackets [[item]]
    # text = re.sub(r'\[+([^[\]]+)\]+', r'\1', text)

    # Remove [[]] links
    # text = re.sub(r'\[\[[^]]*\]\]', '', text)
    text = re.sub(r"<ref>.*?</ref>", "", text)
    
    # remove links
    # text = re.sub(r"http[s]?://\S+\s", "", text)

    # Remove <> tags
    text = re.sub(r'<[^>]*>', '', text)
    
    # Remove File: references
    text = re.sub(r'\[\[File:[^\]]*\]\]', '', text)

    # Remove single brackets []
    text = re.sub(r'\[[^]]*\]', '', text)

    # Remove __TOC__ and similar
    text = re.sub(r'__[A-Z]+__', '', text)

    # Remove asterisks
    text = re.sub(r'\*', '', text)
    
    # Remove multiple spaces and newlines
    text = re.sub(r'\s+', ' ', text)
    
    # Remove equal signs
    pattern = r'\=+([^\=]*?)\=+'
    # Replace the matched pattern
    text = re.sub(pattern, r'\1:', text)
    
    # Remove trailing "]"
    text = re.sub(r'[\] ]+$', '', text)
    
    # Trim leading and trailing whitespace
    text = text.strip()

    
    return text

In [9]:
df["text"] = df["text"].apply(clean_text)

In [10]:
def split_text_to_multiple_rows(df, text_column, id_column, word_limit, additional_columns):
    new_rows = []
    
    for _, row in df.iterrows():
        text = row[text_column]
        id_prefix = row[id_column]
        words = text.split()
        
        # Split the text into chunks of 'word_limit' words
        chunks = [words[i:i + word_limit] for i in range(0, len(words), word_limit)]
        
        # Create new rows with updated ids and same values for additional columns
        for idx, chunk in enumerate(chunks):
            new_id = f"{id_prefix}-{idx:03d}"
            new_text = ' '.join(chunk)
            new_row = {id_column: new_id, text_column: new_text}
            for col in additional_columns:
                new_row[col] = row[col]
            new_rows.append(new_row)
    
    # Create a new DataFrame from the list of new rows
    new_df = pd.DataFrame(new_rows)
    
    return new_df


In [11]:
# word_limit depend on which model you are using
# optimal chunk size is 1024 token and tokens are about 0.75 word
df = split_text_to_multiple_rows(df, 'text', 'id', word_limit=int(1024*0.75), additional_columns=['title', 'ns'])
df

Unnamed: 0,id,text,title,ns
0,2030-000,"Portal/LOL Cleanup| Game modes section links to ], which is legacy terminology. Game modes section includes TFT but describes TFT as a ""game available through the League Client"" and the ] page des...",League of Legends,0
1,2030-001,main|Summoner's Rift (League of Legends) '''Summoner's Rift''' resembles the ''Defense of the Ancients'' map with three lanes and supports five players per side. The maps put two teams with a fixe...,League of Legends,0
2,2030-002,These items boost stats of the champion. Some items can be combined into more powerful items by following predefined recipes. ;Original Champion Roster Column|3| {{ci|Alistar ci|Amumu ci|Anivia ci...,League of Legends,0
3,2030-003,- IT'S ON Gameplay Trailer| Champions in Season 2021- Dev Video - League of Legends| Esports in Season 2021- Esports - Riot Games| Gameplay in Season 2021 - Dev Video - League of Legends| Skins & ...,League of Legends,0
4,2032-000,":''This is about the tip|League of Legends player-controlled units, and their depiction in the ]. For other uses of '''Champion''', see ]'' ).]] '''Champions''' are the player-controlled character...",Champion,0
...,...,...,...,...
23033,1631196-000,"PoC item infobox|custom|nameSuccubus's Brand|image:Succubus's Brand LoR relic.png|imagesize200px|type:Relic|desc+1|+1 and when I kill a unit summon a random ].|rarity:Rare Change Log :{| class""art...",Succubus's Brand (The Path of Champions),0
23034,1631197-000,"PoC item infobox|custom|nameVoidborne Carapace|image:Voidborne Carapace LoR relic.png|imagesize200px|type:Relic|desc{{tipLoR|EvolveWhen ANY unit dies, grant me its keywords.|rarity:Rare}} Change L...",Voidborne Carapace (The Path of Champions),0
23035,1631198-000,"PoC item infobox|custom|nameArcane Knowledge|image:06RU043-full.png|imagesize200px|type:Item|descWhen I'm summoned, draw a spell.|rarity:Common|Unitstrue: Change Log {| class:""article-table ruling...",Arcane Knowledge (The Path of Champions),0
23036,1631199-000,"PoC item infobox|custom|nameImmortal|image:08NX028-full.png|imagesize200px|type:Item|desc+1|+0 and {{tipLoR|Deathless.|rarity:Common|Unitstrue}}: Change Log {| class:""article-table ruling-table""! ...",Immortal (The Path of Champions),0


In [12]:
df.to_csv("./data/chunks_data.csv",index=False)