In [39]:
import pandas as pd
import gc
import re
import textwrap
pd.set_option("display.max_colwidth", 200)

# Statistics about the data

In [40]:
cleaned_data_path = "data/parsed_cleaned_data.csv"

In [41]:
df = pd.read_csv(cleaned_data_path)
df.sample(10)

Unnamed: 0,id,title,ns,text
1786,1058068,Category:Nautilus,14,"{{Championtip|Nautilus}}{{clr}}This category contains subcategories, pages, and files relating to or featuring the champion {{ci|Nautilus}}.[[de:Kategorie:Nautilus]][[fr:Catégorie:Nautilus]][[pl:K..."
4631,1363127,Nimbus Cloak,0,{{rune header|Nimbus Cloak}}== Notes ==[[File: Nimbus Cloak trigger VFX.png|thumb|200px|Trigger VFX]][[File: Nimbus Cloak MS boost VFX.png|thumb|200px|Movement Speed VFX]]* The movement speed bonu...
6502,1414515,Haunted Relic (Legends of Runeterra),0,"{{Card infobox|01SI007}}{{Card infobox|01SI007T1}}== Trivia ==* Internally, ''Haunted Relic'' is called '''Cursed Talisman'''.* The symbol of {{fi|Blessed Isles}} is featured in the background of ..."
7541,1446046,Rank (Legends of Runeterra),0,"{{Article game navigation}}{{Quote|{{sm2|VO_Ezreal_levelup_03.ogg}} ""I'm so good I surprise myself!""|{{LoR|Ezreal}}}}'''Ranked''' is a [[Constructed (Legends of Runeterra)|Constructed]] game mode ..."
2268,1104236,KDA,0,{{DISPLAYTITLE:KDA (disambiguation)}}{{Disambig|KDA}}Skins:* [[Riot Records/KDA|Riot Records (Universe)/KDA]]** K/DA skins** K/DA All Out skins{{tip|League of Legends}}* [[Kill to Death Ratio]]{{t...
14621,1541530,Category:High definition Maokai ability icons,14,Gallery of {{cis|Maokai}} High definition [[champion ability]] icons from {{tip|League of Legends}}.[[Category:High definition champion ability icons|Maokai]][[Category:Maokai ability icons]]
12290,1512452,Category:TFT augment UI images,14,[[Category:TFT augments]]
104,2919,Null-Magic Mantle,0,{{Item info|goldvalue = |goldefficiency =|similaritems = |notes =|strategy = * {{iis|Null-Magic Mantle}} additional magic resistance can serve to be very valuable in the early phases of the game a...
8273,1464134,Four Beasts (Universe),0,"{{Infobox alternate universe|name = Four Beasts|image= <gallery>Dragonmancers 2021 Promo 01.jpg</gallery>|by=[[Jared Rosen]], [[Cat Manning]], [[Odin Austin Shafer]]|related = Anivia, Ashe, Aureli..."
16439,1596312,Clash of Giants (Legends of Runeterra),0,"{{Card infobox|08FR018}}== Trivia ==* {{LoR|Volibear}} and {{LoR|Rhond, the Magma Serpent}} are featured in the artwork.== Change Log =={| class=""article-table ruling-table""! colspan=""2"" | Clash o..."


In [42]:
df["text"] = df["text"].astype(str)
df["title"] = df["title"].astype(str)

In [43]:
# stats about letters count
df["text"].str.len().describe()

count     17089.000000
mean       3827.348704
std       10497.537494
min           3.000000
25%         143.000000
50%         372.000000
75%        3098.000000
max      531124.000000
Name: text, dtype: float64

In [44]:
# stats about word count
df["text"].str.split().str.len().describe()

count    17089.000000
mean       476.883258
std       1399.983929
min          1.000000
25%         13.000000
50%         39.000000
75%        323.000000
max      82048.000000
Name: text, dtype: float64

In [45]:
df["text"].str.split().str.len().sort_values(ascending=False).head(10)

462      82048
1044     43475
11418    23426
3614     20921
11845    20282
15937    18281
8470     17845
5140     15636
262      15614
190      15596
Name: text, dtype: int64

In [46]:
# clean text and remove html and xml tags

def clean_text(text):

    # get champ before remving {{}}
    # text = re.sub(r'\{\{[^|]*\|([^}]*)\}\}', r'\1', text)

    # Remove {{}} formatting
    text = re.sub(r"\{\{(.*?)\}\}", r"\1", text)
    
    # remove brackets [[item]]
    # text = re.sub(r'\[+([^[\]]+)\]+', r'\1', text)

    # Remove [[]] links
    # text = re.sub(r'\[\[[^]]*\]\]', '', text)
    text = re.sub(r"<ref>.*?</ref>", "", text)
    
    # remove links
    # text = re.sub(r"http[s]?://\S+\s", "", text)

    # Remove <> tags
    text = re.sub(r'<[^>]*>', '', text)
    
    # Remove File: references
    text = re.sub(r'\[\[File:[^\]]*\]\]', '', text)

    # Remove single brackets []
    text = re.sub(r'\[[^]]*\]', '', text)

    # Remove __TOC__ and similar
    text = re.sub(r'__[A-Z]+__', '', text)

    # Remove asterisks
    text = re.sub(r'\*', '', text)
    
    # Remove multiple spaces and newlines
    text = re.sub(r'\s+', ' ', text)
    
    # Remove equal signs
    pattern = r'\=+([^\=]*?)\=+'
    # Replace the matched pattern
    text = re.sub(pattern, r'\1:', text)
    
    # Remove trailing "]"
    text = re.sub(r'[\] ]+$', '', text)
    
    # Trim leading and trailing whitespace
    text = text.strip()

    
    return text

In [47]:
df["full_text"] = df["text"]
df["text"] = df["text"].apply(clean_text)


In [48]:
def split_text_to_multiple_rows(df, text_column, id_column, word_limit, additional_columns):
    new_rows = []
    
    for _, row in df.iterrows():
        text = row[text_column]
        id_prefix = row[id_column]
        words = text.split()
        
        # Split the text into chunks of 'word_limit' words
        chunks = [words[i:i + word_limit] for i in range(0, len(words), word_limit)]
        
        # Create new rows with updated ids and same values for additional columns
        for idx, chunk in enumerate(chunks):
            new_id = f"{id_prefix}-{idx:03d}"
            new_text = ' '.join(chunk)
            new_row = {id_column: new_id, text_column: new_text}
            for col in additional_columns:
                new_row[col] = row[col]
            new_rows.append(new_row)
    
    # Create a new DataFrame from the list of new rows
    new_df = pd.DataFrame(new_rows)
    
    return new_df


In [49]:
# word_limit depend on which model you are using
# optimal chunk size is 1024 token and tokens are about 0.75 word
df = split_text_to_multiple_rows(df, 'text', 'id', word_limit=int(1024*0.75), additional_columns=['title', 'ns', "full_text"])
df

Unnamed: 0,id,text,title,ns,full_text
0,2030-000,"Portal/LOL Cleanup| Game modes section links to ], which is legacy terminology. Game modes section includes TFT but describes TFT as a ""game available through the League Client"" and the ] page des...",League of Legends,0,"{{Portal/LOL}} {{Cleanup| * Game modes section links to [[Classic]], which is legacy terminology. * Game modes section includes TFT but describes TFT as a ""game available through the League Client..."
1,2030-001,main|Summoner's Rift (League of Legends) '''Summoner's Rift''' resembles the ''Defense of the Ancients'' map with three lanes and supports five players per side. The maps put two teams with a fixe...,League of Legends,0,"{{Portal/LOL}} {{Cleanup| * Game modes section links to [[Classic]], which is legacy terminology. * Game modes section includes TFT but describes TFT as a ""game available through the League Client..."
2,2030-002,These items boost stats of the champion. Some items can be combined into more powerful items by following predefined recipes. ;Original Champion Roster Column|3| {{ci|Alistar ci|Amumu ci|Anivia ci...,League of Legends,0,"{{Portal/LOL}} {{Cleanup| * Game modes section links to [[Classic]], which is legacy terminology. * Game modes section includes TFT but describes TFT as a ""game available through the League Client..."
3,2030-003,- IT'S ON Gameplay Trailer| Champions in Season 2021- Dev Video - League of Legends| Esports in Season 2021- Esports - Riot Games| Gameplay in Season 2021 - Dev Video - League of Legends| Skins & ...,League of Legends,0,"{{Portal/LOL}} {{Cleanup| * Game modes section links to [[Classic]], which is legacy terminology. * Game modes section includes TFT but describes TFT as a ""game available through the League Client..."
4,2032-000,":''This is about the tip|League of Legends player-controlled units, and their depiction in the ]. For other uses of '''Champion''', see ]'' ).]] '''Champions''' are the player-controlled character...",Champion,0,"<!--{{Stub|This page needs more on the characteristics champions have, or links to the relevant pages for such things.}}--> :''This is about the {{tip|League of Legends}} player-controlled units, ..."
...,...,...,...,...,...
23033,1631196-000,"PoC item infobox|custom|nameSuccubus's Brand|image:Succubus's Brand LoR relic.png|imagesize200px|type:Relic|desc+1|+1 and when I kill a unit summon a random ].|rarity:Rare Change Log :{| class""art...",Succubus's Brand (The Path of Champions),0,"<!--The temporary, non-official code for this relic is RX001-->{{PoC item infobox|custom|name=Succubus's Brand|image=Succubus's Brand LoR relic.png|imagesize=200px|type=Relic|desc=<nowiki>+1|+1</n..."
23034,1631197-000,"PoC item infobox|custom|nameVoidborne Carapace|image:Voidborne Carapace LoR relic.png|imagesize200px|type:Relic|desc{{tipLoR|EvolveWhen ANY unit dies, grant me its keywords.|rarity:Rare}} Change L...",Voidborne Carapace (The Path of Champions),0,"<!--The temporary, non-official code for this relic is RX002-->{{PoC item infobox|custom|name=Voidborne Carapace|image=Voidborne Carapace LoR relic.png|imagesize=200px|type=Relic|desc={{tipLoR|Evo..."
23035,1631198-000,"PoC item infobox|custom|nameArcane Knowledge|image:06RU043-full.png|imagesize200px|type:Item|descWhen I'm summoned, draw a spell.|rarity:Common|Unitstrue: Change Log {| class:""article-table ruling...",Arcane Knowledge (The Path of Champions),0,"<!--The temporary, non-official code for this item is IX005-->{{PoC item infobox|custom|name=Arcane Knowledge|image=06RU043-full.png|imagesize=200px|type=Item|desc=When I'm summoned, draw a spell...."
23036,1631199-000,"PoC item infobox|custom|nameImmortal|image:08NX028-full.png|imagesize200px|type:Item|desc+1|+0 and {{tipLoR|Deathless.|rarity:Common|Unitstrue}}: Change Log {| class:""article-table ruling-table""! ...",Immortal (The Path of Champions),0,"<!--The temporary, non-official code for this item is IX006-->{{PoC item infobox|custom|name=Immortal|image=08NX028-full.png|imagesize=200px|type=Item|desc=<nowiki>+1|+0</nowiki> and {{tipLoR|Deat..."


In [50]:
# Remove rows with text less than 14 words
display(df[df["text"].str.split().str.len() <= 14])
df = df[df["text"].str.split().str.len() >= 14]

Unnamed: 0,id,text,title,ns,full_text
56,2150-000,General category for ]-related stuff.,Category:Champions,14,General category for [[champion]]-related stuff.[[de:Kategorie:Champions]][[fr:Catégorie:Champions]][[pl:Kategoria:Bohaterowie]][[Category:Gameplay elements]]
57,2155-000,There are five main categories of ''']s''' in ]: ] ] ] ] ]Items,Category:Items,14,__NOTOC__There are five main categories of '''[[item]]s''' in [[League of Legends]]:* [[:Category:Starter items‎|Starter items‎]]* [[:Category:Basic items‎|Basic items‎]]* [[:Category:Epic items‎|...
89,2399-000,These are the many different types of ]s.,Category:Runes,14,These are the many different types of [[rune]]s.[[de:Kategorie:Runen]][[Category:Summoner]]
93,2401-001,match): 95 IP: References :Reflist,Influence Point,0,"{{removed}}Influence Points ({{IP|'''IP'''}}), were one of two currencies used in [[League of Legends]] prior to being replaced by {{BE|Blue Essence}} in [[V7.22]]. Narratively, they were a measur..."
103,2411-000,All the Summoners Stuff goes in here.,Category:Summoner,14,All the Summoners Stuff goes in here.[[Category:Client]]
...,...,...,...,...,...
23028,1631158-000,Section topLoL navigation#invoke:SkinData|skinpage|SmolderScreenshots: Eternals champion eternals|Smolder:Trivia;cst|Smolder|Original ;cst|Smolder|? Clr:References:Reflist|2,Smolder/LoL/Cosmetics,0,{{Section top}}{{LoL navigation}}{{#invoke:SkinData|skinpage|Smolder}}==Screenshots==== Eternals =={{champion eternals|Smolder}}==Trivia==;{{cst|Smolder|Original}}* ;{{cst|Smolder|?}}* {{Clr}}==Re...
23029,1631159-000,Section topLoL navigationSee also:Champions,Smolder/LoL/History,0,{{Section top}}{{LoL navigation}}==See also=={{Champions}}[[Category:Smolder]][[Category:LoL history]]
23030,1631160-000,LoL navigationRelease version:See also:Champions,Smolder/LoL/Patch history,0,{{LoL navigation}}==Release version====See also=={{Champions}}[[Category:Smolder]][[Category:LoL patch history]]
23031,1631161-000,Trivia compilation|Smolder,Smolder/Trivia,0,{{Trivia compilation|Smolder}}[[Category:Smolder]][[Category:Champion trivia]]


In [51]:
df.drop(columns="full_text", inplace=True)
df.to_csv("./data/chunks_data.csv",index=False)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop(columns="full_text", inplace=True)
