In [1]:
import string
from collections import Counter

import requests as requests
import pandas as pd
from bs4 import BeautifulSoup

response = requests.get('https://minecraft.wiki/enchanting')
print(response.status_code)
soup = BeautifulSoup(response.content, 'html.parser')

print(soup.p.get_text())

200
Enchanting is the process of improving armor, tools, and weapons. A glint animation appears on items to show that they are enchanted.



In [2]:
dfl = pd.read_html(response.content, header=0)
df = dfl[0]
dfstr = df.astype(str)
df

Unnamed: 0,Name,Icon,Usage
0,Enchanting Table,,Used to enchant items.
1,Bookshelf,,Allows the enchanting table to apply higher-le...
2,Lapis Lazuli,,Required to power the enchanting table (up to ...
3,Anvil,,"Used to combine enchanted items (tools, armor,..."
4,Grindstone,,Used to remove all non-curse enchantments on a...
5,Enchanted Book,,Can be combined with another item through an a...


In [3]:
words = (
    pd.Series(dfstr.values.flatten())
    .str.split()
    .explode()
    .value_counts()
    .reset_index()
)

words.columns = ['Word', 'Count']

words = words[words['Word'] != 'nan']
words


Unnamed: 0,Word,Count
0,to,9
1,the,8
2,an,7
4,enchantments,4
5,of,4
...,...,...
101,points,1
102,based,1
103,enchantment,1
104,its,1


In [4]:
divs = soup.find('div', class_='mw-parser-output')
for edit_link in divs.find_all(class_='mw-editsection'):
    edit_link.decompose()

header_to_del = ['Navigation', 'References']

for header_name in header_to_del:
    header = divs.find(id=header_name)
    if header:
        header.decompose()

classes_to_del = ['mw-references-wrap']

for class_name in classes_to_del:
    classes = divs.find_all(class_=class_name)
    for clas in classes:
        clas.decompose()

for navbox in divs.find_all(class_='navbox'):
    navbox.decompose()

text = divs.get_text()

with open('smeltin_test.txt', 'w') as f:
    f.write(text)

# Some more classes to nuke found by llms
#     wrappers_to_nuke = [
#     'mw-references-wrap',
#     'navbox',
#     'infobox',
#     'toc',
#     'mw-editsection',
#     'hatnote',            # "See also:", "Main article:"
#     'searchaux',          # Klasa pomocnicza często występująca z hatnote/msgbox
#     'msgbox',             # Ramki z ostrzeżeniami/informacjami technicznymi
#     'thumb',              # Kontenery z obrazkami i podpisami (opcjonalnie)
#     'figure',             # Znacznik HTML5 dla obrazków (często na Wiki)
#     'catlinks',           # Kategorie na samym dole strony
#     'printfooter',        # "Retrieved from..." na dole
#     'mw-indicators'       # Ikonki w prawym górnym rogu (np. kłódka)
# ]



In [5]:
text = text.lower()

to_del_signs = string.punctuation + string.digits  + '×' + '–' + '⁄' + '\u200c'

for sign in to_del_signs:
    text = text.replace(sign, ' ')

words_list = text.split()

words_list

['enchant',
 'redirects',
 'here',
 'for',
 'enchanting',
 'mechanics',
 'see',
 'enchanting',
 'table',
 'mechanics',
 'for',
 'the',
 'command',
 'see',
 'commands',
 'enchant',
 'for',
 'the',
 'minecraft',
 'dungeons',
 'mechanic',
 'see',
 'mcd',
 'enchantment',
 'enchanted',
 'redirects',
 'here',
 'for',
 'the',
 'featured',
 'server',
 'see',
 'featured',
 'servers',
 '§',
 'list',
 'of',
 'featured',
 'servers',
 'it',
 'has',
 'been',
 'suggested',
 'that',
 'this',
 'page',
 'be',
 'split',
 'into',
 'enchanting',
 'and',
 'enchantment',
 'discuss',
 'if',
 'this',
 'split',
 'affects',
 'many',
 'pages',
 'or',
 'may',
 'potentially',
 'be',
 'controversial',
 'do',
 'not',
 'split',
 'until',
 'a',
 'consensus',
 'has',
 'been',
 'reached',
 'an',
 'enchanting',
 'table',
 'surrounded',
 'by',
 'bookshelves',
 'enchanting',
 'is',
 'the',
 'process',
 'of',
 'improving',
 'armor',
 'tools',
 'and',
 'weapons',
 'a',
 'glint',
 'animation',
 'appears',
 'on',
 'items',
 'to

In [6]:
counter = Counter(words_list)
counter


Counter({'the': 289,
         'of': 170,
         'and': 126,
         'to': 113,
         'a': 101,
         'an': 88,
         'enchantments': 86,
         'level': 86,
         'item': 85,
         'enchantment': 84,
         'be': 77,
         'iii': 74,
         'in': 73,
         'enchanted': 71,
         'enchanting': 63,
         'is': 59,
         'on': 59,
         'no': 59,
         'can': 56,
         'with': 56,
         'items': 52,
         'protection': 50,
         'table': 48,
         'are': 48,
         'yes': 48,
         'for': 47,
         'or': 45,
         'levels': 39,
         'only': 38,
         'by': 36,
         'curse': 35,
         'as': 34,
         'first': 34,
         'that': 33,
         'damage': 33,
         'v': 31,
         'edition': 29,
         'mending': 29,
         'fire': 29,
         'w': 29,
         'experience': 28,
         'unbreaking': 28,
         'if': 26,
         'from': 26,
         'book': 26,
         'iv': 26,
         'ad

In [7]:
import json, os

if os.path.exists('word_counts.json'):
    with open('word_counts.json', 'r', encoding='utf-8') as f:
        old_json = json.load(f)
        old_counter = Counter(old_json)
else:
    old_counter = Counter()

old_counter.update(counter)

with open('word_counts.json', 'w', encoding='utf-8') as f:
    json.dump(counter, f, ensure_ascii=False, indent=4)



In [8]:
from wordfreq import word_frequency, top_n_list
word_frequency('the', 'en')
top_n_list('en', 10)

['the', 'to', 'and', 'of', 'a', 'in', 'i', 'is', 'for', 'that']

In [9]:

normalized_counter = {}

for k, v in counter.items():
    normalized_counter[k] = v / counter.total()

print(counter)
print(normalized_counter)

Counter({'the': 289, 'of': 170, 'and': 126, 'to': 113, 'a': 101, 'an': 88, 'enchantments': 86, 'level': 86, 'item': 85, 'enchantment': 84, 'be': 77, 'iii': 74, 'in': 73, 'enchanted': 71, 'enchanting': 63, 'is': 59, 'on': 59, 'no': 59, 'can': 56, 'with': 56, 'items': 52, 'protection': 50, 'table': 48, 'are': 48, 'yes': 48, 'for': 47, 'or': 45, 'levels': 39, 'only': 38, 'by': 36, 'curse': 35, 'as': 34, 'first': 34, 'that': 33, 'damage': 33, 'v': 31, 'edition': 29, 'mending': 29, 'fire': 29, 'w': 29, 'experience': 28, 'unbreaking': 28, 'if': 26, 'from': 26, 'book': 26, 'iv': 26, 'added': 26, 'anvil': 25, 'player': 25, 'i': 25, 'using': 24, 'vanishing': 23, 'books': 22, 'increases': 22, 'adds': 22, 'arthropods': 21, 'now': 21, 's': 20, 'ii': 20, 'bane': 20, 'smite': 20, 'has': 19, 'speed': 19, 'sharpness': 19, 'enchant': 18, 'this': 18, 'cost': 18, 'when': 18, 'efficiency': 18, 'knockback': 18, 'reduces': 18, 'infinity': 18, 'treated': 18, 'armor': 17, 'maximum': 17, 'thorns': 17, 'it': 16

In [10]:
df = pd.DataFrame.from_dict(normalized_counter, orient='index', columns=['Wiki Frequency'])

df['Language Frequency'] = df.index.map(lambda word: word_frequency(word, 'en'))


df


Unnamed: 0,Wiki Frequency,Language Frequency
enchant,0.002890,3.720000e-07
redirects,0.000482,4.470000e-07
here,0.000642,9.330000e-04
for,0.007545,1.020000e-02
enchanting,0.010114,1.410000e-06
...,...,...
tooltips,0.000161,5.750000e-08
brewing,0.000161,5.750000e-06
getting,0.000161,3.720000e-04
started,0.000161,2.450000e-04


In [11]:
df_top = pd.DataFrame(top_n_list('en', 5), columns=['Word'])
df_top.set_index('Word', inplace=True)
df_top['Language Frequency'] = df_top.index.map(lambda word: word_frequency(word, 'en'))

df_top

Unnamed: 0_level_0,Language Frequency
Word,Unnamed: 1_level_1
the,0.0537
to,0.0269
and,0.0257
of,0.0251
a,0.0229


In [14]:
url_prefix = "https://minecraft.wiki"
links = soup.find_all('a')

found_links = {
        url_prefix + link.get('href')
        for link in soup.find_all('a')
        if link.get('href', '').startswith('/w/')
    }

found_links


{'https://minecraft.wiki/w/1.13',
 'https://minecraft.wiki/w/Air',
 'https://minecraft.wiki/w/Ancient_cities',
 'https://minecraft.wiki/w/Anvil',
 'https://minecraft.wiki/w/Anvil_mechanics',
 'https://minecraft.wiki/w/Aqua_Affinity',
 'https://minecraft.wiki/w/Armor',
 'https://minecraft.wiki/w/Arrow',
 'https://minecraft.wiki/w/Arthropod',
 'https://minecraft.wiki/w/Axe',
 'https://minecraft.wiki/w/Bane_of_Arthropods',
 'https://minecraft.wiki/w/Bartering',
 'https://minecraft.wiki/w/Bastion_remnant',
 'https://minecraft.wiki/w/Bedrock_Edition',
 'https://minecraft.wiki/w/Bedrock_Edition_1.10.0',
 'https://minecraft.wiki/w/Bedrock_Edition_1.16.0',
 'https://minecraft.wiki/w/Bedrock_Edition_1.19.0',
 'https://minecraft.wiki/w/Bedrock_Edition_1.2.13',
 'https://minecraft.wiki/w/Bedrock_Edition_1.21.0',
 'https://minecraft.wiki/w/Bedrock_Edition_1.21.130',
 'https://minecraft.wiki/w/Bedrock_Edition_1.21.132',
 'https://minecraft.wiki/w/Bedrock_Edition_1.21.30',
 'https://minecraft.wiki/w