<h1>Analiza częstotliwości występowania słów w kilku językach i tekstach</h1>

In [126]:
def lang_confidence_score(word_counts, language_words_with_frequency):
    """Score how closely a text's word counts follow a language's word frequencies.

    The score is the squared cosine similarity between the text's count vector
    and the reference frequency vector, where both vectors are restricted to
    the words that occur in the text. Reference words that never appear in the
    text therefore do not contribute to either the dot product or the norms.

    Args:
        word_counts: Mapping of word -> number of occurrences in the text.
        language_words_with_frequency: Mapping of word -> reference frequency
            for the language. Words missing from it are treated as frequency 0.

    Returns:
        The squared cosine similarity as a number between 0 and 1 (up to
        floating-point rounding). Returns 0.0 when either vector has zero
        norm, i.e. the text is empty or shares no word with the reference
        mapping, instead of dividing by zero.
    """
    sum_squares_art = 0
    sum_squares_lang = 0
    dot_product = 0
    for word, count in word_counts.items():
        # Words unknown to the reference language count as frequency 0.
        count_lang = language_words_with_frequency.get(word, 0)

        sum_squares_art += count ** 2
        sum_squares_lang += count_lang ** 2
        dot_product += count * count_lang

    denominator = sum_squares_art * sum_squares_lang
    if denominator == 0:
        # No overlap (or no words at all): the similarity is undefined, so
        # report the lowest possible score rather than raising.
        return 0.0

    return dot_product ** 2 / denominator

In [135]:
import re
import json
import os
import math
from types import SimpleNamespace
from wordfreq import word_frequency, top_n_list

# wiki_scraper is an optional module: if it cannot be imported, set it to None so the scraping step below is skipped
try:
    import wiki_scraper
except ImportError:
    wiki_scraper = None
    print("Warning: 'wiki_scraper' module not found. Skipping web scraping.")

# Shared state for the rest of the cell:
#   word_counts - text source name -> {word: occurrences}
#   languages   - language codes used both for the local sample texts and for
#                 the reference frequency lists
word_counts = dict()
languages = ["en", "pl", "fr"]

# 1. Wookieepedia article (only when the optional scraper module was imported)
if wiki_scraper:
    # Every option is left unset except count_words, which names the article.
    args = SimpleNamespace(
        summary=None,
        table=None,
        number=None,
        count_words="Obi-wan Kenobi",
        analyze_relative_word_frequency=None,
        mode=None,
        count=None,
        chart=None,
        auto_count_words=None,
        depth=None,
        wait=None,
    )

    file_path = "word-counts.json"
    # Remove output left over from an earlier run so stale counts are never
    # read back below.
    if os.path.exists(file_path):
        os.remove(file_path)

    try:
        # Presumably the manager writes its result to file_path; the file is
        # only read if it exists afterwards.
        manager = wiki_scraper.Manager(args)
        manager.action()

        if os.path.exists(file_path):
            with open(file_path, "r", encoding="utf-8") as f:
                data = json.load(f)
            # A missing "words" key yields an empty mapping rather than an error.
            word_counts["obi-wan"] = data.get("words", {})
    except Exception as e:
        # Best effort: a failed scrape is reported and the analysis continues
        # with the local texts only.
        print(f"Error running wiki_scraper: {e}")

# 2. Count words in the local sample texts (texts/<lang>.txt)
for lang in languages:
    path = f"texts/{lang}.txt"

    # A missing file is reported and recorded as an empty dataset.
    if not os.path.exists(path):
        print(f"Warning: File '{path}' not found. Using empty data.")
        word_counts[lang] = {}
        continue

    with open(path, "r", encoding="utf-8") as f:
        content = f.read()

    # Lower-case before tokenising so counting is case-insensitive; the
    # pattern keeps runs of word characters and drops punctuation.
    words = re.findall(r'\b\w+\b', content.lower(), flags=re.UNICODE)

    # Tally occurrences of each token.
    counts = {}
    for word in words:
        if word in counts:
            counts[word] += 1
        else:
            counts[word] = 1
    word_counts[lang] = counts

# 3. Score every collected text against every reference language
print(f"\n{'K':<6} {'Ref Lang':<10} {'Text Source':<15} {'Consistency Score'}")
print("-" * 50)

for k in [3, 10, 100, 1000]:
    # For this k, map each language code to {word: reference frequency},
    # built from that language's top-k word list.
    words_lang_ref = {
        code: dict((term, word_frequency(term, code)) for term in top_n_list(code, k))
        for code in languages
    }

    for ref_lang in languages:
        ref_data = words_lang_ref[ref_lang]

        for text_source, text_data in word_counts.items():
            # Only texts with at least one counted word are scored.
            if text_data:
                consistency = lang_confidence_score(text_data, ref_data)

                # Row layout: K, reference language, text name, score
                print(f"{k:<6} {ref_lang:<10} {text_source:<15} {consistency:.5f}")

Content from Wookieepedia: Obi-wan Kenobi
Source: https://starwars.fandom.com/wiki/Obi-wan_Kenobi
License: CC BY-SA 3.0

K      Ref Lang   Text Source     Consistency Score
--------------------------------------------------
3      en         obi-wan         0.64666
3      en         en              0.48941
3      en         pl              0.02245
3      en         fr              0.00000
3      pl         obi-wan         0.00032
3      pl         en              0.00082
3      pl         pl              0.36295
3      pl         fr              0.00000
3      fr         obi-wan         0.00000
3      fr         en              0.00000
3      fr         pl              0.00000
3      fr         fr              0.34535
10     en         obi-wan         0.68676
10     en         en              0.70691
10     en         pl              0.09274
10     en         fr              0.00022
10     pl         obi-wan         0.05710
10     pl         en              0.01317
10     pl         pl