In [5]:
!pip install transformers pandas openpyxl torch

Collecting transformers
  Downloading transformers-4.51.3-py3-none-any.whl.metadata (38 kB)
Collecting huggingface-hub<1.0,>=0.30.0 (from transformers)
  Downloading huggingface_hub-0.30.2-py3-none-any.whl.metadata (13 kB)
Collecting tokenizers<0.22,>=0.21 (from transformers)
  Downloading tokenizers-0.21.1-cp39-abi3-win_amd64.whl.metadata (6.9 kB)
Collecting safetensors>=0.4.3 (from transformers)
  Downloading safetensors-0.5.3-cp38-abi3-win_amd64.whl.metadata (3.9 kB)
Downloading transformers-4.51.3-py3-none-any.whl (10.4 MB)
   ---------------------------------------- 0.0/10.4 MB ? eta -:--:--
   ---------------------------------------- 0.0/10.4 MB ? eta -:--:--
   ------- -------------------------------- 1.8/10.4 MB 7.2 MB/s eta 0:00:02
   ------------------------------- -------- 8.1/10.4 MB 20.2 MB/s eta 0:00:01
   ---------------------------------------- 10.4/10.4 MB 18.0 MB/s eta 0:00:00
Downloading huggingface_hub-0.30.2-py3-none-any.whl (481 kB)
Downloading safetensors-0.5.3-c


[notice] A new release of pip is available: 24.3 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [6]:
from transformers import BertForMaskedLM, BertTokenizer
import torch
import pandas as pd

# Load KazBERT: choose the compute device first, then fetch the tokenizer and
# the masked-LM weights for the same checkpoint id and move the model there.
# NOTE(review): the recorded run of this cell failed with an OSError saying
# "cimm-kz/KazBERT" is neither a local folder nor a valid model identifier —
# confirm the id (or supply an access token if the repository is private).
model_name = "cimm-kz/KazBERT"
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForMaskedLM.from_pretrained(model_name).to(device)

OSError: cimm-kz/KazBERT is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models'
If this is a private repository, make sure to pass a token having permission to this repo either by logging in with `huggingface-cli login` or by passing `token=<your_token>`

In [None]:
def fix_word_with_kazbert(word, threshold=0.5):
    """Fill every "?" placeholder in ``word`` using the masked-LM ``model``.

    Each "?" is replaced by "[MASK]" and the masked string is run through the
    module-level ``tokenizer`` and ``model``. For every mask, the first of the
    top-5 candidates (highest probability first) whose probability exceeds
    ``threshold`` and which is neither empty nor "[UNK]" is accepted.

    Args:
        word: Text that may contain "?" placeholders. Non-string values
            (e.g. NaN cells coming from ``DataFrame.apply``) are returned
            unchanged.
        threshold: Probability a candidate must exceed to be accepted.

    Returns:
        ``word`` with each confidently predicted "?" replaced by the predicted
        token (the WordPiece "##" prefix is stripped; a token may be longer
        than one character). Placeholders without a confident prediction are
        left as "?".
    """
    # Nothing to repair: non-text cells and words without placeholders.
    if not isinstance(word, str) or "?" not in word:
        return word

    # Replace "?" with [MASK]
    masked_word = word.replace("?", "[MASK]")

    # Tokenize
    inputs = tokenizer(masked_word, return_tensors="pt").to(device)

    # Predict the masked positions
    with torch.no_grad():
        outputs = model(**inputs)

    # Positions of [MASK] in the *token* sequence. These are not character
    # offsets: one token may cover several characters of the word.
    input_ids = inputs["input_ids"][0].tolist()
    mask_positions = [
        position
        for position, token_id in enumerate(input_ids)
        if token_id == tokenizer.mask_token_id
    ]

    # One entry per mask, in order; None marks "no confident prediction" so
    # later predictions can never shift onto an earlier placeholder.
    replacements = []
    for position in mask_positions:
        probs = torch.softmax(outputs.logits[0, position], dim=-1)
        top_k = torch.topk(probs, min(5, probs.shape[-1]))

        chosen = None
        for token_id, prob in zip(top_k.indices.tolist(), top_k.values.tolist()):
            token = tokenizer.decode([token_id])
            if prob > threshold and token not in ["", "[UNK]"]:
                # Strip the WordPiece continuation prefix from the prediction only.
                chosen = token.replace("##", "")
                break
        replacements.append(chosen)

    # Rebuild the word: the k-th "?" corresponds to the k-th [MASK] token.
    pending = iter(replacements)
    pieces = []
    for char in word:
        if char == "?":
            replacement = next(pending, None)
            pieces.append(char if replacement is None else replacement)
        else:
            pieces.append(char)

    return "".join(pieces)

# Usage example: repair a single word that contains two "?" placeholders.
test_word = "С?б?з"
corrected_example = fix_word_with_kazbert(test_word)
print(f"Исходное: {test_word} → Исправленное: {corrected_example}")

In [7]:
# Load the data; the words to repair are expected in a column named "word".
df = pd.read_excel("General_legal.xlsx")

# Fail early and list the columns that are actually present: without this check
# a missing column surfaces only as a bare KeyError: 'word' from the lookup below.
# NOTE(review): the recorded run of this cell hit exactly that KeyError —
# check the sheet's header row / the real column name.
if "word" not in df.columns:
    raise KeyError(
        f"Column 'word' not found in General_legal.xlsx; available columns: {list(df.columns)}"
    )

# Repair every word
df["corrected_word"] = df["word"].apply(fix_word_with_kazbert)

# Save the result
df.to_excel("fixed_kazakh_words.xlsx", index=False)
print("Готово! Исправленные слова сохранены в fixed_kazakh_words.xlsx")

KeyError: 'word'

In [4]:
# Установка библиотек (если нужно)
!pip install python-Levenshtein fuzzywuzzy

# Импорт
from fuzzywuzzy import fuzz

# Sample data (replace with your own)
kazakh_words = ["улица.", "Абай", "Тәуелсіздік", "Құрмет", "Өтеміс"]
russian_words = ["улица", "Абая", "Независимости", "Курмет", "Отемис"]

# Score each pair once with fuzz.ratio and keep the scores, so the average
# below reuses them instead of recomputing every ratio.
similarities = []
for kz, ru in zip(kazakh_words, russian_words):
    similarity = fuzz.ratio(kz, ru)
    similarities.append(similarity)
    print(f"{kz} ↔ {ru}: {similarity}%")

# Average over the pairs that were actually compared: zip() stops at the shorter
# list, so dividing by len(kazakh_words) would be wrong for lists of unequal
# length and would raise ZeroDivisionError for empty input.
avg_similarity = sum(similarities) / len(similarities) if similarities else 0.0
print(f"\nСредний процент схожести: {avg_similarity:.1f}%")

улица. ↔ улица: 91%
Абай ↔ Абая: 75%
Тәуелсіздік ↔ Независимости: 17%
Құрмет ↔ Курмет: 67%
Өтеміс ↔ Отемис: 67%

Средний процент схожести: 63.4%



[notice] A new release of pip is available: 24.3 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip
