In [5]:
from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import re
import random

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

import numpy as np
from torch.utils.data import TensorDataset, DataLoader, RandomSampler

# Use the CUDA device when PyTorch reports one as available; otherwise stay on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [2]:
# Vocabulary indices reserved for the start-of-sentence and end-of-sentence markers.
SOS_token, EOS_token = 0, 1

class Lang:
    """Word-level vocabulary for a single language.

    Attributes:
        name: label passed to the constructor.
        word2index: maps each registered word to its integer index.
        word2count: maps each registered word to how often it has been added.
        index2word: reverse mapping; indices 0 and 1 are pre-filled with the
            "SOS" and "EOS" markers.
        n_words: number of entries in ``index2word`` (starts at 2).
    """

    def __init__(self, name):
        self.name = name
        self.word2index = {}
        self.word2count = {}
        self.index2word = {0: "SOS", 1: "EOS"}
        # The two marker entries above already occupy indices 0 and 1.
        self.n_words = 2

    def addSentence(self, sentence):
        """Register every single-space-separated token of ``sentence``."""
        for token in sentence.split(' '):
            self.addWord(token)

    def addWord(self, word):
        """Bump the count of a known word, or assign the next free index to a new one."""
        if word in self.word2index:
            self.word2count[word] += 1
            return

        next_index = self.n_words
        self.word2index[word] = next_index
        self.word2count[word] = 1
        self.index2word[next_index] = word
        self.n_words = next_index + 1

In [27]:
def unicodeToAscii(s):
    """Strip diacritics from ``s`` while keeping the Cyrillic letters й and ё.

    Despite the name, the result is not restricted to ASCII: every base
    character is kept, only combining marks (Unicode category ``Mn``) are
    removed. The letters й/Й/ё/Ё are exempt because their diacritic is part
    of the letter itself.

    Args:
        s: text to clean.

    Returns:
        ``s`` with combining marks removed (for example stress accents),
        and with й/ё always in their single-code-point form.
    """
    excluded_chars = {'й', 'Й', 'ё', 'Ё'}  # Characters to exclude from accent stripping
    pieces = []
    # Compose first: a pre-decomposed "и + breve" or "е + diaeresis" must be
    # recognised as й / ё rather than having its mark stripped below.
    for c in unicodedata.normalize('NFC', s):
        if c in excluded_chars:
            pieces.append(c)
            continue
        # Decompose, then drop the combining marks so they cannot later be
        # mistaken for word separators.
        pieces.extend(
            part
            for part in unicodedata.normalize('NFD', c)
            if unicodedata.category(part) != 'Mn'
        )
    return ''.join(pieces)


def normalizeString(s):
    """Lower-case and clean a sentence down to Cyrillic/Bashkir letters plus ``!`` and ``?``.

    Steps: lower-case and trim, pass through ``unicodeToAscii``, put a space
    before ``.``, ``!`` and ``?``, then replace every run of characters outside
    the allowed set with a single space and trim again.

    Note that periods are padded by the first substitution but are not in the
    allowed set, so they are removed by the second one; only ``!`` and ``?``
    survive as punctuation tokens.

    Args:
        s: raw sentence.

    Returns:
        The cleaned sentence with tokens separated by single spaces.
    """
    s = unicodeToAscii(s.lower().strip())
    # Detach sentence punctuation from the preceding word.
    s = re.sub(r"([.!?])", r" \1", s)
    # Keep Russian and Bashkir letters plus "!" and "?"; everything else
    # (digits, Latin letters, periods, whitespace runs, ...) becomes one space.
    # "ё" is listed explicitly because it lies outside the а-я range.
    s = re.sub(r"[^а-яёҒғӘәҮүҺһҪҫҘҙҢңҠҡӨө!?]+", r" ", s)
    return s.strip()

# Example usage: run normalizeString on a mixed-case sentence that contains
# Russian words, individual upper/lower-case Bashkir letters, and punctuation
# (colon, commas, semicolon, parentheses, final period), then print the result.
input_string = "Это буквы: Ә ә, Ө ө, Ү ү ( гласные ); Ҙ ҙ, Һ һ, Ҫ ҫ, Ң ң, Ҡ ҡ, Ғ ғ ( согласные )."
normalized_string = normalizeString(input_string)
print(normalized_string)

это буквы ә ә ө ө ү ү гласные ҙ ҙ һ һ ҫ ҫ ң ң ҡ ҡ ғ ғ согласные


In [None]:
# Load the parallel corpus. The loop below reads the file in groups of three
# lines and uses the first two lines of each group as a sentence pair; the
# third line of each group is ignored.
with open('text.txt', encoding='utf-8') as corpus_file:  # closes the handle promptly
    lines = corpus_file.read().strip().split('\n')
pairs = []

# Stop at len(lines) - 1 so lines[n + 1] always exists; a dangling last line
# without a partner (or an empty file) is skipped instead of raising IndexError.
for n in range(0, len(lines) - 1, 3):
    pairs.append([normalizeString(lines[n]), normalizeString(lines[n + 1])])
print(pairs[-3:])

In [40]:
def readLangs(lang1, lang2, reverse=False, path='text.txt'):
    """Read the parallel corpus and create one (still empty) ``Lang`` per side.

    The file is read in groups of three lines: the first two lines of each
    group form a sentence pair and the third line is ignored. Both sentences
    are passed through ``normalizeString``.

    Args:
        lang1: name for the language of the first line of each group.
        lang2: name for the language of the second line of each group.
        reverse: when true, swap the two sentences in every pair and swap
            which name becomes the input/output language.
        path: corpus file to read; defaults to ``'text.txt'``.

    Returns:
        A tuple ``(input_lang, output_lang, pairs)`` where ``pairs`` is a list
        of two-element lists of normalized sentences. The ``Lang`` objects are
        only constructed here; no sentences are added to them.
    """
    print("Reading lines...")

    # Read the file and split into lines. The path is used as-is: the previous
    # "'text.txt' % (lang1, lang2)" had no placeholders and always raised
    # TypeError.
    with open(path, encoding='utf-8') as corpus_file:
        lines = corpus_file.read().strip().split('\n')

    # Pair up lines n and n + 1 of every 3-line group and normalize them.
    # The range stops at len(lines) - 1 so a dangling last line without a
    # partner (or an empty file) is skipped rather than raising IndexError.
    pairs = [
        [normalizeString(lines[n]), normalizeString(lines[n + 1])]
        for n in range(0, len(lines) - 1, 3)
    ]

    # Reverse pairs, make Lang instances
    if reverse:
        pairs = [list(reversed(p)) for p in pairs]
        input_lang = Lang(lang2)
        output_lang = Lang(lang1)
    else:
        input_lang = Lang(lang1)
        output_lang = Lang(lang2)

    return input_lang, output_lang, pairs