# Model for correcting misspellea words

This part of the notebook was taken from https://github.com/deepmipt/raai_summer_school_nlp_2021. In this part, you can find the demonstration of how to work with transformers

In [1]:
!pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/fd/1a/41c644c963249fd7f3836d926afa1e3f1cc234a1c40d80c5f03ad8f6f1b2/transformers-4.8.2-py3-none-any.whl (2.5MB)
[K     |████████████████████████████████| 2.5MB 6.7MB/s 
Collecting huggingface-hub==0.0.12
  Downloading https://files.pythonhosted.org/packages/2f/ee/97e253668fda9b17e968b3f97b2f8e53aa0127e8807d24a547687423fe0b/huggingface_hub-0.0.12-py3-none-any.whl
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/75/ee/67241dc87f266093c533a2d4d3d69438e57d7a90abb216fa076e7d475d4a/sacremoses-0.0.45-py3-none-any.whl (895kB)
[K     |████████████████████████████████| 901kB 52.3MB/s 
[?25hCollecting tokenizers<0.11,>=0.10.1
[?25l  Downloading https://files.pythonhosted.org/packages/d4/e2/df3543e8ffdab68f5acc73f613de9c2b155ac47f162e725dcac87c521c11/tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3MB)
[K     

В современной компьютерной лингвистике вычисление вероятности текста производится в основном за счёт нейронных, а не энграммных моделей. Существует много разновидностей архитектур, мы рассмотрим 2: левостороннюю модель  `gpt2` (её облегчённую версию `distilgpt2`) и языковую модель с пропусками `BERT`.

## Односторонние языковые модели

Вначале создадим токенизатор и применим его к данным

In [2]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=665.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1042301.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=456318.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1355256.0, style=ProgressStyle(descript…




In [3]:
text = "The Starship prototype descended under active aerodynamic control, accomplished by four vehicles."
tokenization = tokenizer(text)
print(tokenization)

{'input_ids': [464, 40172, 14879, 23667, 739, 4075, 9551, 34743, 1630, 11, 13013, 416, 1440, 5672, 13], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [4]:
sents = [
    "Yesterday, all my troubles seemed so far away.",
    "I only want to say, if there is a way, take away this cup of poison, 'cause it burns me.",
    "We do not need your education, we do not need your thought control.",
    "When the light begins to change, I sometimes feel a little strange, a little anxious when it's dark."         
]
for elem in tokenizer(sents)["input_ids"]:
    print(elem)

[28065, 11, 477, 616, 14979, 3947, 523, 1290, 1497, 13]
[40, 691, 765, 284, 910, 11, 611, 612, 318, 257, 835, 11, 1011, 1497, 428, 6508, 286, 8764, 11, 705, 25587, 340, 20246, 502, 13]
[1135, 466, 407, 761, 534, 3707, 11, 356, 466, 407, 761, 534, 1807, 1630, 13]
[2215, 262, 1657, 6140, 284, 1487, 11, 314, 3360, 1254, 257, 1310, 6283, 11, 257, 1310, 18116, 618, 340, 338, 3223, 13]


In [5]:
tokenizer.pad_token = tokenizer.eos_token
for elem in tokenizer(sents, return_tensors="pt", padding=True)["input_ids"]:
    print(elem)

tensor([28065,    11,   477,   616, 14979,  3947,   523,  1290,  1497,    13,
        50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
        50256, 50256, 50256, 50256, 50256])
tensor([   40,   691,   765,   284,   910,    11,   611,   612,   318,   257,
          835,    11,  1011,  1497,   428,  6508,   286,  8764,    11,   705,
        25587,   340, 20246,   502,    13])
tensor([ 1135,   466,   407,   761,   534,  3707,    11,   356,   466,   407,
          761,   534,  1807,  1630,    13, 50256, 50256, 50256, 50256, 50256,
        50256, 50256, 50256, 50256, 50256])
tensor([ 2215,   262,  1657,  6140,   284,  1487,    11,   314,  3360,  1254,
          257,  1310,  6283,    11,   257,  1310, 18116,   618,   340,   338,
         3223,    13, 50256, 50256, 50256])


 Загрузим модель на видеокарту.

In [6]:
from transformers import AutoModelWithLMHead, AutoModel

model = AutoModelWithLMHead.from_pretrained("gpt2").to("cuda")
type(model)



HBox(children=(FloatProgress(value=0.0, description='Downloading', max=548118077.0, style=ProgressStyle(descri…




transformers.models.gpt2.modeling_gpt2.GPT2LMHeadModel

In [20]:
import torch
from torch import LongTensor

Теперь посмотрим, насколько модель знает грамматику.

In [22]:
 texts = [
     "Alexandra is very proud of herself.", "Alexandra is very proud of himself.",
     "Alexander is very proud of herself.", "Alexander is very proud of himself.",
     "Alexandra is very proud of she.", "Alexandra is very proud of her.",
     "Alexandra is very proud of her son."
 ]
 tokenizer.pad_token = tokenizer.eos_token
 batch = tokenizer(texts, return_tensors="pt", padding=True).to("cuda")
 # добавляем индекс начала строки (склейка массивов по первой координате)
 batch["input_ids"] = torch.cat([
    torch.ones_like(batch["input_ids"][:,:1])*tokenizer.bos_token_id, 
    batch["input_ids"]
 ], dim=1)
#  batch["attention_mask"] = torch.cat([
#     torch.ones_like(batch["attention_mask"][:,:1]),
#     batch["attention_mask"]
#  ], dim=-1)
 with torch.no_grad():
    logits = model(batch["input_ids"])["logits"]
 probs = torch.softmax(logits, dim=-1).cpu().numpy()
 print(probs.shape)

(7, 10, 50257)


In [23]:
for i, text in enumerate(texts):
    print(text)
    text_token_ids = batch["input_ids"][i,1:]
    text_tokens = [x.strip("ĠĊ") for x in tokenizer.convert_ids_to_tokens(text_token_ids)]
    for j, (index, token) in enumerate(zip(text_token_ids, text_tokens)):
        print(f"{token}:{probs[i,j,index]:.3f}", end=" ")
    print("")

Alexandra is very proud of herself.
Alex:0.000 andra:0.050 is:0.014 very:0.004 proud:0.028 of:0.579 herself:0.040 .:0.215 <|endoftext|>:0.002 
Alexandra is very proud of himself.
Alex:0.000 andra:0.050 is:0.014 very:0.004 proud:0.028 of:0.579 himself:0.002 .:0.213 <|endoftext|>:0.002 
Alexander is very proud of herself.
Alexander:0.000 is:0.011 very:0.004 proud:0.030 of:0.679 herself:0.001 .:0.205 <|endoftext|>:0.002 <|endoftext|>:0.000 
Alexander is very proud of himself.
Alexander:0.000 is:0.011 very:0.004 proud:0.030 of:0.679 himself:0.020 .:0.199 <|endoftext|>:0.002 <|endoftext|>:0.000 
Alexandra is very proud of she.
Alex:0.000 andra:0.050 is:0.014 very:0.004 proud:0.028 of:0.579 she:0.001 .:0.002 <|endoftext|>:0.008 
Alexandra is very proud of her.
Alex:0.000 andra:0.050 is:0.014 very:0.004 proud:0.028 of:0.579 her:0.356 .:0.005 <|endoftext|>:0.003 
Alexandra is very proud of her son.
Alex:0.000 andra:0.050 is:0.014 very:0.004 proud:0.028 of:0.579 her:0.356 son:0.009 .:0.165 


## Writing model for correcting misspelled words

Let`s load library with English dictionary


In [10]:
!apt install -qq enchant
!pip install pyenchant

The following additional packages will be installed:
  aspell aspell-en dictionaries-common emacsen-common hunspell-en-us
  libaspell15 libenchant1c2a libhunspell-1.6-0 libtext-iconv-perl
Suggested packages:
  aspell-doc spellutils wordlist hunspell openoffice.org-hunspell
  | openoffice.org-core libenchant-voikko
The following NEW packages will be installed:
  aspell aspell-en dictionaries-common emacsen-common enchant hunspell-en-us
  libaspell15 libenchant1c2a libhunspell-1.6-0 libtext-iconv-perl
0 upgraded, 10 newly installed, 0 to remove and 39 not upgraded.
Need to get 1,310 kB of archives.
After this operation, 5,353 kB of additional disk space will be used.
Preconfiguring packages ...
Selecting previously unselected package libtext-iconv-perl.
(Reading database ... 160837 files and directories currently installed.)
Preparing to unpack .../0-libtext-iconv-perl_1.7-5build6_amd64.deb ...
Unpacking libtext-iconv-perl (1.7-5build6) ...
Selecting previously unselected package libaspe

Here we write the function for finding misspelled words. If you want to correct not only misprint but also grammatical errors you should comment string (`if not d.check(word_list[i]):`). Unfortunately if you use the free version of colab commenting on the string lead to 'RUNTIME ERROR'

In [11]:
import enchant
#function for finding misspeled word in sentence
def find_misspeled(sentance):
  word_list = sentance.split(' ')
  misspelled_words = dict()
  #print(len(word_list))
  for i in range(len(word_list)):
    if not d.check(word_list[i]):
      misspelled_words[word_list[i]] = i
  return misspelled_words

In [12]:
#check fuction
### check the useful function in package enchant
#load english dictionary
d = enchant.Dict("en_US")
#find misspelled word
find_misspeled('He is intelligen')

{'intelligen': 2}

In [13]:
#generate possible words
d.suggest('intelligen')

['intelligent', 'intelligence', 'intelligible', 'intelligibly', 'belligerent']

Here we write a function for generating sentences with possible correct words

In [14]:
def generate_correction(sentance):
  sentance_list = [sentance]
  possible_correction_list = []
  word_dict = find_misspeled(sentance)
  
  for word in word_dict.keys():
    for sent in sentance_list:
      word_list = sent.split(' ')
      for correction_word in d.suggest(word):
        new_sentance = ' '.join(word_list[:word_dict[word]] + [correction_word] + word_list[word_dict[word]+1:])
        possible_correction_list  += [new_sentance]
    #print(sentance_list, word)
    sentance_list = possible_correction_list
    possible_correction_list = []
    #print(sentance_list)
  return sentance_list

Let`s look at the examples

In [15]:
 #check function
 generate_correction('He is intelligen')

['He is intelligent',
 'He is intelligence',
 'He is intelligible',
 'He is intelligibly',
 'He is belligerent']

In [16]:
#check function
generate_correction('He liks intelligen peaple')

['He leeks intelligent people',
 'He leeks intelligent Peale',
 'He leeks intelligent leaper',
 'He leeks intelligent apple',
 'He leeks intelligent appeal',
 'He leeks intelligence people',
 'He leeks intelligence Peale',
 'He leeks intelligence leaper',
 'He leeks intelligence apple',
 'He leeks intelligence appeal',
 'He leeks intelligible people',
 'He leeks intelligible Peale',
 'He leeks intelligible leaper',
 'He leeks intelligible apple',
 'He leeks intelligible appeal',
 'He leeks intelligibly people',
 'He leeks intelligibly Peale',
 'He leeks intelligibly leaper',
 'He leeks intelligibly apple',
 'He leeks intelligibly appeal',
 'He leeks belligerent people',
 'He leeks belligerent Peale',
 'He leeks belligerent leaper',
 'He leeks belligerent apple',
 'He leeks belligerent appeal',
 'He ilks intelligent people',
 'He ilks intelligent Peale',
 'He ilks intelligent leaper',
 'He ilks intelligent apple',
 'He ilks intelligent appeal',
 'He ilks intelligence people',
 'He ilks 

In [24]:
 texts = generate_correction('He liks intelligen peaple')
 tokenizer.pad_token = tokenizer.eos_token
 batch = tokenizer(texts, return_tensors="pt", padding=True).to("cuda")
 # добавляем индекс начала строки (склейка массивов по первой координате)
 batch["input_ids"] = torch.cat([
    torch.ones_like(batch["input_ids"][:,:1])*tokenizer.bos_token_id, 
    batch["input_ids"]
 ], dim=1)
#  batch["attention_mask"] = torch.cat([
#     torch.ones_like(batch["attention_mask"][:,:1]),
#     batch["attention_mask"]
#  ], dim=-1)
 with torch.no_grad():
    logits = model(batch["input_ids"])["logits"]
 probs = torch.softmax(logits, dim=-1).cpu().numpy()
 print(probs.shape)

(250, 9, 50257)


In [27]:
import math

In [28]:
 texts = generate_correction('He is intelligen')
 tokenizer.pad_token = tokenizer.eos_token
 batch = tokenizer(texts, return_tensors="pt", padding=True).to("cuda")
 # добавляем индекс начала строки (склейка массивов по первой координате)
 batch["input_ids"] = torch.cat([
    torch.ones_like(batch["input_ids"][:,:1])*tokenizer.bos_token_id, 
    batch["input_ids"]
 ], dim=1)
#  batch["attention_mask"] = torch.cat([
#     torch.ones_like(batch["attention_mask"][:,:1]),
#     batch["attention_mask"]
#  ], dim=-1)
 with torch.no_grad():
    logits = model(batch["input_ids"])["logits"]
 probs = torch.softmax(logits, dim=-1).cpu().numpy()
 print(probs.shape)
 
 for i, text in enumerate(texts):
    print(text)
    text_token_ids = batch["input_ids"][i,1:]
    text_tokens = [x.strip("ĠĊ") for x in tokenizer.convert_ids_to_tokens(text_token_ids)]
    for j, (index, token) in enumerate(zip(text_token_ids, text_tokens)):
        if token != '<|endoftext|>':
          print(f"{token}:{math.log10(probs[i,j,index]):.3f}", end=" ")
    print("")

(5, 5, 50257)
He is intelligent
He:-2.584 is:-1.320 intelligent:-4.069 
He is intelligence
He:-2.584 is:-1.320 intelligence:-5.639 
He is intelligible
He:-2.584 is:-1.320 intellig:-6.183 ible:-1.429 
He is intelligibly
He:-2.584 is:-1.320 intellig:-6.183 ibly:-2.365 
He is belligerent
He:-2.584 is:-1.320 bellig:-5.047 erent:-0.000 


Here we write code that for each subtokens in a sentence shows its logarithm of possibility to appear in the sentence

In [29]:

for i, text in enumerate(texts):
    print(text)
    text_token_ids = batch["input_ids"][i,1:]
    text_tokens = [x.strip("ĠĊ") for x in tokenizer.convert_ids_to_tokens(text_token_ids)]
    for j, (index, token) in enumerate(zip(text_token_ids, text_tokens)):
        if token != '<|endoftext|>':
          print(f"{token}:{math.log10(probs[i,j,index]):.3f}", end=" ")
    print("")

He is intelligent
He:-2.584 is:-1.320 intelligent:-4.069 
He is intelligence
He:-2.584 is:-1.320 intelligence:-5.639 
He is intelligible
He:-2.584 is:-1.320 intellig:-6.183 ible:-1.429 
He is intelligibly
He:-2.584 is:-1.320 intellig:-6.183 ibly:-2.365 
He is belligerent
He:-2.584 is:-1.320 bellig:-5.047 erent:-0.000 


So to find our correct sentance we should sum all logarithm of the possibility of each subtokens unless '<|endoftext|>' and find sentences with the max possibility

In [30]:
def generate_possible_sentence(sentance, best_three = False):
  texts = generate_correction(sentance)
  tokenizer.pad_token = tokenizer.eos_token
  batch = tokenizer(texts, return_tensors="pt", padding=True).to("cuda")
  # добавляем индекс начала строки (склейка массивов по первой координате)
  batch["input_ids"] = torch.cat([
      torch.ones_like(batch["input_ids"][:,:1])*tokenizer.bos_token_id, 
      batch["input_ids"]
  ], dim=1)
  #  batch["attention_mask"] = torch.cat([
  #     torch.ones_like(batch["attention_mask"][:,:1]),
  #     batch["attention_mask"]
  #  ], dim=-1)
  with torch.no_grad():
      logits = model(batch["input_ids"])["logits"]
  probs = torch.softmax(logits, dim=-1).cpu().numpy()
  #print(probs.shape)
  sentence_probability = dict()
  for i, text in enumerate(texts):
      #print(text)
      s = 1
      
      text_token_ids = batch["input_ids"][i,1:]
      text_tokens = [x.strip("ĠĊ") for x in tokenizer.convert_ids_to_tokens(text_token_ids)]
      for j, (index, token) in enumerate(zip(text_token_ids, text_tokens)):
          if token != '<|endoftext|>':
            #print(probs[i,j,index], j)
            s *= probs[i,j,index]
      sentence_probability[text] = s
      #print(s)
      #print("")
  markdict = sentence_probability
  marklist = sorted(markdict.items(), key=lambda x:x[1], reverse=True)
  sortdict = dict(marklist)
  a = list(sortdict.keys())
  print(a[0], math.log10(sortdict[a[0]]))
  if best_three:
    if len(a) > 1:
      print(a[1], math.log10(sortdict[a[1]]))
    if len(a) > 2:
      print(a[2], math.log10(sortdict[a[2]]))

let's look on example

In [31]:
generate_possible_sentence('He is intelligen')

He is intelligent -7.973009680687585


In [32]:
generate_possible_sentence('He liks intelligen peapel')

He likes intelligent appeal -14.8900343748022


In [None]:
generate_possible_sentence('He likes intelligent peapel. They are really smart')

In [None]:
generate_possible_sentence('He was very hard person. H makes people angry')

In [None]:
generate_possible_sentence('He was very hard person. Hi makes people angry')

the number means the logarithm of the probability of the given right to offer

let's look on example of best three sentances

In [33]:
generate_possible_sentence('He liks intelligen poeple', best_three = True)

He likes intelligent people -10.859718182047587
He like intelligent people -12.039852992837014
He likes intelligence people -12.702763681423061


In [34]:
generate_possible_sentence('Nice to mete you')

Nice to mete you -14.887436033400595


In [35]:
generate_possible_sentence('Nice to ment you', best_three = True)

Nice to meet you -6.23056891547142
Nice to met you -9.22511285645124
Nice to sent you -11.983995502819349


In [36]:
generate_possible_sentence('Where are you from')

Where are you from -5.822051746623529


In [37]:
generate_possible_sentence('Where are you frm', best_three = True)

Where are you from -5.822049168133126
Where are you fro -9.984711659688703
Where are you fr -10.156299643015004


In [60]:
generate_possible_sentence('Where are youfrom', best_three = True)

Where are you from -5.822044339177939
Where are you-from -11.70946646505008
Where are froufrou -13.452859386399195


In [64]:
generate_possible_sentence('Excuseme, sir, you dropped your wallet', best_three = True)

Excused sir you dropped your wallet -20.51674264270373
Excused Sir you dropped your wallet -21.25309790472316
Excused sirs you dropped your wallet -22.079111641811714


In [38]:
generate_possible_sentence('H likes Mary', best_three = True)

H likes Mary -12.756452390134115


In [63]:
generate_possible_sentence('Helikes Mary', best_three = True)

Likes Mary -9.066864499924055
He likes Mary -9.947937511564978
Helices Mary -13.30369145744321


In [None]:
generate_possible_sentence('Helikes Mary', best_three = True)

In [57]:
generate_possible_sentence('Wher are you from')

Where are you from -5.822059694891647


In [59]:
generate_possible_sentence('Where are yu from')

Where are you from -5.822052646721279


In [58]:
generate_possible_sentence('Wher is you from')

Where is you from -8.00890321694338


In [56]:
generate_possible_sentence('Wher ar you from')

Where are you from -5.822054530763859


In [55]:
generate_possible_sentence('Excuce me, sar, you dropped your wallet')

Excuse me Sara you dropped your wallet -22.20794236369435


In [54]:
generate_possible_sentence('I dod not invit you to the pary.')

I did not invite you to the party -11.473779448857929


In [65]:
generate_possible_sentence('Idid not invit you to the pary.')

Did not invite you to the party -13.3637631279328


In [66]:
generate_possible_sentence('I didnot invit you to the pary.')

I didn't invite you to the party -10.971554858634994


In [67]:
generate_possible_sentence('I did not inviteyou to the pary.')

I did not invitee to the party -16.466447650841918


In [68]:
generate_possible_sentence('I did not invite you to theparty.')

I did not invite you to the party -11.47376431090114


In [69]:
generate_possible_sentence('I did not invite you tothe party.')

I did not invite you to the party. -12.056345813394588


In [71]:
generate_possible_sentence('I m reallysorry')

I m really sorry -10.48647205781006


In [39]:
generate_possible_sentence('Im really sory')

I'm really sorry -5.891609285081461


In [52]:
generate_possible_sentence('Wht do youthink?')

What do outhitting -13.863678994758327


In [50]:
generate_possible_sentence('Oh, never mins')

Oh never mind -7.874431643460552


In [51]:
generate_possible_sentence('Oh, never mins')

Oh never mind -7.874431643460552


If you have this error it means that there is no enough  memory on colab and you should restart everything:(

In [49]:
generate_possible_sentence('Im Masha')

In Mash -7.192526356863372


In [40]:
generate_possible_sentence('Im Jack')

In Jack -6.849314044971794


In [41]:
generate_possible_sentence('Im Jack. I have a son')

I'm Jack. I have a son -13.190372789967247


In [48]:
generate_possible_sentence('Im Pol. I have a son')

I'm Pol. I have a son -16.118025857377518


In [47]:
generate_possible_sentence('Im seven years old')

I'm seven years old -7.590789444221681


In [46]:
generate_possible_sentence('Thaks so muc for the birthdays money.')

Thanks so much for the birthdays money. -17.193170857781922


In [45]:
generate_possible_sentence('I really appreiate yor help.')

I really appreciate your help. -8.536285601209237


GPT2 does not know my name

In [42]:
generate_possible_sentence('I am Masha')

I am Tasha -8.588813247887964


In [43]:
generate_possible_sentence('I am Maria')

I am Maria -7.988472099562584


In [44]:
generate_possible_sentence('Input arrays to be multiplid.')


Input arrays to be multiplied -15.16632472214808
