In [1]:
import http.client, urllib.parse, json, string, pickle
import getpass, os, re
from time import sleep

import pandas as pd
import numpy as np

from tqdm import tqdm

# Seed NumPy's global random number generator with a fixed value for reproducibility.
np.random.seed(42)

In [2]:
def read_data(datapath, encoding=None):
    """Read a tab-separated ``token<TAB>tag`` file into a list of sentences.

    Every non-blank line must hold exactly one token and one tag separated by
    a tab; a blank line closes the current sentence. Tokens are lower-cased,
    except tokens whose first character is a digit, which are replaced by the
    placeholder ``'#'``.

    A line with any other shape is printed and counted as an error, and the
    remaining lines of that sentence are skipped up to the next blank line.
    The tokens read before the malformed line are still emitted as a
    (shortened) sentence.

    Args:
        datapath: path of the file to read.
        encoding: text encoding passed to ``open``; ``None`` (the default)
            keeps the platform default encoding.

    Returns:
        A list of ``(tokens, tags)`` tuples, one per sentence, where both
        elements are lists of strings of equal length.
    """
    n_errors = 0
    was_error = False
    xy_list = list()
    tokens = list()
    tags = list()

    with open(datapath, encoding=encoding) as f:
        for line in f:
            # Format error handling: after a malformed line, ignore everything
            # up to the blank line that closes the sentence.
            if was_error:
                if line != '\n':
                    continue
                was_error = False

            items = line.strip('\n').strip('\t').split('\t')
            if len(items) == 2:
                token, tag = items
                if token[0].isdigit():
                    # Tokens starting with a digit are collapsed to a placeholder.
                    tokens.append('#')
                else:
                    # if you want static noise, add it here
                    tokens.append(token.lower())
                tags.append(tag)
            elif line == '\n':
                # Blank line: the sentence is complete.
                xy_list.append((tokens, tags,))
                tokens = list()
                tags = list()
            else:
                # Format error handling: report the line and start skipping.
                print(line)
                n_errors += 1
                was_error = True

    # A file that does not end with a blank line would otherwise lose its
    # last sentence, so flush whatever is still pending.
    if tokens:
        xy_list.append((tokens, tags,))

    if n_errors > 0:
        print('Reading is done with {} errors'.format(n_errors))
    return xy_list


In [3]:
def argmax_suggestion(suggestions):
    """Return the index of the suggestion with the highest ``'score'``.

    ``suggestions`` is a sequence of mappings that each carry a ``'score'``
    entry. When several suggestions share the best score the earliest one
    wins; an empty sequence yields index 0.
    """
    best_index, _ = max(
        enumerate(suggestions),
        key=lambda indexed: indexed[1]['score'],
        default=(0, None),
    )
    return best_index

def spellsafe(text, spellchecker_output):
    """Apply the spell checker's one-word-for-one-word corrections to ``text``.

    For every flagged token the highest-scoring suggestion is taken (the first
    one on ties). A correction is only applied when neither the flagged token
    nor the suggestion contains a space, so the number of space-separated
    tokens in ``text`` never changes; skipped corrections are printed.

    Corrections are applied to whole words only (the flagged token must not be
    directly preceded or followed by a word character) and in a single pass,
    so a correction can neither rewrite part of a longer word nor be rewritten
    again by another correction.

    Args:
        text: the text to correct.
        spellchecker_output: decoded JSON response of the spell-check API; it
            must have ``'_type' == 'SpellCheck'`` and a ``'flaggedTokens'``
            list whose items carry ``'token'`` and ``'suggestions'``.

    Returns:
        The corrected text.

    Raises:
        AssertionError: if the response is not a ``SpellCheck`` response
            (the response itself is attached as the assertion message).
    """
    assert '_type' in spellchecker_output, spellchecker_output
    assert spellchecker_output['_type'] == 'SpellCheck', spellchecker_output

    changes = {}
    for token_info in spellchecker_output['flaggedTokens']:
        token = token_info['token']
        suggestions = token_info['suggestions']
        # Nothing to do for a token without any suggestion; an empty token
        # would match everywhere and is ignored as well.
        if not suggestions or not token:
            continue

        # max() keeps the first item among equally scored suggestions.
        suggestion = max(suggestions, key=lambda s: s['score'])['suggestion']
        if ' ' in suggestion or ' ' in token:
            # Splitting or merging words would break the token/tag alignment.
            print('token: ', token, '\tsuggest: ', suggestion)
            continue
        changes[token] = suggestion

    if not changes:
        return text

    # One alternation over all flagged tokens (longest first), anchored so
    # that only stand-alone occurrences are replaced.
    alternatives = '|'.join(
        re.escape(wrong) for wrong in sorted(changes, key=len, reverse=True)
    )
    pattern = re.compile(r'(?<!\w)(?:' + alternatives + r')(?!\w)')
    # A function replacement avoids backslash/group interpretation in the
    # suggestion text.
    return pattern.sub(lambda match: changes[match.group(0)], text)

def untokenize(tokens):
    """Join ``tokens`` back into a single text string.

    A token is appended without a leading space when it is an apostrophe,
    when it directly follows an apostrophe, or when it is found inside
    ``string.punctuation`` (a substring test, so the empty string and a few
    multi-character runs also count). Every other token is preceded by one
    space. Leading spaces of the final text are removed.
    """
    pieces = []
    after_apostrophe = False
    for token in tokens:
        is_apostrophe = token == "'"
        if is_apostrophe or token in string.punctuation or after_apostrophe:
            pieces.append(token)
        else:
            pieces.append(' ' + token)
        # Only a bare apostrophe glues the following token to the text.
        after_apostrophe = is_apostrophe
    return ''.join(pieces).lstrip(' ')

In [69]:
# Send one example sentence to the spell-check endpoint and show the decoded reply.
text = 'Solut à tout!'

data = {'text': text}

# The subscription key is a secret and must not be stored in the notebook:
# take it from the BING_SPELL_CHECK_KEY environment variable, or ask for it
# interactively when the variable is not set.
# NOTE: any key that was ever committed with this notebook should be treated
# as exposed and regenerated.
key = os.environ.get('BING_SPELL_CHECK_KEY') or getpass.getpass(
    'Bing Spell Check subscription key: ')

host = 'api.cognitive.microsoft.com'
path = '/bing/v7.0/spellcheck?'
params = 'mkt=fr-fr&mode=spell'

headers = {'Ocp-Apim-Subscription-Key': key,
'Content-Type': 'application/x-www-form-urlencoded'}

# The headers in the following example 
# are optional but should be considered as required:
#
# X-MSEdge-ClientIP: 999.999.999.999  
# X-Search-Location: lat: +90.0000000000000;long: 00.0000000000000;re:100.000000000000
# X-MSEdge-ClientID: <Client ID from Previous Response Goes Here>

# `conn`, `path`, `params` and `headers` are reused by the correction loop in
# a later cell, so the connection is intentionally left open here.
conn = http.client.HTTPSConnection(host)
body = urllib.parse.urlencode(data)
conn.request ("POST", path + params, body, headers)
response = conn.getresponse()
output = json.loads(response.read())
print(output)

{'_type': 'SpellCheck', 'flaggedTokens': [{'offset': 0, 'token': 'Solut', 'type': 'UnknownToken', 'suggestions': [{'suggestion': 'salut', 'score': 1}]}], 'correctionType': 'High'}


In [63]:
# Spell-check every sentence and collect the corrected (tokens, tags) pairs in `fixed`.
# Reuses `conn`, `path`, `params` and `headers` from the request cell above.
# NOTE(review): `all_data` and `fixed` are not defined in any cell visible here —
# presumably `all_data` is a list of (tokens, tags) pairs such as read_data() returns
# and `fixed` is a list created beforehand; confirm both exist on a fresh kernel.
for tokens, tags in tqdm(all_data):
    # The API is queried with the text rebuilt by untokenize() ...
    data = {'text': untokenize(tokens)}
    body = urllib.parse.urlencode(data)
    conn.request ("POST", path + params, body, headers)
    response = conn.getresponse()
    output = json.loads(response.read())

    # ... but the corrections are applied to the space-joined tokens, so that
    # splitting on ' ' afterwards gives the token list back.
    fixed_text = spellsafe(' '.join(tokens), output)
    fixed_tokens = fixed_text.split(' ')
    # The corrected sentence must still have one token per tag; on failure the
    # raw API response is attached to the AssertionError.
    assert len(fixed_tokens) == len(tags), output

    fixed.append((fixed_tokens, tags))
    # Pause for 10 ms between two requests.
    sleep(1/100.)

  1%|          | 2/377 [00:00<00:57,  6.57it/s]

token:  rera 	suggest:  rer a
token:  qml #uxol75 	suggest:  qmluxol75
token:  rerb 	suggest:  rer b


  1%|          | 4/377 [00:00<00:55,  6.67it/s]

token:  #rer 	suggest:  rer b
token:   stationn 	suggest:  stationner
token:   cite 	suggest:  cite u


  2%|▏         | 9/377 [00:01<00:53,  6.88it/s]

token:  métro4 	suggest:  métro 4
token:  rera 	suggest:  rer a


  4%|▎         | 14/377 [00:02<00:56,  6.48it/s]

token:  ligne1 	suggest:  ligne 1
token:  rera 	suggest:  rer a


  5%|▍         | 18/377 [00:02<00:54,  6.61it/s]

token:  ligne14 	suggest:  ligne 14
token:  ligne13 	suggest:  ligne 13
token:  grouperatp 	suggest:  groupe ratp
token:  ligne10 	suggest:  ligne 10
token:  subwaypeople 	suggest:  subway people


  7%|▋         | 27/377 [00:04<00:53,  6.50it/s]

token:  rerb 	suggest:  rer b


  8%|▊         | 30/377 [00:04<00:52,  6.57it/s]

token:  weekend 	suggest:  week end


  8%|▊         | 31/377 [00:04<00:52,  6.55it/s]

token:  sncfmonopole 	suggest:  sncf monopole


 11%|█▏        | 43/377 [00:06<00:50,  6.56it/s]

token:  rerd 	suggest:  rer d


 12%|█▏        | 45/377 [00:06<00:50,  6.58it/s]

token:  rera 	suggest:  rer a
token:  rera 	suggest:  rer a


 13%|█▎        | 50/377 [00:07<00:49,  6.64it/s]

token:  rera 	suggest:  rer a


 15%|█▍        | 56/377 [00:08<00:48,  6.67it/s]

token:   circulation 	suggest:  circulation
token:  #laverrier 	suggest:  la verriere
token:  #ladefens 	suggest:  la défense


 16%|█▋        | 62/377 [00:09<00:46,  6.71it/s]

token:  rera 	suggest:  rer a


 17%|█▋        | 64/377 [00:09<00:46,  6.72it/s]

token:  lignej 	suggest:  ligne j


 19%|█▉        | 73/377 [00:10<00:45,  6.75it/s]

token:  rerb 	suggest:  rer b


 20%|█▉        | 75/377 [00:11<00:44,  6.74it/s]

token:  rera 	suggest:  rer a


 21%|██        | 79/377 [00:11<00:44,  6.77it/s]

token:  rera 	suggest:  rer a


 21%|██▏       | 81/377 [00:11<00:43,  6.79it/s]

token:  rerb 	suggest:  rer b


 22%|██▏       | 83/377 [00:12<00:43,  6.80it/s]

token:  #ligne 	suggest:  ligne j


 24%|██▍       | 90/377 [00:13<00:42,  6.75it/s]

token:  rerb 	suggest:  rer b


 25%|██▍       | 93/377 [00:13<00:41,  6.76it/s]

token:  rerb 	suggest:  rer b


 26%|██▌       | 97/377 [00:14<00:41,  6.77it/s]

token:   #re 	suggest:  rer b


 27%|██▋       | 102/377 [00:15<00:40,  6.78it/s]

token:  rera 	suggest:  rer a


 28%|██▊       | 105/377 [00:15<00:40,  6.80it/s]

token:  rera 	suggest:  rer a


 30%|██▉       | 113/377 [00:16<00:38,  6.78it/s]

token:  rerb 	suggest:  rer b
token:  saint-rémy 	suggest:  saint rémy


 32%|███▏      | 119/377 [00:17<00:38,  6.78it/s]

token:  rerb 	suggest:  rer b
token:  rerb 	suggest:  rer b


 32%|███▏      | 122/377 [00:17<00:37,  6.79it/s]

token:  rerb 	suggest:  rer b


 33%|███▎      | 123/377 [00:18<00:37,  6.79it/s]

token:   infotrafi 	suggest:  info trafic


 33%|███▎      | 125/377 [00:18<00:37,  6.75it/s]

token:  rerb 	suggest:  rer b
token:  pointgreve 	suggest:  point greve


 35%|███▍      | 131/377 [00:19<00:36,  6.72it/s]

token:  rera 	suggest:  rer a


 35%|███▌      | 133/377 [00:19<00:36,  6.72it/s]

token:   #re 	suggest:  rer d


 39%|███▉      | 147/377 [00:21<00:33,  6.78it/s]

token:  rerb 	suggest:  rer b


 41%|████      | 155/377 [00:22<00:32,  6.79it/s]

token:  rerc 	suggest:  rer c


 46%|████▋     | 175/377 [00:26<00:30,  6.70it/s]

token:  parismontparnasse 	suggest:  paris montparnasse


 47%|████▋     | 177/377 [00:26<00:29,  6.70it/s]

token:  lesgens 	suggest:  les gens


 48%|████▊     | 180/377 [00:26<00:29,  6.71it/s]

token:  rerd 	suggest:  rer d


 49%|████▉     | 185/377 [00:27<00:28,  6.69it/s]

token:  lignej 	suggest:  ligne j


 50%|████▉     | 187/377 [00:27<00:28,  6.70it/s]

token:  rerb 	suggest:  rer b


 52%|█████▏    | 195/377 [00:29<00:27,  6.69it/s]

token:  rerb 	suggest:  rer b
token:  rerb 	suggest:  rer b


 52%|█████▏    | 197/377 [00:29<00:26,  6.69it/s]

token:  rerb 	suggest:  rer b


 53%|█████▎    | 199/377 [00:29<00:26,  6.69it/s]

token:  rerb 	suggest:  rer b


 54%|█████▍    | 203/377 [00:30<00:26,  6.68it/s]

token:  rerc 	suggest:  rer c
token:  rerb 	suggest:  rer b


 54%|█████▍    | 205/377 [00:30<00:25,  6.69it/s]

token:  rerb 	suggest:  rer b


 56%|█████▌    | 211/377 [00:31<00:24,  6.71it/s]

token:  rerb 	suggest:  rer b
token:  stlazare 	suggest:  st lazare
token:  lignel 	suggest:  ligne l
token:  lignej 	suggest:  ligne j


 57%|█████▋    | 214/377 [00:31<00:24,  6.71it/s]

token:  rerb 	suggest:  rer b


 62%|██████▏   | 233/377 [00:34<00:21,  6.70it/s]

token:  rerb 	suggest:  rer b
token:  d' après 	suggest:  d'après
token:  rerb 	suggest:  rer b


 62%|██████▏   | 235/377 [00:35<00:21,  6.70it/s]

token:  rerc 	suggest:  rer c


 64%|██████▍   | 242/377 [00:36<00:20,  6.71it/s]

token:  rera 	suggest:  rer a
token:  lignej 	suggest:  ligne j


 65%|██████▍   | 245/377 [00:36<00:19,  6.72it/s]

token:  rera 	suggest:  rer a


 66%|██████▌   | 247/377 [00:36<00:19,  6.72it/s]

token:  rerc 	suggest:  rer c
token:  rerb 	suggest:  rer b


 67%|██████▋   | 251/377 [00:37<00:18,  6.71it/s]

token:  rerb 	suggest:  rer b
token:  stlazard 	suggest:  st lazare


 67%|██████▋   | 253/377 [00:37<00:18,  6.71it/s]

token:  rera 	suggest:  rer a


 68%|██████▊   | 255/377 [00:38<00:18,  6.69it/s]

token:  rerb 	suggest:  rer b
token:  valdefontenay 	suggest:  val de fontenay


 68%|██████▊   | 258/377 [00:38<00:17,  6.68it/s]

token:  rerb 	suggest:  rer b


 69%|██████▉   | 260/377 [00:38<00:17,  6.68it/s]

token:  rera 	suggest:  rer a


 70%|███████   | 264/377 [00:39<00:16,  6.68it/s]

token:  sudrail 	suggest:  sud rail
token:  saintlazare 	suggest:  saint lazare
token:  timothelefebvre 	suggest:  timothe lefebvre
token:  #tableronderat 	suggest:  table ronde ratp


 72%|███████▏  | 271/377 [00:40<00:15,  6.69it/s]

token:  rera 	suggest:  rer a


 74%|███████▎  | 278/377 [00:41<00:14,  6.70it/s]

token:  lignel 	suggest:  ligne l
token:  versaillesrd 	suggest:  versailles rd
token:  saintlazare 	suggest:  saint lazare
token:   l' heur 	suggest:  l'heure


 75%|███████▌  | 283/377 [00:42<00:14,  6.70it/s]

token:  rerb 	suggest:  rer b


 76%|███████▌  | 285/377 [00:42<00:13,  6.71it/s]

token:  rera 	suggest:  rer a


 77%|███████▋  | 289/377 [00:43<00:13,  6.71it/s]

token:  rera 	suggest:  rer a


 79%|███████▊  | 296/377 [00:44<00:12,  6.71it/s]

token:  rerb 	suggest:  rer b
token:  rerb 	suggest:  rer b


 81%|████████  | 304/377 [00:48<00:11,  6.24it/s]

token:  rera 	suggest:  rer a


 81%|████████▏ | 307/377 [00:49<00:11,  6.25it/s]

token:  rerb 	suggest:  rer b


 83%|████████▎ | 314/377 [00:50<00:10,  6.27it/s]

token:  rera 	suggest:  rer a
token:  rera 	suggest:  rer a


 84%|████████▍ | 316/377 [00:50<00:09,  6.27it/s]

token:  rera 	suggest:  rer a


 84%|████████▍ | 318/377 [00:50<00:09,  6.28it/s]

token:  @rer 	suggest:  rer a


 89%|████████▉ | 335/377 [00:53<00:06,  6.28it/s]

token:  rerb 	suggest:  rer b


 90%|████████▉ | 338/377 [00:53<00:06,  6.28it/s]

token:  rera 	suggest:  rer a


 91%|█████████ | 342/377 [00:54<00:05,  6.29it/s]

token:  infotrafic 	suggest:  info trafic
token:  infotrafic 	suggest:  info trafic


 92%|█████████▏| 346/377 [00:54<00:04,  6.30it/s]

token:  envrac 	suggest:  en vrac


 93%|█████████▎| 349/377 [00:55<00:04,  6.30it/s]

token:  rerb 	suggest:  rer b


 97%|█████████▋| 364/377 [00:57<00:02,  6.31it/s]

token:  rerb 	suggest:  rer b
token:  lignej 	suggest:  ligne j


 98%|█████████▊| 371/377 [00:58<00:00,  6.32it/s]

token:  rerb 	suggest:  rer b
token:  rerd 	suggest:  rer d


100%|█████████▉| 376/377 [00:59<00:00,  6.34it/s]

token:  cagade 	suggest:  cage de
token:  lignej 	suggest:  ligne j


100%|██████████| 377/377 [00:59<00:00,  6.34it/s]
