In [1]:
import neologdn
import re

In [2]:
def replace_with_regex(text, remove_mention=True, remove_url=True):
    text = text.lower()
    replaced_text = re.sub(r'[【】]', ' ', text)  # 【】の除去
    replaced_text = re.sub(r'[・_!！？?☛]', '', replaced_text)  # ・ の除去
    replaced_text = re.sub(r'[（）()]', ' ', replaced_text)  # （）の除去
    replaced_text = re.sub(r'[［］\[\]]', ' ', replaced_text)  # ［］の除去
    replaced_text = re.sub(r'　', ' ', replaced_text)  # 全角空白の除去
    replaced_text = re.sub(r'[⑤⑥②①③④⑦⑧⑨⑩]', '', replaced_text)
    replaced_text = re.sub(r'\d+', '', replaced_text)  # 数字の除去
    replaced_text = re.sub(r'[/。,、.=]', ' ', replaced_text)  # others
    replaced_text = re.sub(r'[●■]', '', replaced_text)
    if remove_mention:
        replaced_text = re.sub(r'[@＠]\w+', '', replaced_text)  # メンションの除去
    if remove_url:
        replaced_text = re.sub(r'https?:\/\/.*?[\r\n ]', '', replaced_text)  # URLの除去
    return replaced_text

In [3]:
text = 'This Web site uses JavaScript. Please let your browser settings enable JavaScript. Mobile Search Close Please choose the service you want to know Previous Next iPhone SE iPhone 11 iPhone 11 Pro iPad Pro Google Pixel 4a Getting a handset in Japan? Visit a SoftBank shop! We will be happy to help you. For new contr acts (switchover), upgrades, and various applications, please make sure the following information. Find a shop where you can speak English We have shops with full-time English speaking staff. What you need to prepare Information on required items when purchasing a handset or changing subscription content, and more. See our multilingua l catalogs We have（digital）catalogs available in English, Chinese, Spanish, Korean, Vietnamese, and Portuguese. Need help? If you have any questions, please contact o ur customer support. We provide support in English. Products Special English-speaking staff are available at these locations Account and Services Social Media Use socia l media to follow us for up-to-date news and details of our latest activities (Japanese only) Font size Registration number(Telecommunications carrier):No.72 Top of pag e.'

In [4]:
text = neologdn.normalize(str(text))
text = replace_with_regex(text)

In [5]:
text

'this web site uses javascript  please let your browser settings enable javascript  mobile search close please choose the service you want to know previous next iphone se iphone  iphone  pro ipad pro google pixel a getting a handset in japan visit a softbank shop we will be happy to help you  for new contr acts  switchover   upgrades  and various applications  please make sure the following information  find a shop where you can speak english we have shops with full-time english speaking staff  what you need to prepare information on required items when purchasing a handset or changing subscription content  and more  see our multilingua l catalogs we have digital catalogs available in english  chinese  spanish  korean  vietnamese  and portuguese  need help if you have any questions  please contact o ur customer support  we provide support in english  products special english-speaking staff are available at these locations account and services social media use socia l media to follow us

In [8]:
import MeCab 
tagger = MeCab.Tagger("-d /usr/lib/x86_64-linux-gnu/mecab/dic/mecab-ipadic-neologd")

In [9]:
from collections import defaultdict, namedtuple
NOUN_POS = ('名詞', '動詞', '形容詞', '接頭詞',)
STOP_POS = ('記号', 'フィラー',)
NEW_WORD = '固有名詞'

In [20]:
words, new_words = _filter_by_pos(text, NOUN_POS)
# words = correct_words(words, checker)

In [21]:
words

['this',
 'web',
 'site',
 'uses',
 'JavaScript',
 'please',
 'let',
 'your',
 'browser',
 'settings',
 'Enable',
 'JavaScript',
 'mobile',
 'search',
 'CLOSE',
 'please',
 'choose',
 'the',
 'service',
 'YOU',
 'want',
 'TO',
 'know',
 'previous',
 'NeXT',
 'iPhone SE',
 'iPhone',
 'iPhone',
 'pro',
 'iPad Pro',
 'Google Pixel',
 'a',
 'getting',
 'a',
 'handset',
 'in',
 'JAPAN',
 'visit',
 'a',
 'SoftBank',
 'shop',
 'we',
 'WILL',
 'be',
 'HAPPY',
 'TO',
 'help',
 'YOU',
 'for',
 'new',
 'contr',
 'acts',
 'SWITCH',
 'OVER',
 'upgrades',
 'AND',
 'various',
 'applications',
 'please',
 'make',
 'sure',
 'the',
 'FoLLoW',
 'ING',
 'information',
 'find',
 'a',
 'shop',
 'where',
 'YOU',
 'can',
 'speak',
 'english',
 'we',
 'have',
 'shops',
 'WITH',
 'full',
 'time',
 'english',
 'speaking',
 'staff',
 'what',
 'YOU',
 'NEED',
 'TO',
 'prepare',
 'information',
 'ON',
 'required',
 'items',
 'when',
 'purchasing',
 'a',
 'handset',
 'or',
 'changing',
 'subscription',
 'content',
 

In [14]:
def _get_new_word(token):
    if token.reading == '' and token.phonetic == '' and token.pos_detail1 == NEW_WORD and len(token.surface) > 1:
        return token.surface
    return None

def _filter_by_pos(sent, pos):
    base_forms = []
    new_words = []
    for token in _tokenize(sent):
        if token.pos in pos:
            if token.base_form != '*':
                base_forms.append(token.base_form)
            else:
                base_forms.append(token.surface)
            new_word = _get_new_word(token)
            if new_word is not None:
                new_words.append(new_word)
    return base_forms, new_words

def _tokenize(text):
    chunks = tagger.parse(text.rstrip()).splitlines()[:-1]  # Skip EOS
    token = namedtuple('Token', 'surface, pos, pos_detail1, pos_detail2, pos_detail3, '
                                'infl_type, infl_form, base_form, reading, phonetic')
    for chunk in chunks:
        if chunk == '':
            continue
        try:
            surface, feature = chunk.split('\t')
            feature = feature.split(',')
            if len(feature) <= 7:  # 読みがない
                feature.append('')
            if len(feature) <= 8:  # 発音がない
                feature.append('')
            yield token(surface, *feature)
        except:
            print('Exception occurred')


def collect_english_words(raw_texts):
    collector = []
    for items in ' '.join(map(str, raw_texts)).lower().split():
        if bool(re.compile('.*[a-z].*').match(items)):
            clean = re.sub(r'[^a-z]', '', items)
            if len(clean) > 1:
                collector.append(clean)
    # print(collector)
    with open("correction_vocab.txt", 'w') as out:
        out.writelines("%s\n" % vocab for vocab in list(set([x for x in collector if len(x) > 1])))


In [17]:
collect_english_words(text)

In [18]:
import enchant
checker = enchant.DictWithPWL("en","correction_vocab.txt")


In [77]:
words = ['t', 'HIS', 'web', 'SIT', 'e', 'u', 'SES', 'p', 'l', 'ease', 'Ll', 'e', 't', 'r', 's', 'Ett', 'ING', 's', 'SEA', 'r', 'c', 'h', 'p', 'l', 'ease', 'c', 'h', 'o', 'OSE', 'the', 'SE', 'r', 'vice', 'you', 'w', 'ant', 't', 'o', 'k', 'NOW', 'p', 'r', 'evio', 'us', 'p', 'r', 'o', 'a', 'GET', 't', 'ING', 'a', 'h', 'ANDS', 'e', 't', 't', 'n', 'vis', 't', 't', 'a', 'SHO', 'p', 'w', 'e', 'b', 'e', 't', 'o', 'h', 'ELP', 'n', 'e', 'w', 'c', 'o', 'NTR', 'a', 'CTS', 'pl', 'easel', 'sear', 'ch', 'p', 'l', 'ease', 'c', 'ho', 'ose']

In [None]:
correct_words(words, checker)

In [78]:
p = 0
while p < 3: 
    words = correct_words(words, checker)
    if not [x for x in words if len(x) < 2]:
        break
    p += 1

In [79]:
words

['t',
 'his',
 'web',
 'site',
 'uses',
 'pl',
 'lease',
 'll',
 'e',
 'tr',
 'settings',
 'search',
 'hp',
 'lease',
 'choose',
 'these',
 'r',
 'vice',
 'you',
 'want',
 'to',
 'know',
 'previous',
 'pro',
 'a',
 'getting',
 'ting',
 'ah',
 'handset',
 'tn',
 'vis',
 't',
 'ta',
 'shop',
 'we',
 'bet',
 'o',
 'help',
 'new',
 'contra',
 'acts',
 'pl',
 'easel',
 'search',
 'pl',
 'ease',
 'choose']

In [75]:
def correct_words(words, checker):
    q = 2
    words = [x.lower() for x in words]
    correction = defaultdict(list)
    i = 0
    for items in words:
        flag = 1
        if len(items) < 3 and bool(re.compile('[a-z]').match(items)):
            # print(items, '=', i)
            try:
                k = 0
                while k < q:
                    check_word, collect_index = collect_word_index(words, i, q, k)
                    if checker.check(check_word):
                        correction[check_word].append(collect_index)
                        flag = 0
                        break
                    k += 1
                if flag:
                    k = 0
                    while k < q+1:
                        check_word, collect_index = collect_word_index(words, i, q+1, k)
                        if checker.check(check_word):
                            correction[check_word].append(collect_index)
                            flag = 0
                            break
                        k += 1
                if flag:
                    k = 0
                    while k < q+2:
                        check_word, collect_index = collect_word_index(words, i, q+2, k)
                        if checker.check(check_word):
                            correction[check_word].append(collect_index)
                            break
                        k += 1

            except (IndexError, ValueError) as e:
                pass
        i += 1
    return get_corrected_words(correction, words)


In [76]:
def collect_word_index(words, i, j, k):
    check_word = ''
    collect_index = []
    while j > 0:
        index = i - j + k
        if not (index < 0 or index > len(words)): 
            check_word += str(words[i - j + k])
            collect_index.append(i - j + k)
        j -= 1
    return check_word, collect_index


In [19]:
def correct_words(words, checker):
    words = [x.lower() for x in words]
    correction = defaultdict(list)
    i = 0
    for items in words:
        flag = 1
        if len(items) < 3 and bool(re.compile('[a-z]').match(items)):
            # print(items, '=', i)
            try:
                k = 0
                while k < 4:
                    check_word, collect_index = collect_word_index(words, i, 4, k)
                    if checker.check(check_word):
                        correction[check_word].append(collect_index)
                        flag = 0
                        break
                    k += 1
                if flag:
                    k = 0
                    while k < 3:
                        check_word, collect_index = collect_word_index(words, i, 3, k)
                        if checker.check(check_word):
                            correction[check_word].append(collect_index)
                            flag = 0
                            break
                        k += 1
                if flag:
                    k = 0
                    while k < 2:
                        check_word, collect_index = collect_word_index(words, i, 2, k)
                        if checker.check(check_word):
                            correction[check_word].append(collect_index)
                            break
                        k += 1

            except IndexError:
                pass
        i += 1
    words = get_corrected_words(correction, words)
    correction = defaultdict(list)
    i = 0
    for items in words:
        flag = 1
        if len(items) < 3 and bool(re.compile('[a-z]').match(items)):
            # print(items, '=', i)
            try:
                k = 0
                while k < 3:
                    check_word, collect_index = collect_word_index(words, i, 3, k)
                    if checker.check(check_word):
                        correction[check_word].append(collect_index)
                        flag = 0
                        break
                    k += 1
                if flag:
                    k = 0
                    while k < 2:
                        check_word, collect_index = collect_word_index(words, i, 2, k)
                        if checker.check(check_word):
                            correction[check_word].append(collect_index)
                            break
                        k += 1

            except IndexError:
                pass
        i += 1
    return get_corrected_words(correction, words)


def get_corrected_words(correction, words):
    for key, val in correction.items():
        flag = 1
        for items in val[0]:
            if flag:
                words[items] = key
                flag = 0
            else:
                words[items] = ''

    return [x for x in words if x is not '']
