In [23]:
import unicodedata

In [27]:
def _is_japanese(text):
    for ch in text:
        try:
            name = unicodedata.name(ch)
            print(name)
            if "HIRAGANA" in name \
                    or "KATAKANA" in name:
                return True
        except Exception as e:
            print(str(e))
    return False

In [30]:
_is_japanese('有給休暇、週休二日制、週休二日制の概念')

CJK UNIFIED IDEOGRAPH-6709
CJK UNIFIED IDEOGRAPH-7D66
CJK UNIFIED IDEOGRAPH-4F11
CJK UNIFIED IDEOGRAPH-6687
IDEOGRAPHIC COMMA
CJK UNIFIED IDEOGRAPH-9031
CJK UNIFIED IDEOGRAPH-4F11
CJK UNIFIED IDEOGRAPH-4E8C
CJK UNIFIED IDEOGRAPH-65E5
CJK UNIFIED IDEOGRAPH-5236
IDEOGRAPHIC COMMA
CJK UNIFIED IDEOGRAPH-9031
CJK UNIFIED IDEOGRAPH-4F11
CJK UNIFIED IDEOGRAPH-4E8C
CJK UNIFIED IDEOGRAPH-65E5
CJK UNIFIED IDEOGRAPH-5236
HIRAGANA LETTER NO


True

In [33]:
import re


def cjk_detect(texts):
    # korean
    if re.search("[\uac00-\ud7a3]", texts):
        return "ko"
    # japanese
    if re.search("[\u3040-\u30ff]", texts):
        return "ja"
    # chinese
    if re.search("[\u4e00-\u9FFF]", texts):
        return "zh"
    return None


def test_cjk_detect():
    # Simplified Chinese
    assert cjk_detect(
        "2009年，波音公司(Boeing)在查尔斯顿附近的新厂破土动工时，曾宣扬这里是最先进的制造中心"
        "，将制造一款世界上最先进的飞机。但在接下来的十年里，这家生产787梦想客机的工厂一直受到做"
        "工粗糙和监管不力的困扰，危及航空安全。") == "zh"
    # Traditional Chinese
    assert cjk_detect(
        "北查爾斯頓工廠的安全漏洞已經引起了航空公司和監管機構的密切關注。") == "zh"
    # Japanese
    assert cjk_detect(
        "日産自動車は24日、2019年3月期の連結業績予想を下方修正した。") == "ja"
    # Korean
    assert cjk_detect(
        "투서로 뜨고 투서에 지나") == "ko"
    # Korean with a Chinese character
    assert cjk_detect(
        "北 외무성 간부 총살설 주민들 사이서 확산…하노이 회담 실패 때문") == "ko"


def print_incorrect_cases():
    # Japanese
    texts = "日産自動車、営業益45%減　前期下方修正"
    texts = '有給休暇、週休二日制、週休二日制'
    print(texts, "expected: ja actual:", cjk_detect(texts))
    # Traditional Chinese with Japanese hiragana
    texts = "健康の油切 好吃の涼麵"
    print(texts, "expected: zh actual:", cjk_detect(texts))
    # Traditional Chinese with Japanese katakana punctuation
    texts = "鐵腕・都鐸王朝（五）：文藝復興最懂穿搭的高富帥——亨利八世"
    print(texts, "expected: zh actual:", cjk_detect(texts))


if __name__ == "__main__":
    # Correct cases
    test_cjk_detect()
    # Incorrect cases
    print_incorrect_cases()

有給休暇、週休二日制、週休二日制 expected: ja actual: zh
健康の油切 好吃の涼麵 expected: zh actual: ja
鐵腕・都鐸王朝（五）：文藝復興最懂穿搭的高富帥——亨利八世 expected: zh actual: ja


In [9]:
import cld3

In [22]:
lang = cld3.get_language('有給休暇、週休二日制、週休二日制')[0]
lang

'zh'

In [None]:
#t= "有給休暇、週休二日制、週休二日制"
t="Yes I"
#t="有給休暇"
#t = '防水層'

In [59]:
import _pickle as pickle

In [60]:
def load_from_file(file_path):
    with open(file_path, "rb") as input_file:
        data = pickle.load(input_file)
    return data

In [64]:
word_dic = load_from_file('/home/iftekhar/2343/model/retrieve_word_dic.pk')

In [65]:
[x for x in word_dic.keys() if x in '大容量プラン']

['プラン', '容量', '量', 'ラン', 'ン', '', '大', '容', 'ラ', 'プラ', 'プ']

In [42]:
import pandas as pd

In [52]:
df = pd.read_csv('/home/iftekhar/WebMaster/app/faq/data/nlu_dict_raw_data_frame.csv')

In [53]:
df.head()

Unnamed: 0.1,Unnamed: 0,query,tag
0,1019,That is as good as I expected.,praise
1,705,can you connect me to sales,contact_staffs
2,389,oh are you chatbot?,ask_isbot
3,1077,How creative,praise
4,393,what is the time in Tokyo?,ask_time


In [45]:
len(df['query'])

1238

In [46]:
text = ['abcd']

In [50]:
x = len(df) * ['abcd']

In [58]:
from flashtext import KeywordProcessor
keyword_processor = KeywordProcessor()
# keyword_processor.add_keyword(<unclean name>, <standardised name>)
keyword_processor.add_keyword('Big Apple')
keyword_processor.add_keyword('Bay Area')
keywords_found = keyword_processor.extract_keywords('I love bigApple and Bay Area.')
keywords_found

['Bay Area']