In [1]:
import pandas as pd
import MeCab
import re
import ast
mecab = MeCab.Tagger('-Owakati')
from pathlib import Path
import difflib
# import spacy
# nlp = spacy.load('ja_ginza')
from collections import OrderedDict

In [2]:
corpus = pd.read_csv("very_cleaned_dataset_mobicontrol.csv")

In [522]:
freq_dist = corpus.Data.str.split(expand=True).stack().value_counts()
import json
file = open("/home/iftekhar/amiebot/exp_amiecore/amieCore/amie_core/core/retriever/Page_Ranking_Experiment/pipelines/words_frequency_distribution.json", "w", encoding='utf-8')
json.dump(freq_dist.to_dict(), file, ensure_ascii=False)
file.close()

## TFIDF Analysis

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
context = corpus.Data.values.tolist()
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(context)
#print(vectorizer.get_feature_names())
# ['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', 'this']
# print(X.shape)
# (4, 9)

In [561]:
cvec_counts = vectorizer.transform(context)

In [562]:
occ = np.asarray(cvec_counts.sum(axis=0)).ravel().tolist()
counts_df = pd.DataFrame({'term': vectorizer.get_feature_names(), 'occurrences': occ})
counts_df.sort_values(by='occurrences', ascending=False).head(20)

Unnamed: 0,term,occurrences
1938,ます,98.631002
4996,端末,66.491236
5258,設定,29.337458
2390,サーバ,24.864351
5174,表示,24.288604
922,mobicontrol,23.870831
2958,プロファイル,20.095465
4871,登録,19.552362
2043,アプリ,18.319571
5515,選択,17.460408


In [559]:
import numpy as np
weights = np.asarray(X.mean(axis=0)).ravel().tolist()
weights_df = pd.DataFrame({'term': vectorizer.get_feature_names(), 'weight': weights})
weights_df.sort_values(by='weight', ascending=False).head(20)

Unnamed: 0,term,weight
1938,ます,0.221643
4996,端末,0.149419
5258,設定,0.065927
2390,サーバ,0.055875
5174,表示,0.054581
922,mobicontrol,0.053642
2958,プロファイル,0.045158
4871,登録,0.043938
2043,アプリ,0.041168
5515,選択,0.039237


In [563]:
print('sparse matrix shape:', cvec_counts.shape)
print('nonzero count:', cvec_counts.nnz)
print('sparsity: %.2f%%' % (100.0 * cvec_counts.nnz / (cvec_counts.shape[0] * cvec_counts.shape[1])))

sparse matrix shape: (445, 5690)
nonzero count: 83728
sparsity: 3.31%


## HashTag Generation

In [218]:
cus_ques = query_corpus_processing(
    "/home/iftekhar/amiebot/Resources/amiebot_dataset/support_team_question_pure.csv")
cus_ques.head()  
cus_ques.to_csv("support_teams_query.csv")

In [4]:
with open('/home/iftekhar/amiebot/exp_amiecore/amieCore/amie_core/core/retriever/Page_Ranking_Experiment'
          '/pipelines/vocabulary.txt') as f:
    vocabulary = f.read().splitlines()

In [5]:
def all_substrings(string):
    n = len(string)
    return {string[i:j+1] for i in range(n) for j in range(i,n)}


def query_corpus_processing(corpus):
    cus_ques = pd.read_csv(corpus)
    cus_ques.Question = cus_ques.Question.apply(lambda x: mecab_tokenization(x))
    cus_ques.Question = cus_ques.Question.apply(lambda x: single_character_remover(x))
    cus_ques.Question = cus_ques.Question.apply(lambda x: cleaner(x))
    return cus_ques

def mecab_tokenization(text):
    q = mecab.parse(text)
    q_parts = q.split()
    return ' '.join([word for word in q_parts if not word in get_stop_word_ja()])


def single_character_remover(text):
    collector = []
    for items in text.split():
        if len(items) < 2:
            replaced = re.sub(r'[ぁ-んァ-ン]', '', items)
            replaced = re.sub(r'[A-Za-z]', '', replaced)
            replaced = re.sub(r'[0-9]', '', replaced)
            collector.append(replaced)
        else:
            collector.append(items)

    return ' '.join([temp.strip(' ') for temp in collector])

def cleaner(text):
    collector = []
    for items in text.split():
        cleaned = clean_text(items)
        cleaned = re.sub(r"\s+", '', cleaned)
        if cleaned is not '' or cleaned is not ' ':
            collector.append(clean_text(items))

    return ' '.join(collector)


def clean_text(text):
    replaced = text.replace("\\", "")
    replaced = replaced.replace("+", "")
    replaced = re.sub('_', '', replaced)
    replaced = re.sub('\W+', ' ', replaced)
    replaced = re.sub(r'￥', '', replaced)  # 【】の除去
    replaced = re.sub(r'．', '', replaced)  # ・ の除去
    replaced = re.sub(r'｣', '', replaced)  # （）の除去
    replaced = re.sub(r'｢', '', replaced)  # ［］の除去
    replaced = re.sub(r'～', '', replaced)  # メンションの除去
    replaced = re.sub(r'｜', '', replaced)  # URLの除去
    replaced = re.sub(r'＠', '', replaced)  # 全角空白の除去
    replaced = re.sub(r'？', '', replaced)  # 数字の除去
    replaced = re.sub(r'％', '', replaced)
    replaced = re.sub(r'＝', '', replaced)
    replaced = re.sub(r'！', '', replaced)
    replaced = re.sub(r'｝', '', replaced)
    replaced = re.sub(r'：', '', replaced)
    replaced = re.sub(r'－', '', replaced)
    replaced = re.sub(r'･', '', replaced)
    replaced = re.sub(r'ｔ', '', replaced)
    replaced = re.sub(r'ｋ', '', replaced)
    replaced = re.sub(r'ｄ', '', replaced)
    replaced = re.sub(r'\d+', '', replaced)

    return replaced

def longest_seq_search(query, page_data):
    m = len(query)
    n = len(page_data)
    counter = [[0] * (n + 1) for x in range(m + 1)]
    longest = 0
    lcs_set = set()
    for i in range(m):
        for j in range(n):
            if query[i] == page_data[j]:
                c = counter[i][j] + 1
                counter[i + 1][j + 1] = c
                if c > longest:
                    lcs_set = set()
                    longest = c
                    lcs_set.add(query[i - c + 1:i + 1])
                elif c == longest:
                    lcs_set.add(query[i - c + 1:i + 1])

    return lcs_set

def get_stop_word_ja():
    stop_word_file = Path("/home/iftekhar/AI-system/Helpers/stop_word_ja.txt")
    with open(stop_word_file, encoding='utf-8') as f:
        stop_word_list = f.read().splitlines()
    return stop_word_list

def corpus_split(corpus, sentence_length):
    labels = corpus.PageID.unique()
    lines = []
    all_ids = []
    for i in list(labels):
        text_list = corpus[corpus.PageID == i].Data.values
        split_text = fixed_length_sentence(' '.join(text_list), sentence_length)
        ids = [i] * len(split_text)
        lines += split_text
        all_ids += ids
    split_corpus = pd.DataFrame(zip(lines, all_ids), columns=["Data", "PageID"])
    return split_corpus

def fixed_length_sentence(contents, word_limit):
    contents_list = contents.split()
    end = len(contents_list)
    count = 0
    collector = []
    line = []
    for items in contents_list:
        if count < word_limit - 1 and end > 1:
            collector.append(items)
            count += 1
        else:
            collector.append(items)
            line.append(' '.join(collector))
            collector = []
            count = 0
        end -= 1
    return line


def split_joint_word(text):
    pattern = re.compile("[A-Z]")
    index_saver = []
    start = -1
    while True:
        m = pattern.search(text, start + 1) 
        if m == None:
            break
        start = m.start()
        index_saver.append(start)
    
    sorted_list = sorted(index_saver)
    range_list=list(range(min(index_saver), max(index_saver)+1))
    if sorted_list != range_list:
        temp = 0
        flag = False
        save = []
        for indexes in index_saver:
            if flag: 
                if indexes - temp > 1:
                    save.append(indexes)
                    temp = indexes
            else:
                save.append(indexes)
                temp = indexes
                flag = True

        if len(save) > 1:
            chunk = text[save[0]:save[1]]
            return chunk, single_character_remover(text.replace(chunk, ''))
        else:
            return None, None
    else:
        return None, None

def english_joint_word_handler(text):
    saver = []
    while text:
        temp = text
        chunk, text = split_joint_word(text)
        saver.append(chunk)
    saver.append(temp)
    saver.remove(None)
    if len(saver) < 2:
        saver = []
    return saver

In [372]:
saver

[]

In [384]:
##### Optional argument n (default 3) is the maximum number of close matches to return; 
# Optional argument cutoff (default 0.6) is a float in the range [0, 1]. 

for index, col in cus_ques.iterrows():
    for items in col['Question'].split():
        if (items.find('MobiControl')==-1) and re.match(r'[A-Za-z]', items):
            chunks_words = english_joint_word_handler(items)
#             print(chunks_words)
            if chunks_words:
                for words in chunks_words:
                    end_flag = len(vocabulary)
                    for voc in vocabulary:
                        if words == voc:
            #                 print("matched: ", items)
                            break
                        elif end_flag < 2:
                            doc = nlp(words)
                            for np in doc.noun_chunks:
                                print("Not Matched: ", words, ", Noun detected: ", np)

                            best_matches = difflib.get_close_matches(words, vocabulary, n = 5, cutoff = 0.5)
                            print("Closest", best_matches)

                            longest_content = []
                            for content in best_matches: 
                                longest_content.append(max(all_substrings(content) & all_substrings(items), key=len))
                            max_term = max(longest_content, key=len)
                            print('max_term: ', max_term)
                            #sequences_list = available_sequences(corpus, max_term)
                            #print(sequences_list)

                            # break
                        end_flag -= 1
                    # break            
        else:    
            end_flag = len(vocabulary)
            for voc in vocabulary:
                if items == voc:
    #                 print("matched: ", items)
                    break
                elif end_flag < 2:
                    doc = nlp(items)
                    for np in doc.noun_chunks:
                        print("Not Matched: ", items, ", Noun detected: ", np)

                    best_matches = difflib.get_close_matches(items, vocabulary, n = 5, cutoff = 0.5)
                    print("Closest", best_matches)

                    longest_content = []
                    for content in best_matches: 
                        longest_content.append(max(all_substrings(content) & all_substrings(items), key=len))
                    max_term = max(longest_content, key=len)
                    print('max_term: ', max_term)
                    #sequences_list = available_sequences(corpus, max_term)
                    #print(sequences_list)

                    # break
                end_flag -= 1
            # break
                

Not Matched:  期限切れ , Noun detected:  期限切れ
Closest ['期限', '切れ', '途切れ', '切れる']
max_term:  期限
Not Matched:  Enterise , Noun detected:  Enterise
Closest ['Enterprise', 'Enerprise', 'Entrust', 'OneDrive', 'Internet']
max_term:  Enter


In [34]:
def unique_tag_provider(matched, token_query_word):
    tags = []
    for items in matched:
        for match in re.finditer(r'# (.*) #', items):
            tags.append(items[match.start()+1: match.end()].split('#'))
    all_tag = []
    for tag_chunk in tags:
        for tag in tag_chunk:
            if tag is not '':
                all_tag.append(tag.strip())
    unique_tags = list(OrderedDict.fromkeys(sorted(all_tag, key=all_tag.count, reverse=True)))
    try:
        unique_tags.remove(token_query_word)
    except ValueError:
        pass
    return unique_tags

def query_in_middle_position(text, match):
    chunk = text[match.start() - 20: match.end() + 20]
    chunk_list = chunk.split()
    chunk_list.pop(0)
    chunk_list.pop(-1)
    return chunk_list

def unique_recommended_all_tags(pages_tags):
    suggest_tags = []
    for tags in [x for sublist in pages_tags for x in sublist]:
        if tags:
            suggest_tags.append(tags)
    suggest_tags = list(OrderedDict.fromkeys(sorted(suggest_tags, key=suggest_tags.count, reverse=True)))
    recommended_tags = []
    for items in suggest_tags:
        recommended_tags.append(single_character_remover(items))
    return recommended_tags
        
def query_at_top_at_beginning(text, match):
    chunk = text[match.start(): match.end() + 40]
    chunk_list = chunk.split()
    chunk_list.pop(-1)
    return chunk_list

def longest_match_within_best_matches(best_matches, items):
    longest_content = []
    for content in best_matches: 
        longest_content.append(max(all_substrings(content) & all_substrings(items), key=len))
    return max(longest_content, key=len)

def tag_chunks(front_seq_word, rear_seq_word):
    rear_queue = []
    count = 0 
    for word in rear_seq_word:
        # if re.match(r'[ァ-ン]', word) or re.match(r'[A-Za-z]', word) and count < 3:
        if count < 3:
            rear_queue.append(word)
        else:
            break
        count += 1

    front_queue = []
    count = 0 
    for word in front_seq_word[::-1]:
        # if re.match(r'[ァ-ン]', word) or re.match(r'[A-Za-z]', word) and count < 3:
        if count < 3:
            front_queue.append(word)
        else:
            break
        count += 1
    front_queue.reverse()   
    return front_queue, rear_queue 

def tags_factory(text, match, pattern):
    front_seq_word = text[match.start()-30: match.end()].split()                 
    rear_seq_word = text[match.start(): match.end() + 30].split()
    # print(front_seq_word, rear_seq_word)
    front_queue, rear_queue = tag_chunks(front_seq_word, rear_seq_word)
    return front_queue, rear_queue


def hash_tag_generator(page_corpus, token_query_word):
    unique_tags = []
    pages_tags = []
    collector = []
    token_query_word = token_query_word
    pattern = token_query_word
    for index, col in page_corpus.iterrows():
        matched = []
        text = col['Data']
        
#         multiple = text.split()
#         if len(text) > 1:
#             longest_match_within_best_matches(vocabulary, items)
            
        for match in re.finditer(pattern, text):
            if match:
                if match.start() > 30:
                    chunk_list = query_in_middle_position(text, match)
                    front_queue, rear_queue = tags_factory(text, match, pattern)
                    matched.append(' '.join(chunk_list + ["#"] + rear_queue + ["#"] + front_queue + ["#"]))                    
                else:
                    matched.append(' '.join(query_at_top_at_beginning(text, match)))
        if matched:
            unique_tags = unique_tag_provider(matched, token_query_word)
            collector.append([col['PageID'], len(matched), unique_tags, matched])
        pages_tags.append(unique_tags)
    tags = unique_recommended_all_tags(pages_tags)

    return tags, sorted(collector, key=lambda l:l[1], reverse=True)[:10]

def making_query_collection(query):
    query_parts = query.split()
    question_parts = []
    for i in range(len(query_parts)):
        if len(query_parts) - 1 > i:
            question_parts.append(query_parts[i] + " " + query_parts[i + 1])
            if len(query_parts) - 2 > i:
                question_parts.append(query_parts[i] + " " + query_parts[i + 1] + " " + query_parts[i + 2])
    return question_parts

In [227]:
def load_dictionary():
    file = open("/home/iftekhar/amiebot/exp_amiecore/amieCore/amie_core/core/retriever/Page_Ranking_Experiment/pipelines/vocabulary_synonyms_all.json", "r")
    contents = file.read()
    corpus_dict = ast.literal_eval(contents)
    file.close()
    return corpus_dict


def query_rewritter_replacing_synonyms(single_token_query, corpus):
    # check the synonyms and convert it to base terms
    collector = []
    for items in single_token_query:
        if corpus.find(items) == -1:
            dict_synonyms = getKeysByValue(load_dictionary(), items)
            if dict_synonyms:
                print("Input Terms: ", items, ' uttered in corpus ', dict_synonyms)
                collector.append(' '.join(dict_synonyms))
        else:
            collector.append(items)
    # rewritten_query = ' '.join([x for sublist in collector for x in sublist])
    # print("Your input becomes: ", ' '.join(collector))
    return collector


def handling_spelling_mistakes(question_parts, vocabulary):
    # Assumed user has spelling mistakes
    collector = []
    for items in question_parts:
        best_matches = difflib.get_close_matches(items, vocabulary, n = 5, cutoff = 0.6)
        if best_matches:
            max_term = longest_match_within_best_matches(best_matches, items)
            collector.append(max_term)
            # print("Closest", best_matches)
    return collector


def how_long_query_matched(collector, whole_corpus):
    flag = True
    for items in collector:
        if whole_corpus.find(items) != -1 and flag is True:
            max_matched = items
            flag = False
        elif whole_corpus.find(max_matched + " " + items) != -1:
            max_matched += " " + items
        else:
            not_matched = items
            break
        
    print("Maximum Sequence Matched: ", max_matched)
    return max_matched, not_matched

def unknown_word_sequence_handler(input_query, vocabulary, corpus_dict, corpus):
    not_matched = 0
    whole_corpus = ' '.join(corpus.Data.values)
    single_token_query = input_query.split()
    question_parts = [input_query] + making_query_collection(input_query) + single_token_query
    # print(question_parts)
    collector = query_rewritter_replacing_synonyms(single_token_query, whole_corpus)
    max_matched, not_matched = how_long_query_matched(collector, whole_corpus)
    voc_hints = handling_spelling_mistakes(question_parts, vocabulary)
    print("Vocab hints from corpus: ", list(set(voc_hints)))
    
    tags, details = hash_tag_generator(corpus, max_matched)
    if tags:
        print("Suggestions: ", tags)
        return tags, not_matched
                
def getKeysByValue(dictOfElements, valueToFind):
    listOfKeys = list()
    listOfItems = dictOfElements.items()
    for item in listOfItems:
        for synonyms in item[1]: 
            if synonyms == valueToFind:
                listOfKeys.append(item[0])
    return listOfKeys

In [None]:
corpus_dict = load_dictionary()
while True:
    input_query = input("Type your query: ") 
    if input_query and input_query is not " ":
        tags, details = hash_tag_generator(corpus, input_query)
        if tags:
            print("Suggested Tags: ", tags)
        else:
            # print('Word/Sequence not found ')
            unknown_word_sequence_handler(input_query, vocabulary, corpus_dict, corpus)
            

In [233]:
for index, col in cus_ques.iterrows():
    print("________\nInput Query: ", col['Question'])
    input_query = col['Question']
    tags, details = hash_tag_generator(corpus, input_query)
    if tags:
        print("Suggested Tags: ", tags)
    else:
        # print('Word/Sequence not found ')
        tags, not_matched = unknown_word_sequence_handler(input_query, vocabulary, corpus_dict, corpus)
    
    
    for chunks in tags:
        print(chunks)
        if chunks.find(not_matched) != -1:
            print(chunks)
    break    

________
Input Query:  ウィルス 対策 実施
Maximum Sequence Matched:  ウィルス 対策
Vocab hints from corpus:  ['実施', 'ウィルス', '対策']
Suggestions:  ['ウィルス 対策 定義', 'ウィルス 対策 スキャン', 'ウィルス 対策 検疫', '期間 ウィルス 対策', '説 ウィルス 対策', '社 ウィルス 対策', '表示 ウィルス 対策', 'ウィルス 対策 表', '番号 ウィルス 対策', 'ます ウィルス 対策', 'ウィルス 対策 ソフト', 'ウィルス 対策 端末', 'ウィルス 対策 管理', 'メニュー ウィルス 対策', 'ウィルス 対策 選択', '画面 ウィルス 対策', 'ウィルス 対策 ホワイト', 'ので ウィルス 対策', 'ウィルス 対策 ファイル', 'ない ウィルス 対策', 'ませ ウィルス 対策', 'Enterprise ウィルス 対策', 'ATP ウィルス 対策', 'ステータス ウィルス 対策', 'ID ウィルス 対策', 'バージョン ウィルス 対策', '日時 ウィルス 対策', 'Plus ウィルス 対策', 'ファイル ウィルス 対策', '従って ウィルス 対策', '必ず ウィルス 対策', '攻撃 ウィルス 対策', '検出 ウィルス 対策', 'Protection ウィルス 対策', 'により ウィルス 対策', 'サイト ウィルス 対策', '正常 ウィルス 対策', '問題 ウィルス 対策', '明 ウィルス 対策', 'いる ウィルス 対策', 'SOTI ウィルス 対策', '定期 ウィルス 対策', '端末 ウィルス 対策', '探す ウィルス 対策']
ウィルス 対策 定義
ウィルス 対策 スキャン
ウィルス 対策 検疫
期間 ウィルス 対策
説 ウィルス 対策
社 ウィルス 対策
表示 ウィルス 対策
ウィルス 対策 表
番号 ウィルス 対策
ます ウィルス 対策
ウィルス 対策 ソフト
ウィルス 対策 端末
ウィルス 対策 管理
メニュー ウィルス 対策
ウィルス 対策 選択
画面 ウィルス 対策
ウィルス 対策 ホワイト
ので ウィルス 対策
ウィルス 対策 ファイル
な

In [229]:
not_matched

'実施'

In [None]:
matches = difflib.SequenceMatcher(
    None, string1, string2).get_matching_blocks()
for match in matches:
    print(string1[match.a:match.a + match.size])
