In [20]:
from elasticsearch import Elasticsearch
from elasticsearch import helpers

In [21]:
# Client for the Elasticsearch node at localhost:9200.
# http_compress=True turns on HTTP compression in the client (per elasticsearch-py).
es = Elasticsearch("localhost:9200", http_compress=True)

In [23]:
# Show the column names of the loaded frame as a plain list.
# NOTE(review): `df` is created by a cell that appears further down, so this
# cell fails on a fresh top-to-bottom run.
df.columns.values.tolist()

['page', 'title', 'text', 'key_phrase']

In [27]:
import pandas as pd
# Load the cleaned corpus to be indexed; swap in the commented-out line to
# load the Shimizu corpus instead.
df = pd.read_csv('docomo_cleaned.csv')
# df = pd.read_csv('Shimizu_cleaned.csv') 

In [13]:
# Inspect the column index of the loaded frame.
df.columns

Index(['Unnamed: 0', 'page', 'title', 'text'], dtype='object')

In [30]:
# Count the missing values in each column.
df.isnull().sum()

Unnamed: 0    0
page          0
title         0
text          0
key_phrase    0
dtype: int64

In [29]:
# Replace every missing value with the string '0' before the rows are indexed.
df = df.fillna('0')

In [17]:
# Column names that filterKeys copies into each document (every column of df
# here; the commented-out line restricts them to a fixed subset).
use_these_keys = df.columns.values.tolist()
# use_these_keys = ['page', 'title', 'text']
def filterKeys(document, keys=None):
    """Return a dict holding only the selected fields of ``document``.

    Args:
        document: Mapping-like row (for example a pandas Series) that can be
            indexed by field name.
        keys: Iterable of field names to keep. When omitted, the
            notebook-level ``use_these_keys`` list is used.

    Returns:
        dict mapping each selected key to ``document[key]``.

    Raises:
        KeyError: if a selected key is not present in ``document``.
    """
    if keys is None:
        # Fall back to the notebook-level selection so existing callers
        # that pass only the document keep working.
        keys = use_these_keys
    return {key: document[key] for key in keys}

In [31]:
def doc_generator(df, index_name):
    """Yield one bulk-indexing action per row of ``df``.

    Args:
        df: DataFrame whose rows become documents. It must contain a ``page``
            column (used as the document id) and every column that
            ``filterKeys`` selects.
        index_name: Name of the index the documents are written to.

    Yields:
        dict with ``_index``, ``_type``, ``_id`` and ``_source`` keys, the
        action shape passed to ``helpers.bulk``.
    """
    for _, document in df.iterrows():
        yield {
            "_index": index_name,
            "_type": "_doc",
            # The page value, rendered as a string, is the document id.
            "_id": f"{document['page']}",
            "_source": filterKeys(document),
        }


In [26]:
# Bulk-index every row of df into the 'shimuzu' index.
# NOTE(review): the index is spelled 'shimuzu' while the corpus file is
# 'Shimizu_cleaned.csv' — confirm the spelling is intended.
helpers.bulk(es, doc_generator(df, 'shimuzu'))



(2957, [])

In [32]:
# Bulk-index every row of df into the 'docomo' index.
helpers.bulk(es, doc_generator(df, 'docomo'))

(1364, [])

### Preprocessing: the cells below build the cleaned CSV that the indexing cells above load

In [2]:
import pandas as pd
import MeCab
import re
import random
from pathlib import Path


# Shared MeCab tagger used by mecab_tokenization; with '-Owakati' parse()
# returns the tokens as one whitespace-separated string.
mecab = MeCab.Tagger('-Owakati')

In [1]:
def single_character_remover(text):
    """Drop one-character tokens that are kana, ASCII letters or ASCII digits.

    ``text`` is split on whitespace. Tokens of two or more characters are
    kept unchanged. A one-character token is kept only if it is not hiragana
    (ぁ-ん), katakana (ァ-ン), an ASCII letter or an ASCII digit.

    Args:
        text: Whitespace-separated token string.

    Returns:
        The surviving tokens joined by single spaces, with no leading,
        trailing or doubled spaces where tokens were removed.
    """
    kept = []
    for token in text.split():
        if len(token) < 2:
            token = re.sub(r'[ぁ-んァ-ンA-Za-z0-9]', '', token)
        # A removed token becomes '', so skip it rather than joining an
        # empty string, which would leave stray spaces in the output.
        if token:
            kept.append(token)
    return ' '.join(kept)

def get_stop_word_ja(stop_word_file=None):
    """Load the Japanese stop-word list, one word per line.

    Args:
        stop_word_file: Optional path to a UTF-8 text file with one stop word
            per line. When omitted, the fixed ``stop_word_ja.txt`` path below
            is used.

    Returns:
        list of str, one entry per line of the file, without line endings.

    Raises:
        OSError: if the file cannot be opened.
    """
    if stop_word_file is None:
        # NOTE(review): machine-specific absolute path; pass stop_word_file
        # explicitly when running elsewhere.
        stop_word_file = "/home/iftekhar/amiecore_fresh/amieCore/amie_core/core/tokenizer/stopwords/stop_word_ja.txt"
    with open(Path(stop_word_file), encoding='utf-8') as f:
        return f.read().splitlines()

def mecab_tokenization(text):
    """Tokenize ``text`` with MeCab and drop Japanese stop words.

    Args:
        text: Raw string to tokenize.

    Returns:
        The tokens that are not in the stop-word list, joined by single
        spaces.
    """
    # Load the stop words once, outside the filter, so the file is not
    # re-read for every token; a set gives constant-time membership tests.
    stop_words = set(get_stop_word_ja())
    tokens = mecab.parse(text).split()
    return ' '.join(token for token in tokens if token not in stop_words)

def cleaner(text):
    """Run ``clean_text`` on each whitespace-separated token of ``text``.

    Tokens that are empty or whitespace-only after cleaning are dropped.

    Args:
        text: Whitespace-separated token string.

    Returns:
        The cleaned, non-empty tokens joined by single spaces.
    """
    collector = []
    for token in text.split():
        cleaned = clean_text(token)
        # clean_text can leave only spaces behind (a run of punctuation is
        # turned into one space), so test the whitespace-free remainder.
        if re.sub(r"\s+", '', cleaned):
            collector.append(cleaned)
    return ' '.join(collector)

def clean_text(text):
    """Strip punctuation, symbols, digits and a few full-width letters.

    Steps, in order:
      1. delete every backslash, ``+`` and ``_``;
      2. replace each run of non-word characters with one space;
      3. delete each character listed in ``leftovers``;
      4. delete every run of digits.

    Args:
        text: String to clean.

    Returns:
        The cleaned string. It may still contain spaces introduced by
        step 2.
    """
    replaced = text.replace("\\", "").replace("+", "").replace("_", "")
    # Raw string: '\W' in a plain literal is an invalid escape sequence.
    replaced = re.sub(r'\W+', ' ', replaced)
    # Full-width / half-width symbols plus the full-width letters ｔ, ｋ, ｄ.
    leftovers = '￥．｣｢～｜＠？％＝！｝：－･ｔｋｄ'
    replaced = replaced.translate(str.maketrans('', '', leftovers))
    return re.sub(r'\d+', '', replaced)

def corpus_preprocessing(dataset):
    """Tokenize and clean the ``text`` and ``title`` columns of ``dataset``.

    Each column is passed, value by value, through ``mecab_tokenization``,
    then ``cleaner``, then ``single_character_remover``. The columns are
    reassigned on ``dataset`` itself.

    Args:
        dataset: DataFrame with ``text`` and ``title`` columns of strings.

    Returns:
        The same ``dataset`` object with both columns replaced.
    """
    stages = (mecab_tokenization, cleaner, single_character_remover)
    for column in ('text', 'title'):
        for stage in stages:
            # Attribute access mirrors dataset.text / dataset.title.
            setattr(dataset, column, getattr(dataset, column).apply(stage))
    return dataset

In [6]:
import pandas as pd

In [7]:
# Load the merged Shimizu corpus that the preprocessing below is applied to.
# NOTE(review): machine-specific absolute path — adjust before re-running elsewhere.
df = pd.read_csv('/home/iftekhar/amiebot/Resources/amiebot_dataset/shimizu/data/merged_corpus.csv')

In [8]:
# Preview the first rows of the loaded corpus.
df.head()

Unnamed: 0,page,title,text
0,0,【施工管理標準】ME-10　冷媒用銅管_TAG_設備工事_空調設備_給排水衛生設備.pdf,施工管理標準 ME 冷媒用銅管 接合 保温 年 月 日 凡例 赤の文字 ...
1,1,【施工管理標準】ME-10　冷媒用銅管_TAG_設備工事_空調設備_給排水衛生設備.pdf,施工管理の内容 同 解説 業者の 検査を 確認 清水 建設 検査 記録方法 保管...
2,2,【施工管理標準】ME-10　冷媒用銅管_TAG_設備工事_空調設備_給排水衛生設備.pdf,施工管理の内容 同 解説内は外れた場合の処置 業者の 検査を 確認 清水 建設 検査...
3,3,【施工管理標準】ME-10　冷媒用銅管_TAG_設備工事_空調設備_給排水衛生設備.pdf,ロックジョイント おっぞんくん RGジョイント ファイアレスジョイント ARジョイ...
4,4,【施工管理標準】ME-10　冷媒用銅管_TAG_設備工事_空調設備_給排水衛生設備.pdf,施工管理の内容 同 解説内は外れた場合の処置 業者の 検査を 確認 清水 建設 検査...


In [9]:
# Tokenize and clean the text and title columns; corpus_preprocessing
# reassigns the columns on df itself and returns the same frame.
df = corpus_preprocessing(df)

In [10]:
# Save the cleaned corpus to the working directory.
# NOTE(review): to_csv writes the row index as an unnamed first column by
# default, which is presumably where the 'Unnamed: 0' column seen after
# reloading comes from — consider index=False.
df.to_csv('Shimizu_cleaned.csv')