# 顔文字や絵文字、URLを除去する

### 参考資料
https://qiita.com/dcm_murakami/items/4c016936a739bfb2a517

https://upura.hatenablog.com/entry/2018/09/18/203540

In [1]:
!pip install nagisa emoji

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting nagisa
  Downloading nagisa-0.2.8-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (21.6 MB)
[K     |████████████████████████████████| 21.6 MB 1.6 MB/s 
[?25hCollecting emoji
  Downloading emoji-2.2.0.tar.gz (240 kB)
[K     |████████████████████████████████| 240 kB 67.3 MB/s 
[?25hCollecting DyNet38
  Downloading dyNET38-2.1-cp38-cp38-manylinux1_x86_64.whl (4.5 MB)
[K     |████████████████████████████████| 4.5 MB 40.2 MB/s 
Building wheels for collected packages: emoji
  Building wheel for emoji (setup.py) ... [?25l[?25hdone
  Created wheel for emoji: filename=emoji-2.2.0-py3-none-any.whl size=234926 sha256=bfc4a067e70b0edf43d22f26b469dfcd9f541eae2cda18bb53d471ad75f2203b
  Stored in directory: /root/.cache/pip/wheels/86/62/9e/a6b27a681abcde69970dbc0326ff51955f3beac72f15696984
Successfully built emoji
Installing collected pac

In [2]:
%cd '/content/drive/MyDrive/grad_comp/data'

/content/drive/MyDrive/grad_comp/data


In [3]:
import nagisa
import unicodedata
import re

# Minimum length (in characters) an auxiliary-symbol token must have for
# extract_kaomoji to treat it as a kaomoji (emoticon) face.
KAOMOJI_LEN = 5

def extract_kaomoji(text):
    """Return the list of kaomoji (emoticons) found in ``text``.

    Tokens that nagisa tags as '補助記号' (auxiliary symbol) are extracted, and
    every such token whose length is at least ``KAOMOJI_LEN`` is treated as a
    kaomoji face. If the token immediately before or after it in the extracted
    list is a known "hand" character, that hand is attached, so that forms
    such as ＼(^o^)／ or m(_ _)m are returned whole.

    Args:
        text: Input string. The examples in this file NFKC-normalize it
            before calling this function.

    Returns:
        A list of kaomoji strings in order of appearance; empty when no
        sufficiently long auxiliary-symbol token is found.
    """
    words = nagisa.extract(text, extract_postags=['補助記号']).words
    # Characters that may act as the left or right "hand" of a kaomoji.
    hands = {'ノ', 'ヽ', '∑', 'm', 'O', 'o', '┐', '/', '\\', '┌'}
    last = len(words) - 1

    kaomoji_words = []
    for i, word in enumerate(words):
        if len(word) < KAOMOJI_LEN:
            continue
        kaomoji = word
        # Left hand: check the index first so words[-1] is never consulted.
        if i > 0 and words[i - 1] in hands:
            kaomoji = words[i - 1] + kaomoji
        # Right hand: explicit bounds check instead of catching IndexError.
        if i < last and words[i + 1] in hands:
            kaomoji = kaomoji + words[i + 1]
        kaomoji_words.append(kaomoji)
    return kaomoji_words

def remove_kaomoji(text):
    """Return ``text`` with kaomoji (emoticon) symbol tokens removed.

    The text is tokenized with nagisa. Every surface form that is tagged as
    '補助記号' (auxiliary symbol) somewhere in the text is dropped from the
    output, except for ordinary punctuation ('、', '。', '...', '?', '!'),
    which is kept. The remaining tokens are concatenated without separators.

    Args:
        text: Input string. The caller in this file strips whitespace and
            NFKC-normalizes it first.

    Returns:
        The re-joined text without the removed symbol tokens.
    """
    # NOTE(review): a single tagging pass replaces the former
    # nagisa.extract + nagisa.tagging pair; this relies on nagisa.extract
    # being a POS-tag filter over the tagging result — confirm against the
    # nagisa version in use.
    tokens = nagisa.tagging(text)
    # Punctuation that is tagged as an auxiliary symbol but must be kept.
    keep_symbols = {'、', '。', '...', '?', '!'}
    # Set of surface forms to drop; a set gives O(1) membership tests.
    kaomoji = {
        word
        for word, postag in zip(tokens.words, tokens.postags)
        if postag == '補助記号' and word not in keep_symbols
    }
    return ''.join(word for word in tokens.words if word not in kaomoji)

# Demo: NFKC-normalize each sample sentence, then print the kaomoji found.
samples = (
    "今日は渋谷スクランブルスクエアに行ってきた＼(^o^)／ 夜景🏙サイコー❗️ https://hogehogehogehoge.jpg",
    "ごめんなさいm(-_-)m",
)
for sample in samples:
    text = unicodedata.normalize('NFKC', sample)
    print(extract_kaomoji(text))
# Expected output:
# ['\\(^o^)/']
# ['m(-_-)m']

['\\(^o^)/']
['m(-_-)m']


In [57]:
# File processing: clean every dataset split and write the result to
# ../preprocess/text.prep_<split>.txt, one cleaned line per input line.
filename = ['train','dev','test']

for name in filename:
    src_path = 'text.' + name + '.txt'
    dst_path = '../preprocess/text.prep_' + name + '.txt'
    # Explicit UTF-8 so reading/writing Japanese text does not depend on the
    # platform's locale default (assumes the data files are UTF-8 — confirm).
    with open(src_path, encoding='utf-8') as fr, \
            open(dst_path, mode='w', encoding='utf-8') as fw:
        for line in fr:
            # Drop all whitespace, including the trailing newline.
            line = re.sub(r'\s+', '', line)
            line = unicodedata.normalize('NFKC', line)
            re_line = remove_kaomoji(line)  # strip kaomoji symbol tokens
            fw.write(re_line + '\n')