# 言語処理100本ノック

https://nlp100.github.io/ja/

## 第1章 準備運動

In [0]:
# 00. 文字列の逆順
# スライスは[start:stop:step]の形で範囲や増分を指定する。
# start, stopを省略すると全体を選択し、stepを-1とすると後ろから一つずつ要素を
# 取得することになるので[::-1]とすると逆順に並べ替えられたオブジェクトが取得できる。

s = 'stressed'
print(s[::-1])

desserts


In [0]:
# 01. 「パタトクカシーー」

s = 'パタトクカシーー'
print(s[::2])

パトカー


In [0]:
# 02. 「パトカー」＋「タクシー」＝「パタトクカシーー」

p = 'パトカー'
t = 'タクシー'

s = ''
for i in range(4):
    s += p[i] + t[i]
print(s)

パタトクカシーー


In [0]:
# 03. 円周率

import re

s = 'Now I need a drink, alcoholic of course, after the heavy lectures involving quantum mechanics.'

print([len(n) for n in re.findall(r"[\w']+", s)])

[3, 1, 4, 1, 5, 9, 2, 6, 5, 3, 5, 8, 9, 7, 9]


In [0]:
# 04. 元素記号

import re

s = 'Hi He Lied Because Boron Could Not Oxidize Fluorine. New Nations Might Also Sign Peace Security Clause. Arthur King Can.'
index = [1, 5, 6, 7, 8, 9, 15, 16, 19]

ws = re.findall(r"[\w']+", s)

d = {}
for i, w in enumerate(ws):
    if i+1 in index:
        d[w[0]] = i+1
    else:
        d[w[:2]] = i+1

print(d)

{'H': 1, 'He': 2, 'Li': 3, 'Be': 4, 'B': 5, 'C': 6, 'N': 7, 'O': 8, 'F': 9, 'Ne': 10, 'Na': 11, 'Mi': 12, 'Al': 13, 'Si': 14, 'P': 15, 'S': 16, 'Cl': 17, 'Ar': 18, 'K': 19, 'Ca': 20}


In [0]:
# 05. n-gram

from more_itertools import chunked
import nltk
nltk.download("popular", quiet=True)

s = 'I am an NLPer'

cs = s.replace(' ', '')
ws = nltk.word_tokenize(s)

def n_gram(num, str_):
    return chunked(str_, num)

print(list(n_gram(2, cs)))
print(list(n_gram(2, ws)))

[['I', 'a'], ['m', 'a'], ['n', 'N'], ['L', 'P'], ['e', 'r']]
[['I', 'am'], ['an', 'NLPer']]


In [0]:
# 06. 集合

s1 = 'paraparaparadise'
s2 = 'paragraph'
s3 = 'se'

X = set(tuple(ws) for ws in n_gram(2, s1))
Y = set(tuple(ws) for ws in n_gram(2, s2))
Z = set(tuple(ws) for ws in n_gram(2, s3)).pop()

print('X: {}'.format(X))
print('Y: {}'.format(Y))
print('Z: {}'.format(Z))

print('Union: {}'.format(X | Y))
print('Intersection: {}'.format(X & Y))
print('Difference: {}'.format(X - Y))
print('"se" in X: {}'.format(Z in X))
print('"se" in Y: {}'.format(Z in Y))

X: {('d', 'i'), ('p', 'a'), ('r', 'a'), ('s', 'e')}
Y: {('h',), ('g', 'r'), ('a', 'p'), ('p', 'a'), ('r', 'a')}
Z: ('s', 'e')
Union: {('h',), ('g', 'r'), ('s', 'e'), ('d', 'i'), ('p', 'a'), ('a', 'p'), ('r', 'a')}
Intersection: {('p', 'a'), ('r', 'a')}
Difference: {('s', 'e'), ('d', 'i')}
"se" in X: True
"se" in Y: False


In [0]:
# 07. テンプレートによる文生成

x = 12
y = '気温'
z = 22.4

def templ(x, y, z):
    return '{}時の{}は{}'.format(x, y, z)

print(templ(x, y, z))

12時の気温は22.4


In [0]:
# 08. 暗号文

def cipher(str_):
    res = ''
    for c in str_:
        if c.islower():
            res += chr(219 - ord(c))
        else:
            res += c
    return res

s = 'Hi He Lied Because Boron Could Not Oxidize Fluorine. New Nations Might Also Sign Peace Security Clause. Arthur King Can.'

s_en = cipher(s)
s_de = cipher(s_en)

print(s_en)
print(s_de)

Hr Hv Lrvw Bvxzfhv Blilm Clfow Nlg Ocrwrav Foflirmv. Nvd Nzgrlmh Mrtsg Aohl Srtm Pvzxv Svxfirgb Cozfhv. Aigsfi Krmt Czm.
Hi He Lied Because Boron Could Not Oxidize Fluorine. New Nations Might Also Sign Peace Security Clause. Arthur King Can.


In [0]:
# 09. Typoglycemia

import random

s = 'I couldn’t believe that I could actually understand what I was reading : the phenomenal power of the human mind .'

ss = s.split()

def shuffle(str_):
    return str_[0] + ''.join(random.sample(str_, len(str_) - 2)) + str_[-1]

res = []
for w in ss:
    if len(w) <= 4:
        res.append(w)
    else:
        res.append(shuffle(w))

print(res)

['I', 'cn’lcott', 'bleveee', 'that', 'I', 'cucld', 'aulltayy', 'urdnnetsud', 'what', 'I', 'was', 'rrginag', ':', 'the', 'palonenpel', 'ppeor', 'of', 'the', 'hmunn', 'mind', '.']
