# 言語処理100本ノック

https://nlp100.github.io/ja/

## 第2章: UNIXコマンド

In [15]:
!curl -sO https://nlp100.github.io/data/popular-names.txt
!ls popular-names.txt

popular-names.txt


In [16]:
# 10. 行数のカウント

print('##### Python #####')
with open('popular-names.txt', 'r') as f:
    print(len(f.readlines()))

print('##### Shell #####')
!wc -l popular-names.txt

##### Python #####
2780
##### Shell #####
2780 popular-names.txt


In [17]:
# タブをスペースに置換

print('##### Python #####')
with open('popular-names.txt', 'r') as f:
    s = f.read().replace('\t', ' ')
with open('popular-names-space-python.txt', 'w') as f:
    f.write(s)

print('##### Shell #####')
!sed -e 's/\t/ /g' popular-names.txt > popular-names-space-shell.txt

print('##### Test - diff #####')
!diff popular-names-space-python.txt popular-names-space-shell.txt

print('##### Test - head #####')
!head popular-names-space-python.txt

##### Python #####
##### Shell #####
##### Test - diff #####
##### Test - head #####
Mary F 7065 1880
Anna F 2604 1880
Emma F 2003 1880
Elizabeth F 1939 1880
Minnie F 1746 1880
Margaret F 1578 1880
Ida F 1472 1880
Alice F 1414 1880
Bertha F 1320 1880
Sarah F 1288 1880


In [18]:
# 12. 1列目をcol1.txtに，2列目をcol2.txtに保存

print('##### Python #####')
col1 = []
col2 = []

with open('popular-names.txt', 'r') as f:
    for l in f.readlines():
        ls = l.split('\t')
        col1.append(ls[0])
        col2.append(ls[1])

with open('col1.txt', 'w') as f:
    f.write('\n'.join(col1))

with open('col2.txt', 'w') as f:
    f.write('\n'.join(col2))

print('##### Shell #####')
!cut -f 1 popular-names.txt > col1-shell.txt
!cut -f 2 popular-names.txt > col2-shell.txt

print('##### Test - diff - col1 #####')
!diff col1.txt col1-shell.txt

print('##### Test - diff - col2 #####')
!diff col2.txt col2-shell.txt

print('##### Test - head - col1 #####')
!head col1.txt

print('##### Test - head - col2 #####')
!head col2.txt

##### Python #####
##### Shell #####
##### Test - diff - col1 #####
2780c2780
< Logan
\ No newline at end of file
---
> Logan
##### Test - diff - col2 #####
2780c2780
< M
\ No newline at end of file
---
> M
##### Test - head - col1 #####
Mary
Anna
Emma
Elizabeth
Minnie
Margaret
Ida
Alice
Bertha
Sarah
##### Test - head - col2 #####
F
F
F
F
F
F
F
F
F
F


In [19]:
# 13. col1.txtとcol2.txtをマージ

print('##### Python #####')
with open('col1.txt', 'r') as f:
    c1 = f.readlines()

with open('col2.txt', 'r') as f:
    c2 = f.readlines()

cm = []
for i in range(len(c1)):
    cm.append('{}\t{}'.format(c1[i].strip(), c2[i].strip()))

with open('col1-col2-python.txt', 'w') as f:
    f.write('\n'.join(cm))

print('##### Shell #####')
!paste col1.txt col2.txt > col1-col2-shell.txt

print('##### Test - diff #####')
!diff col1-col2-python.txt col1-col2-shell.txt

print('##### Test - head #####')
!head col1-col2-python.txt

##### Python #####
##### Shell #####
##### Test - diff #####
2780c2780
< Logan	M
\ No newline at end of file
---
> Logan	M
##### Test - head #####
Mary	F
Anna	F
Emma	F
Elizabeth	F
Minnie	F
Margaret	F
Ida	F
Alice	F
Bertha	F
Sarah	F


In [20]:
# 14. Print the first N lines

print('##### Python #####')
def head(num, path='popular-names.txt'):
    """Print the first ``num`` lines of a text file.

    Args:
        num: Number of leading lines to print.
        path: File to read; defaults to the data set used in this notebook.
    """
    with open(path, 'r') as f:
        # The lines already end with '\n', so suppress print's own newline;
        # otherwise an extra blank line follows the output, unlike `head`.
        print(''.join(f.readlines()[:num]), end='')

# Show the first two lines using the Python implementation.
head(2)

print('##### Shell #####')
# Reference output from the `head` command for the same line count.
!head -n 2 popular-names.txt

##### Python #####
Mary	F	7065	1880
Anna	F	2604	1880

##### Shell #####
Mary	F	7065	1880
Anna	F	2604	1880


In [21]:
# 15. Print the last N lines

print('##### Python #####')
def tail(num, path='popular-names.txt'):
    """Print the last ``num`` lines of a text file.

    Args:
        num: Number of trailing lines to print. Zero or a negative value
            prints nothing.
        path: File to read; defaults to the data set used in this notebook.
    """
    with open(path, 'r') as f:
        lines = f.readlines()
    # lines[-0:] is the whole list, so a non-positive count needs an explicit
    # guard to print nothing.
    selected = lines[-num:] if num > 0 else []
    # The lines already end with '\n', so suppress print's own newline;
    # otherwise an extra blank line follows the output, unlike `tail`.
    print(''.join(selected), end='')

# Show the last two lines using the Python implementation.
tail(2)

print('##### Shell #####')
# Reference output from the `tail` command for the same line count.
!tail -n 2 popular-names.txt

##### Python #####
Mason	M	12435	2018
Logan	M	12352	2018

##### Shell #####
Mason	M	12435	2018
Logan	M	12352	2018


In [22]:
# 16. Split the file into N parts

print('##### Python #####')
# more_itertools is a third-party package (not part of the standard library).
from more_itertools import chunked

def split_(num, path='popular-names.txt', prefix='popular-names-split-python-'):
    """Split a text file into exactly ``num`` files of consecutive lines.

    The lines are distributed as evenly as possible: when the line count is
    not divisible by ``num``, the first ``len(lines) % num`` files receive one
    extra line. Output files are named ``prefix`` followed by a zero-padded
    two-digit index (00, 01, ...). If ``num`` exceeds the number of lines, the
    trailing files are created empty so that ``num`` files always exist.

    Args:
        num: Number of output files; must be a positive integer.
        path: File to split; defaults to the data set used in this notebook.
        prefix: Prefix of the output file names.

    Raises:
        ValueError: If ``num`` is less than 1.
    """
    if num < 1:
        raise ValueError('num must be a positive integer')

    with open(path, 'r') as f:
        lines = f.readlines()

    # base lines per file, plus one extra line for the first `extra` files.
    base, extra = divmod(len(lines), num)
    start = 0
    for i in range(num):
        stop = start + base + (1 if i < extra else 0)
        with open(prefix + str(i).zfill(2), 'w') as out:
            out.writelines(lines[start:stop])
        start = stop

# Split the data set into five files with the Python implementation.
split_(5)

print('##### Shell #####')
# `split -l` takes a number of lines per output file, so the total line count
# (first field of `wc -l`) is divided by 5 first; -d selects numeric suffixes.
!split -d -l $(( $(wc -l popular-names.txt | cut -d ' ' -f 1) / 5 )) popular-names.txt popular-names-split-shell-

# List the files produced by both implementations.
print('##### Test - ls #####')
!ls -l popular-names-split-*

# Checksums: matching values for the python-NN / shell-NN pairs indicate
# identical contents.
print('##### Test - sum #####')
!sum -r popular-names-split-*

# Line counts of every part.
print('##### Test - lines #####')
!wc -l popular-names-split-*

print('##### Test - head #####')
!head popular-names-split-python-00

##### Python #####
##### Shell #####
##### Test - ls #####
-rw-r--r-- 1 root root 10423 Apr 14 13:37 popular-names-split-python-00
-rw-r--r-- 1 root root 11016 Apr 14 13:37 popular-names-split-python-01
-rw-r--r-- 1 root root 11013 Apr 14 13:37 popular-names-split-python-02
-rw-r--r-- 1 root root 11304 Apr 14 13:37 popular-names-split-python-03
-rw-r--r-- 1 root root 11270 Apr 14 13:37 popular-names-split-python-04
-rw-r--r-- 1 root root 10423 Apr 14 13:37 popular-names-split-shell-00
-rw-r--r-- 1 root root 11016 Apr 14 13:37 popular-names-split-shell-01
-rw-r--r-- 1 root root 11013 Apr 14 13:37 popular-names-split-shell-02
-rw-r--r-- 1 root root 11304 Apr 14 13:37 popular-names-split-shell-03
-rw-r--r-- 1 root root 11270 Apr 14 13:37 popular-names-split-shell-04
##### Test - sum #####
32580    11 popular-names-split-python-00
50306    11 popular-names-split-python-01
39088    11 popular-names-split-python-02
15531    12 popular-names-split-python-03
26989    12 popular-names-split-pyt

In [26]:
# 17. １列目の文字列の異なり

print('##### Python #####')
with open('popular-names.txt', 'r') as f:
    uniq = {l.split()[0] for l in f.readlines()}

with open('popular-names-col1-uniq-python.txt', 'w') as f:
    f.write('\n'.join(sorted(list(uniq))))

print('##### Shell #####')
!cut -f 1 popular-names.txt | sort | uniq > popular-names-col1-uniq-shell.txt

print('##### Test - diff #####')
!diff popular-names-col1-uniq-python.txt popular-names-col1-uniq-shell.txt

print('##### Test - head #####')
!head popular-names-col1-uniq-python.txt

##### Python #####
##### Shell #####
##### Test - diff #####
136c136
< William
\ No newline at end of file
---
> William
##### Test - head #####
Abigail
Aiden
Alexander
Alexis
Alice
Amanda
Amelia
Amy
Andrew
Angela


In [24]:
# 18. 各行を3コラム目の数値の降順にソート

# この問題はコマンドで実行した時の結果と合わなくてもよい

print('##### Python #####')
with open('popular-names.txt', 'r') as f:
    orig = f.readlines()
    sorted_ = sorted(orig, key=lambda x: int(x.split()[2]), reverse=True)
    print(''.join(sorted_[:10]))

print('##### Shell #####')
!sort -k 3 -r -n popular-names.txt | head

##### Python #####
Linda	F	99689	1947
Linda	F	96211	1948
James	M	94757	1947
Michael	M	92704	1957
Robert	M	91640	1947
Linda	F	91016	1949
Michael	M	90656	1956
Michael	M	90517	1958
James	M	88584	1948
Michael	M	88528	1954

##### Shell #####
Linda	F	99689	1947
Linda	F	96211	1948
James	M	94757	1947
Michael	M	92704	1957
Robert	M	91640	1947
Linda	F	91016	1949
Michael	M	90656	1956
Michael	M	90517	1958
James	M	88584	1948
Michael	M	88528	1954


In [25]:
# 19. 各行の1コラム目の文字列の出現頻度を求め，出現頻度の高い順に並べる

print('##### Python #####')
import collections

with open('popular-names.txt', 'r') as f:
    names = [l.split()[0] for l in f.readlines()]
    c = collections.Counter(names)
    sorted_ = c.most_common()

res = '\n'.join(['{:>7} {}'.format(i[1], i[0]) for i in sorted_])

with open('popular-names-col1-uniq-c-python.txt', 'w') as f:
    f.write(res)

print('##### Shell #####')
!cut -f 1 popular-names.txt | sort | uniq -c | sort -n -r > popular-names-col1-uniq-c-shell.txt

print('##### Test - head - python #####')
!head popular-names-col1-uniq-c-python.txt

print('##### Test - head - shell #####')
!head popular-names-col1-uniq-c-shell.txt

##### Python #####
##### Shell #####
##### Test - head - python #####
    118 James
    111 William
    108 John
    108 Robert
     92 Mary
     75 Charles
     74 Michael
     73 Elizabeth
     70 Joseph
     60 Margaret
##### Test - head - shell #####
    118 James
    111 William
    108 Robert
    108 John
     92 Mary
     75 Charles
     74 Michael
     73 Elizabeth
     70 Joseph
     60 Margaret
