import libraries

In [5]:
import pandas as pd

from janome.tokenizer import Tokenizer
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# 特徴量エンジニアリング

## 特徴量エンジニアリングとは

### 質的変数の処理
- LabelEncoding
- OneHotEncoding

In [10]:
# Toy order table with one nominal column (drink) and one ordinal column (size).
order_columns = {
    "drink": ["Cola", "Cola", "Green Tea", "Milk"],
    "size": ["S", "M", "L", "M"],
}
df = pd.DataFrame(order_columns)
df.head()

Unnamed: 0,drink,size
0,Cola,S
1,Cola,M
2,Green Tea,L
3,Milk,M


順序特徴量の変換

In [12]:
# List the distinct labels in the `size` column before encoding them.
df["size"].unique()

array(['S', 'M', 'L'], dtype=object)

In [13]:
# The sizes are ordinal (S < M < L), so each label is encoded by its rank.
size_labels_in_order = ["S", "M", "L"]
size2int = {label: rank for rank, label in enumerate(size_labels_in_order)}
df["size"] = df["size"].map(size2int)
df.head()

Unnamed: 0,drink,size
0,Cola,0
1,Cola,1
2,Green Tea,2
3,Milk,1


In [14]:
# Check the distinct values of `size` again after the label-to-integer mapping.
df["size"].unique()

array([0, 1, 2])

名義特徴量の変換

In [17]:
# Learn the set of drink names first, then assign each row its integer label.
encoder = LabelEncoder()
encoder.fit(df["drink"])
df["drinklabel"] = encoder.transform(df["drink"])
df.head()

Unnamed: 0,drink,size,drinklabel
0,Cola,0,0
1,Cola,1,0
2,Green Tea,2,1
3,Milk,1,2


`LabelEncoding`をすると、カテゴリ変数を数値に変換できるが、順序を持つ値になってしまう（変換された値の大きさには意味はない）  
そこで、`OneHotEncoding`をする

In [19]:
# One-hot encode the nominal `drink` column (the label-encoded copy is dropped
# first so it is not carried into the result).
# dtype=int keeps the indicator columns as 0/1 integers on every pandas
# version; since pandas 2.0 the default indicator dtype is bool.
pd.get_dummies(df.drop(columns="drinklabel"), dtype=int)

Unnamed: 0,size,drink_Cola,drink_Green Tea,drink_Milk
0,0,1,0,0
1,1,1,0,0
2,2,0,1,0
3,1,0,0,1


### 量的変数の処理
- 二値化 binarization
- 丸め rounding

In [20]:
# Small sample of fares used to demonstrate binarization and rounding.
fare_values = [7.2500, 71.2833, 7.9250, 53.1000]
df = pd.DataFrame({"Fare": fare_values})
df.head()

Unnamed: 0,Fare
0,7.25
1,71.2833
2,7.925
3,53.1


二値化

In [21]:
# Boolean mask: True where the fare is strictly greater than 10.
df["Fare"].gt(10)

0    False
1     True
2    False
3     True
Name: Fare, dtype: bool

In [22]:
# Binarize: cast the "fare is greater than 10" mask to 0/1 integers.
df["Fare"].gt(10).astype(int)

0    0
1    1
2    0
3    1
Name: Fare, dtype: int64

丸め

In [23]:
# Round each fare to the nearest whole number, then store it as an integer column.
rounded_fare = df["Fare"].round()
df["Fareint"] = rounded_fare.astype(int)
df.head()

Unnamed: 0,Fare,Fareint
0,7.25,7
1,71.2833,71
2,7.925,8
3,53.1,53


## テキストのベクトル表現
2つのステップからなる  
1. 単語分割 -> §4.2参照(形態素解析)
2. ベクトル化 -> 本節
    1. N-gramベクトル
    2. シーケンスベクトル

### N-gramベクトル
連続するn個のトークン（単語や文字など）をn-gramという  
- n=1 : uni-gram (この場合のみBoWと呼ばれる)
- n=2 : bi-gram

In [3]:
# NOTE: `vectorizer` is created by the CountVectorizer(binary=True) cell that
# follows in this notebook; that cell must have been run before this one.
type(vectorizer)

sklearn.feature_extraction.text.CountVectorizer

#### OneHot-EncodingによるBoW(Bag-of-Ngrams) -> CountVectorizer(binary=True)

In [2]:
# Two toy documents to vectorize.
docs = ["the cat is out of the bag.", "dogs are"]
# binary=True -> term frequency is ignored (each term is only marked present/absent).
vectorizer = CountVectorizer(binary=True)
bow = vectorizer.fit_transform(docs)
type(bow)

scipy.sparse.csr.csr_matrix

In [30]:
# Show the learned vocabulary and the dense document-term matrix.
print(vectorizer.vocabulary_) # term -> column index; indices follow the alphabetical order of the terms
print(bow.toarray())

{'the': 7, 'cat': 2, 'is': 4, 'out': 6, 'of': 5, 'bag': 1, 'dogs': 3, 'are': 0}
[[0 1 1 0 1 1 1 1]
 [1 0 0 1 0 0 0 0]]


#### CountEncodingによるBoW(Bag-of-Ngrams) -> CountVectorizer(binary=False)

In [31]:
# Same toy documents; this time each cell holds the term's occurrence count.
docs = ["the cat is out of the bag.", "dogs are"]
# binary=False -> keep raw term counts instead of presence/absence flags.
vectorizer = CountVectorizer(binary=False)
bow = vectorizer.fit_transform(docs)
type(bow)

scipy.sparse.csr.csr_matrix

In [32]:
# Show the learned vocabulary and the dense count matrix.
print(vectorizer.vocabulary_) # term -> column index; indices follow the alphabetical order of the terms
print(bow.toarray())

{'the': 7, 'cat': 2, 'is': 4, 'out': 6, 'of': 5, 'bag': 1, 'dogs': 3, 'are': 0}
[[0 1 1 0 1 1 1 2]
 [1 0 0 1 0 0 0 0]]


#### tf-idf(TermFrequency-InverseDocumentFrequency)によるBoW(Bag-of-Ngrams) -> TfidfVectorizer

In [36]:
# Same toy documents, weighted by tf-idf instead of raw counts.
docs = ["the cat is out of the bag.", "dogs are"]
vectorizer = TfidfVectorizer()
tfidf = vectorizer.fit_transform(docs)
type(tfidf)

scipy.sparse.csr.csr_matrix

In [37]:
# Show the learned vocabulary and the dense tf-idf matrix.
print(vectorizer.vocabulary_) # term -> column index; indices follow the alphabetical order of the terms
print(tfidf.toarray())

{'the': 7, 'cat': 2, 'is': 4, 'out': 6, 'of': 5, 'bag': 1, 'dogs': 3, 'are': 0}
[[0.         0.33333333 0.33333333 0.         0.33333333 0.33333333
  0.33333333 0.66666667]
 [0.70710678 0.         0.         0.70710678 0.         0.
  0.         0.        ]]


In [41]:
# Label the tf-idf columns with their terms and round for readability.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() (scikit-learn >= 1.0) returns the terms in column order.
vocab = vectorizer.get_feature_names_out()
pd.DataFrame(tfidf.toarray(), columns=vocab).round(2)

Unnamed: 0,are,bag,cat,dogs,is,of,out,the
0,0.0,0.33,0.33,0.0,0.33,0.33,0.33,0.67
1,0.71,0.0,0.0,0.71,0.0,0.0,0.0,0.0


#### 日本語の場合
英語と異なり、空白で区切られていないため分かち書きをするための関数を渡す必要がある

In [43]:
# janome tokenizer created with wakati=True; its `tokenize` method is passed
# to CountVectorizer later as the word-segmentation function for Japanese text.
t = Tokenizer(wakati=True)

In [49]:
# Japanese text has no spaces between words, so janome's tokenizer is supplied
# to split each document into tokens before counting.
docs = ["猫の子子猫", "獅子の子子獅子"]
vectorizer = CountVectorizer(tokenizer=t.tokenize)
bow = vectorizer.fit_transform(docs)
bow

<2x5 sparse matrix of type '<class 'numpy.int64'>'
	with 7 stored elements in Compressed Sparse Row format>

In [45]:
# Label the count-matrix columns with their tokens.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() (scikit-learn >= 1.0) returns the tokens in column order.
vocab = vectorizer.get_feature_names_out()
pd.DataFrame(bow.toarray(), columns=vocab)

Unnamed: 0,の,子,子猫,猫,獅子
0,1,1,1,1,0
1,1,2,0,0,2


#### 語順を考慮したテキスト表現

bi-gram

In [51]:
docs = ["the cat is out of the bag.", "dogs are"]
# ngram_range=(min_n, max_n) covers every n from min_n to max_n, both bounds
# included; (2, 2) therefore extracts bi-grams only.
vectorizer = CountVectorizer(ngram_range=(2, 2))
bow = vectorizer.fit_transform(docs)
vectorizer.vocabulary_

{'the cat': 6,
 'cat is': 0,
 'is out': 2,
 'out of': 4,
 'of the': 3,
 'the bag': 5,
 'dogs are': 1}

In [52]:
# Label the bi-gram count columns with their n-grams.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() (scikit-learn >= 1.0) returns the n-grams in column order.
vocab = vectorizer.get_feature_names_out()
pd.DataFrame(bow.toarray(), columns=vocab)

Unnamed: 0,cat is,dogs are,is out,of the,out of,the bag,the cat
0,1,0,1,1,1,1,1
1,0,1,0,0,0,0,0


uni- and bi-grams

In [53]:
docs = ["the cat is out of the bag.", "dogs are"]
# ngram_range=(min_n, max_n) covers every n from min_n to max_n, both bounds
# included; (1, 2) therefore extracts uni-grams and bi-grams together.
vectorizer = CountVectorizer(ngram_range=(1, 2))
bow = vectorizer.fit_transform(docs)
vectorizer.vocabulary_

{'the': 12,
 'cat': 2,
 'is': 6,
 'out': 10,
 'of': 8,
 'bag': 1,
 'the cat': 14,
 'cat is': 3,
 'is out': 7,
 'out of': 11,
 'of the': 9,
 'the bag': 13,
 'dogs': 4,
 'are': 0,
 'dogs are': 5}

In [54]:
# Label the uni-/bi-gram count columns with their n-grams.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() (scikit-learn >= 1.0) returns the n-grams in column order.
vocab = vectorizer.get_feature_names_out()
pd.DataFrame(bow.toarray(), columns=vocab)

Unnamed: 0,are,bag,cat,cat is,dogs,dogs are,is,is out,of,of the,out,out of,the,the bag,the cat
0,0,1,1,1,0,0,1,1,1,1,1,1,2,1,1
1,1,0,0,0,1,1,0,0,0,0,0,0,0,0,0


## ベクトル表現の実践

<span style="color:red"><b>
    ここはscriptを作成した<br>
    なお、前回 Chapter04 と同じデータセット；`Amazon Customer Reviews Dataset - amazon_reviews_multilingual_JP_v1_00.tsv`を使用した<br>
    [サポートサイトのgoogle colab](https://gist.github.com/Hironsan/1f1cc629613cbd7de042a7ce269b91d6)にterminalでDLするためのコードが載っていた
    </b></span>

In [57]:
# Preview the first rows of the Amazon review TSV.
# The file is looked up in data/ first and, failing that, in
# src/scripts/chapter05/data/ (where it lives after the file was moved).
review_tsv_candidates = [
    "data/amazon_reviews_multilingual_JP_v1_00.tsv",
    "src/scripts/chapter05/data/amazon_reviews_multilingual_JP_v1_00.tsv",
]
for review_tsv in review_tsv_candidates:
    try:
        # nrows=5 parses only the rows that are previewed instead of the whole file.
        review_preview = pd.read_csv(review_tsv, delimiter="\t", nrows=5)
    except FileNotFoundError:
        continue
    break
else:
    # Neither location exists: fail with the same exception type as before.
    raise FileNotFoundError(
        "review TSV not found in any of: " + ", ".join(review_tsv_candidates)
    )
# Top-level expression so that the notebook renders the preview.
review_preview

Unnamed: 0,marketplace,customer_id,review_id,product_id,product_parent,product_title,product_category,star_rating,helpful_votes,total_votes,vine,verified_purchase,review_headline,review_body,review_date
0,JP,65317,R33RSUD4ZTRKT7,B000001GBJ,957145596,SONGS FROM A SECRET GARDE,Music,1,1,15,N,Y,残念ながら…,残念ながら…趣味ではありませんでした。ケルト音楽の範疇にも幅があるのですね…,2012-12-05
1,JP,65317,R2U1VB8GPZBBEH,B000YPWBQ2,904244932,鏡の中の鏡‾ペルト作品集(SACD)(Arvo Part:Spiegel im Spiegel),Music,1,4,20,N,Y,残念ながら…,残念ながら…趣味ではありませんでした。正直退屈…眠気も起きない…,2012-12-05
2,JP,65696,R1IBRCJPPGWVJW,B0002E5O9G,108978277,Les Miserables 10th Anniversary Concert,Music,5,2,3,N,Y,ドリームキャスト,素晴らしいパフォーマンス。ミュージカル映画版の物足りない歌唱とは違います。,2013-03-02
3,JP,67162,RL02CW5XLYONU,B00004SRJ5,606528497,It Takes a Nation of Millions to Hold Us Back,Music,5,6,9,N,Y,やっぱりマスト,専門的な事を言わずにお勧めレコメを書きたいのですが、文才が無いので無理でした。ヒップホップが...,2013-08-11
4,JP,67701,R2LA2SS3HU3A3L,B0093H8H8I,509738390,Intel CPU Core I3-3225 3.3GHz 3MBキャッシュ LGA1155...,PC,4,2,4,N,Y,コスパ的には十分,今までの環境（Core2 Duo E4600)に比べれば十分に快適になりました。<br />...,2013-02-10


## 特徴量のスケーリング

In [7]:
# Four samples with two features each, used to compare the scalers below.
data = [
    [-1, 2],
    [-0.5, 6],
    [0, 10],
    [1, 18],
]

### 正規化 Normalize

In [8]:
# Normalization: fit the per-column min/max on `data`, then rescale `data` with them.
scaler = MinMaxScaler().fit(data)
scaler.transform(data)

array([[0.  , 0.  ],
       [0.25, 0.25],
       [0.5 , 0.5 ],
       [1.  , 1.  ]])

### 標準化 Standardize

In [9]:
# Standardization: fit the per-column mean/std on `data`, then rescale `data` with them.
scaler = StandardScaler().fit(data)
scaler.transform(data)

array([[-1.18321596, -1.18321596],
       [-0.50709255, -0.50709255],
       [ 0.16903085,  0.16903085],
       [ 1.52127766,  1.52127766]])

## 特徴選択、特徴選択の実践

scriptを使用（`feature_selection.py`を新たに作成）