In [12]:
import numpy as np
from sklearn.metrics import mean_squared_error

# Root mean squared error (RMSE)
def rmse(y_true, y_pred):
    """Return the root mean squared error between targets and predictions.

    Args:
        y_true: Ground-truth values, forwarded to ``mean_squared_error``.
        y_pred: Predicted values, forwarded to ``mean_squared_error``.

    Returns:
        The square root of ``mean_squared_error(y_true, y_pred)``.
    """
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)

# Root mean squared logarithmic error (RMSLE)
def rmsle(y_true: np.ndarray, y_pred: np.ndarray):
    """Return the root mean squared logarithmic error.

    Both inputs are passed through ``np.log1p`` (``log(1 + x)``) before the
    mean squared error is taken, so the error is measured on a log scale.

    Args:
        y_true: Ground-truth values.
        y_pred: Predicted values.

    Returns:
        The square root of the mean squared error of the log1p-transformed
        inputs.
    """
    log_true = np.log1p(y_true)
    log_pred = np.log1p(y_pred)
    return np.sqrt(mean_squared_error(log_true, log_pred))

In [13]:
# Prepare the data: one target and two predictions that miss it by the same
# absolute amount (400), once from below and once from above.
y_true = np.array([1000, 1000])
y_pred_low = np.array([600, 600])
y_pred_high = np.array([1400, 1400])

# Print RMSE for the over-estimate first, then for the under-estimate.
print('RMSE')
print(*(rmse(y_true, pred) for pred in (y_pred_high, y_pred_low)), sep='\n')

print('--------------------')

# Print RMSLE in the same order.
print('RMSLE')
print(*(rmsle(y_true, pred) for pred in (y_pred_high, y_pred_low)), sep='\n')

RMSE
400.0
400.0
--------------------
RMSLE
0.3361867670217862
0.5101598447800129


In [15]:
# Small target: the prediction is off by 500, i.e. 50% of the true value.
y_true = np.array([1000, 1000])
y_pred = np.array([1500, 1500])

print(f'RMSLE: {rmsle(y_true, y_pred)}')


# Large target: the same absolute miss of 500 is only 0.5% of the true value.
# NOTE: y_true / y_pred keep these values after the cell finishes, and the
# sklearn.metrics cells further down read them.
y_true = np.array([100000, 100000])
y_pred = np.array([100500, 100500])

print(f'RMSLE: {rmsle(y_true, y_pred)}')

RMSLE: 0.40513205231824134
RMSLE: 0.004987491760291007


In [21]:
# Import the submodule explicitly: a bare `import sklearn` is not guaranteed to
# load `sklearn.metrics`, which the following cells access as an attribute.
# This still binds the top-level name `sklearn`, so those cells are unchanged.
import sklearn.metrics

In [22]:
# MSLE for whatever y_true / y_pred currently hold. With the values left by the
# RMSLE example above, the result shown is the square of the second RMSLE.
sklearn.metrics.mean_squared_log_error(y_true, y_pred, sample_weight=None, multioutput='uniform_average')

2.4875074058970692e-05

In [23]:
# MAE (mean absolute error) on the same y_true / y_pred
sklearn.metrics.mean_absolute_error(y_true, y_pred, sample_weight=None, multioutput='uniform_average')

500.0

In [24]:
# R^2 (coefficient of determination)
# NOTE(review): if the cells ran in order, y_true is constant here (both
# entries are 100000), so this looks like a degenerate case and the 0.0 shown
# says little about fit quality - confirm against the r2_score documentation.
sklearn.metrics.r2_score(y_true, y_pred, sample_weight=None, multioutput='uniform_average')

0.0

In [25]:
# accuracy
# NOTE(review): y_true / y_pred are still the continuous regression values from
# the RMSLE example, not class labels; the 0.0 shown is consistent with no
# prediction matching its target exactly.
sklearn.metrics.accuracy_score(y_true, y_pred, normalize=True, sample_weight=None)

0.0

In [27]:
# precision
# sklearn.metrics.precision_score(y_true, y_pred, labels=None, pos_label=1, average='binary', sample_weight=None, zero_division='warn')

In [28]:
# recall
# sklearn.metrics.recall_score(y_true, y_pred, labels=None, pos_label=1, average='binary', sample_weight=None, zero_division='warn')

In [29]:
# f1 score
# sklearn.metrics.f1_score(y_true, y_pred, labels=None, pos_label=1, average='binary', sample_weight=None, zero_division='warn')

In [30]:
# f beta score
# sklearn.metrics.fbeta_score(y_true, y_pred, beta, labels=None, pos_label=1, average='binary', sample_weight=None, zero_division='warn')

In [31]:
# log loss
import numpy as np
import math

def logloss(true_label, predicted, eps=1e-15):
    """Binary log loss (cross-entropy) of predicted probabilities.

    Works element-wise, so it accepts either a single label/probability pair
    or array-likes of labels and probabilities of matching (broadcastable)
    shape.

    Args:
        true_label: Ground-truth label(s). A value equal to 1 is treated as
            the positive class; any other value as the negative class.
        predicted: Predicted probability (or probabilities) of the positive
            class.
        eps: Probabilities are clipped into ``[eps, 1 - eps]`` before the
            logarithm is taken.

    Returns:
        A Python ``float`` when the inputs are scalars, otherwise a NumPy
        array holding one loss per element.
    """
    # Clip into [eps, 1 - eps] so log() never receives exactly 0.
    p = np.clip(predicted, eps, 1 - eps)

    # Positive samples are scored with -log(p), negatives with -log(1 - p).
    # Both branches are finite for every element because p has been clipped.
    is_positive = np.asarray(true_label) == 1
    losses = -np.where(is_positive, np.log(p), np.log(1 - p))

    # Keep returning a plain float for scalar input, as before.
    if np.ndim(losses) == 0:
        return float(losses)
    return losses


In [32]:
# Positive sample predicted with probability 0.9 -> loss is -log(0.9).
logloss(true_label=1, predicted=0.9)

0.10536051565782628

In [33]:
# Positive sample predicted with probability 0.5 -> loss is -log(0.5).
logloss(true_label=1, predicted=0.5)

0.6931471805599453

In [34]:
# Negative sample (label 0) predicted with probability 0.2 -> loss is -log(1 - 0.2).
logloss(true_label=0, predicted=0.2)

0.2231435513142097

In [36]:
# log loss
# sklearn.metrics.log_loss(y_true, y_pred, eps=1e-15, normalize=True, sample_weight=None, labels=None)

In [37]:
# auc
# sklearn.metrics.roc_auc_score(y_true, y_score, average='macro', sample_weight=None, max_fpr=None, multi_class='raise', labels=None)

In [38]:
# multi-class accuracy
# sklearn.metrics.accuracy_score(y_true, y_pred, normalize=True, sample_weight=None)

In [40]:
# multi class log loss
import numpy as np
from sklearn.metrics import log_loss

# True class index of each of the three samples: [class 1, class 2, class 3]
y_true = np.array([0, 1, 2])

# Predicted probabilities per sample, columns are [class 1, class 2, class 3]
y_pred = np.array([[0.55, 0.45, 0.00],
                   [0.85, 0.00, 0.15],
                   [0.25, 0.75, 0.00]])

# NOTE(review): the second and third samples assign probability 0.00 to their
# true class, which is consistent with the very large loss shown below.
log_loss(y_true, y_pred)

23.225129930192328

In [42]:
# mean f1, macro f1, micro f1
# sklearn.metrics.f1_score(y_true, y_pred, labels=None, pos_label=1, average='binary', sample_weight=None, zero_division='warn')

In [43]:
import numpy as np
from sklearn.metrics import f1_score

# Multi-label targets: sample 1 carries labels {1, 2}, sample 2 carries {1} and
# sample 3 carries {1, 2, 3}. They are written as a binary indicator matrix
# with one column per label. The label lists have different lengths, so
# building a NumPy array from them directly would fail on recent NumPy
# versions, and that value was never used anyway.
y_true = np.array([[1, 1, 0],
                   [1, 0, 0],
                   [1, 1, 1]])
y_pred = np.array([[1, 0, 1],
                   [0, 1, 0],
                   [1, 0, 1]])
# The three scores differ only in the `average` mode passed to f1_score.
print('Mean-F1 :', f1_score(y_true, y_pred, average='samples'))
print('Macro-F1 :', f1_score(y_true, y_pred, average='macro'))
print('Micro-F1 :', f1_score(y_true, y_pred, average='micro'))

Mean-F1 : 0.43333333333333335
Macro-F1 : 0.48888888888888893
Micro-F1 : 0.5454545454545454


In [44]:
# quadratic weighted kappa
# sklearn.metrics.cohen_kappa_score(y1, y2, labels=None, weights=None, sample_weight=None)

In [45]:
from sklearn.metrics import cohen_kappa_score

# Two label sequences of equal length over the classes 0, 1 and 2.
y_true = [2, 0, 2, 2, 0, 1]
y_pred = [0, 0, 2, 2, 0, 2]

# Cohen's kappa with weights='quadratic', i.e. the quadratic weighted kappa.
cohen_kappa_score(y_true, y_pred, weights='quadratic')

0.5454545454545454

In [47]:
import pandas as pd
import pandas_profiling

# Load the Titanic training set; the path is relative to the working directory.
train = pd.read_csv('titanic/train.csv')

In [48]:
# NOTE(review): `profile_report` is presumably added to DataFrames by the
# `pandas_profiling` import in the previous cell - verify before removing it.
train.profile_report()



In [49]:
# mnist
from tensorflow.keras.datasets import mnist

# Load the MNIST dataset into NumPy arrays
(x_train, y_train), (x_test, y_test) = mnist.load_data()

# Print row 6 (index 5) of the first image
print(x_train[0][5])

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/mnist.npz
[  0   0   0   0   0   0   0   0   0   0   0   0   3  18  18  18 126 136
 175  26 166 255 247 127   0   0   0   0]


In [50]:
# Scale every pixel by 1/255 and show row 6 (index 5) of the first image.
(x_train / 255.0)[0][5]

array([0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.01176471, 0.07058824, 0.07058824,
       0.07058824, 0.49411765, 0.53333333, 0.68627451, 0.10196078,
       0.65098039, 1.        , 0.96862745, 0.49803922, 0.        ,
       0.        , 0.        , 0.        ])

In [51]:
# Standardization (zero mean, unit variance)
import numpy as np

xmean = x_train.mean() # mean over every value in x_train (no axis given)
xstd = np.std(x_train) # standard deviation over the same values
# Standardize the training data and show row 6 (index 5) of the first image
((x_train - xmean)/xstd)[0][5]

array([-0.42407389, -0.42407389, -0.42407389, -0.42407389, -0.42407389,
       -0.42407389, -0.42407389, -0.42407389, -0.42407389, -0.42407389,
       -0.42407389, -0.42407389, -0.38589016, -0.1949715 , -0.1949715 ,
       -0.1949715 ,  1.17964286,  1.30692197,  1.80331049, -0.09314822,
        1.68875929,  2.82154335,  2.71972006,  1.19237077, -0.42407389,
       -0.42407389, -0.42407389, -0.42407389])

In [52]:
# 対数変換
import numpy as np

# Sample values spanning several orders of magnitude.
x = [1.0, 10.0, 100.0, 1000.0, 10000.0]
# Apply log(1 + v) element-wise; the cell displays the resulting array.
np.log1p(x)

array([0.69314718, 2.39789527, 4.61512052, 6.90875478, 9.21044037])

In [53]:
# label encoding
from sklearn.preprocessing import LabelEncoder

# Categorical values to encode; 'A1', 'A2' and 'A3' each appear twice.
data = ['A1', 'A2', 'A3', 'B1', 'B2', 'B3', 'A1', 'A2', 'A3']
le = LabelEncoder() # create the LabelEncoder
le.fit(data) # fit the encoder on the data (this is where the classes are learned)
print(le.classes_) # show the classes the encoder learned

['A1' 'A2' 'A3' 'B1' 'B2' 'B3']


In [54]:
# Encode the data with the fitted encoder and print the integer codes.
print(le.transform(data))

[0 1 2 3 4 5 0 1 2]


In [56]:
# one-hot-encoding
from sklearn.preprocessing import OneHotEncoder

# NumPy array of the categories to encode
df = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

# Keep the encoder's default (sparse) output and densify it explicitly below.
# The keyword that used to request a dense result, `sparse=False`, was renamed
# to `sparse_output` in scikit-learn 1.2 and later removed, so passing either
# spelling would tie this cell to a particular version range.
ohe = OneHotEncoder()

# The encoder expects 2-D input, hence the reshape into a single column;
# .toarray() converts the sparse result into a regular NumPy array for display.
print(ohe.fit_transform(df.reshape(-1, 1)).toarray())

[[1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]]


In [57]:
# One-hot encode categorical data given as text
data = np.array(['A1', 'A2', 'A3', 'B1', 'B2', 'B3', 'A1', 'A2', 'A3'])
# Use the default (sparse) output and densify explicitly: `sparse=False` was
# renamed to `sparse_output` in scikit-learn 1.2 and later removed, whereas
# .toarray() on the default output works regardless of version.
ohe = OneHotEncoder()

print(ohe.fit_transform(data.reshape(-1, 1)).toarray())

[[1. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 1.]
 [1. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0.]]


In [58]:
# テキストデータの前処理
from sklearn.feature_extraction.text import CountVectorizer

# Four short example documents used as the corpus.
corpus = [
            'This is the first document.',
            'This document is the second document.',
            'And this is the third one.',
            'Is this the first document?'
]

vectorizer = CountVectorizer()

# Run Bag-of-Words and get the transformed matrix
X = vectorizer.fit_transform(corpus)

# The return value is a scipy.sparse matrix, so convert it to a NumPy array for display
X.toarray()

array([[0, 1, 1, 1, 0, 0, 1, 0, 1],
       [0, 2, 0, 1, 0, 1, 1, 0, 1],
       [1, 0, 0, 1, 1, 0, 1, 1, 1],
       [0, 1, 1, 1, 0, 0, 1, 0, 1]])

In [59]:
# Show the learned vocabulary: each token mapped to an integer index.
vectorizer.vocabulary_

{'this': 8,
 'is': 3,
 'the': 6,
 'first': 2,
 'document': 1,
 'second': 5,
 'and': 0,
 'third': 7,
 'one': 4}

In [60]:
# n-gram
from sklearn.feature_extraction.text import CountVectorizer

# Same four example documents as in the Bag-of-Words cell.
corpus = [
            'This is the first document.',
            'This document is the second document.',
            'And this is the third one.',
            'Is this the first document?'
]

vectorizer = CountVectorizer(
                                analyzer='word', # build N-grams from words
                                ngram_range=(2, 2) # use 2-grams only
)

# Get the transformed matrix
X = vectorizer.fit_transform(corpus)

# The return value is a scipy.sparse matrix, so convert it to a NumPy array for display
X.toarray()

array([[0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0],
       [0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0],
       [1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0],
       [0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1]])

In [61]:
# Show the learned 2-gram vocabulary: each word pair mapped to an integer index.
vectorizer.vocabulary_

{'this is': 11,
 'is the': 3,
 'the first': 6,
 'first document': 2,
 'this document': 10,
 'document is': 1,
 'the second': 7,
 'second document': 5,
 'and this': 0,
 'the third': 8,
 'third one': 9,
 'is this': 4,
 'this the': 12}

In [65]:
# tf-idf
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Same four example documents as in the previous text cells.
corpus = [
            'This is the first document.',
            'This document is the second document.',
            'And this is the third one.',
            'Is this the first document?'
]

vectorizer = CountVectorizer() # raw term counts, kept for comparison
transformer = TfidfVectorizer() # TF-IDF weights, fitted on the raw text directly

# Get the transformed matrices
tf = vectorizer.fit_transform(corpus)
tfidf = transformer.fit_transform(corpus)

# The return value is a scipy.sparse matrix, so convert it to a NumPy array for display
tfidf.toarray()

array([[0.        , 0.46979139, 0.58028582, 0.38408524, 0.        ,
        0.        , 0.38408524, 0.        , 0.38408524],
       [0.        , 0.6876236 , 0.        , 0.28108867, 0.        ,
        0.53864762, 0.28108867, 0.        , 0.28108867],
       [0.51184851, 0.        , 0.        , 0.26710379, 0.51184851,
        0.        , 0.26710379, 0.51184851, 0.26710379],
       [0.        , 0.46979139, 0.58028582, 0.38408524, 0.        ,
        0.        , 0.38408524, 0.        , 0.38408524]])

In [66]:
# Raw term counts from the CountVectorizer, for comparison with the TF-IDF matrix.
tf.toarray()

array([[0, 1, 1, 1, 0, 0, 1, 0, 1],
       [0, 2, 0, 1, 0, 1, 1, 0, 1],
       [1, 0, 0, 1, 1, 0, 1, 1, 1],
       [0, 1, 1, 1, 0, 0, 1, 0, 1]])

In [68]:
# embedding
from gensim.models import word2vec

# Same four example documents as in the previous text cells.
corpus = [
            'This is the first document.',
            'This document is the second document.',
            'And this is the third one.',
            'Is this the first document?'
]

# Turn each sentence into a list of tokens by splitting on whitespace.
# Punctuation stays attached and case is preserved, so 'document.',
# 'document?' and 'This' are separate vocabulary entries.
sentence = [d.split() for d in corpus]

# Train the model. gensim 4.0 renamed the embedding-dimension keyword from
# `size` to `vector_size`, and each release rejects the other spelling with a
# TypeError. Try the current name first and fall back to the old one so the
# cell runs on either major version.
try:
    model = word2vec.Word2Vec(
        sentence,
        vector_size=10,
        min_count=1,
        window=2,
    )
except TypeError:
    model = word2vec.Word2Vec(
        sentence,
        size=10,
        min_count=1,
        window=2,
    )

# Look up the learned vector for the token 'This' (case-sensitive).
model.wv['This']

array([ 0.00195455,  0.04841953, -0.04152495,  0.0363373 ,  0.04004268,
       -0.02447906,  0.03691939,  0.00689007, -0.04414196, -0.00370985],
      dtype=float32)

In [69]:
# Look up the learned vector for the token 'is'.
model.wv['is']

array([-0.0054072 ,  0.00474548,  0.03214339, -0.03145517, -0.02716509,
        0.03210245,  0.01056504, -0.0053102 ,  0.02436956,  0.02892346],
      dtype=float32)

In [70]:
# List the tokens ranked as most similar to 'document', with their scores.
model.wv.most_similar('document')

[('the', 0.6424844861030579),
 ('document.', 0.1540016531944275),
 ('this', 0.12448224425315857),
 ('document?', 0.12419357150793076),
 ('Is', 0.061624035239219666),
 ('second', -0.040286123752593994),
 ('And', -0.043175455182790756),
 ('first', -0.10677105188369751),
 ('This', -0.14978046715259552),
 ('is', -0.1907077133655548)]

chapter02.ipynb
[34mhouse-prices-advanced-regression-techniques[m[m/
[34mtitanic[m[m/
