In [None]:
# %tensorflow_version 2.x
# import tensorflow as tf
# device_name = tf.test.gpu_device_name()
# if device_name != '/device:GPU:0':
#   raise SystemError('GPU device not found')
# print('Found GPU at: {}'.format(device_name))

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Switch the working directory to the project's code folder so that the
# relative paths used later in this notebook (e.g. '../configs/config.yaml')
# are resolved from there.
import os
dir_path = '/content/drive/Othercomputers/macbook-air/TPS-Feb2022/code'
os.chdir(dir_path)

In [None]:
# Install extra packages (japanize-matplotlib, shap, umap-learn) with pip via the shell.
# NOTE(review): versions are not pinned, so the installed releases depend on when this is run.
! pip install japanize-matplotlib
! pip install shap
! pip install umap-learn

In [None]:
import glob
import sys,os
import json
import pprint
import time
import re
import datetime
import pickle
import string
import gc
import warnings
import yaml
import os
warnings.filterwarnings("ignore")
sys.path.append(os.pardir)
sys.path.append('../..')
sys.path.append('../../..')

import numpy as np
import pandas as pd
import pandas_profiling as pdp
import matplotlib.pyplot as plt
import japanize_matplotlib # 日本語対応
import seaborn as sns
# pandas display options: show up to 300 columns / rows, long cell contents,
# and floats with three decimal places
pd.set_option('display.max_columns', 300)
pd.set_option('display.max_rows', 300)
pd.set_option('display.max_colwidth', 5000)
pd.options.display.float_format = '{:.3f}'.format
%matplotlib inline
# sns.set_style('whitegrid')
plt.style.use('fivethirtyeight')

from joblib import Parallel, delayed # module for quick and easy parallel processing
from tqdm import tqdm, tqdm_notebook # progress bars
from PIL import Image
tqdm.pandas()  # registers tqdm's progress_apply helpers on pandas objects

# automatically reload external modules when they change
%load_ext autoreload
%autoreload 2


# Path definitions: read the directory names and the target encoding table
# from the project's YAML configuration file.
CONFIG_FILE = '../configs/config.yaml'

with open(CONFIG_FILE) as file:
    # yaml.load() without an explicit Loader raises TypeError on PyYAML >= 6
    # (and can construct arbitrary objects on older releases); safe_load parses
    # plain mappings / lists / scalars, which is all a settings file needs.
    yml = yaml.safe_load(file)
MODEL_DIR_NAME = yml['SETTING']['MODEL_DIR_NAME']
FEATURE_DIR_NAME = yml['SETTING']['FEATURE_DIR_NAME']
RAW_DIR_NAME = yml['SETTING']['RAW_DIR_NAME']
TARGET_ENCODING = yml['SETTING']['TARGET_ENCODING']

## 次元削減

In [None]:
# Run the project's reduce_dimension.py script in a separate Python process via the shell.
! python ../code/reduce_dimension.py

## 予測結果の取得

In [None]:
# Inverse lookup table of TARGET_ENCODING: encoded value -> original key.
REVERSE_TARGET_ENCODING = dict(
    (encoded, label) for label, encoded in TARGET_ENCODING.items()
)

In [None]:
# Read the raw train / test tables from RAW_DIR_NAME (train first, then test).
train, test = (
    pd.read_csv(RAW_DIR_NAME + csv_name) for csv_name in ('train.csv', 'test.csv')
)

# model_name = 'lgb_0211_0656'
# pred = pd.read_pickle(MODEL_DIR_NAME + model_name + f'/{model_name}-pred.pkl')
# pred[0] = pred[0].map(lambda x: REVERSE_TARGET_ENCODING[x])
# pred = pd.get_dummies(pred, prefix='', prefix_sep='')

In [None]:
# Multi-class classification: for each true class, the share of every predicted class.
# `pred` (one-hot predictions, one column per class) must be created beforehand,
# e.g. by loading a model's saved predictions. When it is missing, skip the plot
# with a message instead of aborting a full run with a NameError.
if 'pred' not in globals():
    print("`pred` is not defined; load a model's predictions first to draw this heatmap.")
else:
    true_and_pred = pd.concat([train['target'], pred], axis=1)
    # Only the first 100000 rows are aggregated.
    true_and_pred = true_and_pred.iloc[:100000, :].groupby('target').sum()
    # Normalise each row so that it sums to 1 (row = true class).
    true_and_pred = true_and_pred.apply(lambda x: x / x.sum(), axis=1)
    fig, ax = plt.subplots(figsize=(15, 10))
    sns.heatmap(true_and_pred, annot=True, ax=ax)

In [None]:
# Plot the values of the first training row, excluding the first and the last column.
plt.plot(train.iloc[0, 1:-1])

In [None]:
# Plot the values of the training row at position 100, excluding the first and the last column.
plt.plot(train.iloc[100, 1:-1])

## クラスター

In [None]:
# (Optional) data split: in debug mode work on a stratified sample of about
# 2000 rows so that the clustering / embedding cells below stay fast.
from sklearn.model_selection import train_test_split

debug = True
SEED = 42  # fixed seed so the sampled rows (and every plot built on them) are reproducible

if debug:
    test_size = 2000 / len(train)
    # Only the "test" part of the split is kept: it is the stratified sample.
    _, train_x, _, train_y = train_test_split(
        train.iloc[:, 1:-1],
        train['target'],
        test_size=test_size,
        stratify=train['target'],
        random_state=SEED,
    )
else:
    train_x, train_y = train.iloc[:, 1:-1], train['target']

# Target labels encoded through TARGET_ENCODING, and one tab10 colour per sample.
# (The name `encorded_targets` is kept as-is because later cells refer to it.)
encorded_targets = train_y.map(lambda x: TARGET_ENCODING[x]).values
true_colors = [plt.get_cmap("tab10")(x) for x in encorded_targets]

# Feature columns used by the cells below.
dfs = train_x

### kmeans

In [None]:
# Group the samples with k-means and give every cluster its own tab10 colour.
from sklearn.cluster import KMeans

k = 10
SEED = 42

kmeans_model = KMeans(n_clusters=k, random_state=SEED)
kmeans_model.fit(dfs.select_dtypes(exclude='object'))  # non-object columns only
labels = kmeans_model.labels_
kmeans_colors = [plt.get_cmap("tab10")(cluster_id) for cluster_id in labels]

In [None]:
# Elbow method (one way to choose the number of clusters):
# fit k-means for 1..num clusters and plot the inertia of every fit.
num = 20
distortions = []

for i in range(1, num + 1):
    km = KMeans(
        n_clusters=i, init='k-means++', n_init=10, max_iter=300, random_state=0
    ).fit(dfs.select_dtypes(exclude='object'))
    distortions.append(km.inertia_)  # inertia_ is available once the model is fitted

plt.plot(range(1, num + 1), distortions, marker='o')
plt.xlabel('Number of clusters')
plt.ylabel('Distortion')
plt.show()

In [None]:
# Silhouette method (another way to choose the number of clusters):
# for k = 2..num, cluster the data with k-means and draw one silhouette plot per k.
from sklearn.metrics import silhouette_samples
from matplotlib import cm

SEED = 42
num = 20
fig = plt.figure(figsize=(10,20))

for k in range(2, num+1):
    
    kmeans_num = KMeans(n_clusters=k, random_state=SEED).fit_predict(dfs.select_dtypes(exclude='object'))

    cluster_labels = np.unique(kmeans_num)       # distinct cluster ids present in kmeans_num
    n_clusters=cluster_labels.shape[0]     # number of distinct cluster ids found

    # compute the silhouette coefficient of every sample
    silhouette_vals = silhouette_samples(dfs.select_dtypes(exclude='object'),
                                     kmeans_num,metric='euclidean')  # sample data, cluster ids, Euclidean distance
    y_ax_lower, y_ax_upper= 0,0
    yticks = []

    # plot: grid with 3 columns, one panel per k (panel index k-1 because k starts at 2)
    plt.subplot((num//3+1), 3, k-1)
    
    for i,c in enumerate(cluster_labels):
        c_silhouette_vals = silhouette_vals[kmeans_num==c]      # silhouette values of the samples assigned to cluster c
        c_silhouette_vals.sort()
        y_ax_upper += len(c_silhouette_vals)              # running sample count gives the upper y bound of this cluster's bars
        color = cm.jet(float(i)/n_clusters)               # colour for this cluster
        plt.barh(range(y_ax_lower,y_ax_upper),            # horizontal bars, one y position per sample
                         c_silhouette_vals,               # bar length = the sample's silhouette coefficient
                         height=1.0,                      # bar thickness
                         edgecolor='none',                # no edge colour
                         color=color)                     # fill colour
        yticks.append((y_ax_lower+y_ax_upper)/2)          # tick position: middle of this cluster's band
        y_ax_lower += len(c_silhouette_vals)              # the next cluster's bars start above this one

    silhouette_avg = np.mean(silhouette_vals)                 # mean silhouette coefficient over all samples
    plt.axvline(silhouette_avg,color="red",linestyle="--")    # dashed line at the mean coefficient
    plt.title(f"cluster {k}")
    plt.yticks(yticks,cluster_labels + 1)                     # show cluster labels (shifted to start at 1)
    plt.ylabel('Cluster')
    plt.xlabel('silhouette coefficient')

plt.tight_layout() # adjust the spacing between panels so they do not overlap

### pca

In [None]:
from sklearn.decomposition import PCA

# Fit a PCA (all components) on the non-object columns of the first 300
# training rows, with the row_id column removed.
data = train.iloc[:300]
pca = PCA()
feature = pca.fit_transform(
    data.drop(columns=['row_id']).select_dtypes(exclude='object')
)

target = 'target'
data_size = len(data)
# data[target] = data[target].map(lambda x: TARGET_ENCODING[x]).values

# Plotting
# fig, axes = plt.subplots(1, 2, figsize=(6, 6))
# sns.scatterplot(feature[:data_size, 0], feature[:data_size, 1], alpha=0.8, hue=target, data=data, ax=axes[0,0])

# fig, axes = plt.subplots(1, 1, figsize=(6, 6))
# sns.lineplot([n for n in range(1, len(pca.explained_variance_ratio_)+1)], np.cumsum(pca.explained_variance_ratio_), markers=True, ax=axes)
# plt.show()

### t-sne & umap

In [None]:
# t-SNE: embed the feature table into 2-D and colour every point by its true class.
from sklearn.manifold import TSNE

fig, ax = plt.subplots(figsize=(6,6))

i = 10  # perplexity

# Set the title on the explicit axes rather than through the pyplot state machine.
ax.set_title(f"perplexity: {i}")

# t-SNE is stochastic; fixing random_state makes the embedding reproducible across runs.
# SEED is the notebook-wide seed defined in the clustering cells above.
model_tsne = TSNE(n_components=2, perplexity=i, random_state=SEED)
feature = model_tsne.fit_transform(dfs)

ax.scatter(feature[:, 0], feature[:, 1], alpha=0.8, color=true_colors)

## lda

In [None]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
# all_data = pd.concat([train, test])

# Fit a 2-component LDA on the features against the encoded targets
# (fit() returns the estimator itself, so it can be chained).
lda = LDA(n_components=2).fit(train_x, encorded_targets)

# lda.transform(b)
# test_x = lda.transform(test.iloc[:, 1:])

In [None]:
from keras.utils import np_utils

In [None]:
# One-hot encode the integer class labels; kept in a variable because a bare
# expression in the middle of a cell is neither displayed nor stored.
onehot_targets = np_utils.to_categorical(encorded_targets)
# predict() needs the feature matrix the model was fitted on (n_samples, n_features);
# passing the 1-D label vector raises a ValueError.
lda.predict(train_x)

In [None]:
# Install umap-learn with pip (the pip cell near the top of the notebook installs it as well).
!pip install umap-learn

In [None]:
# UMAP: embed the non-object feature columns into 2-D and colour the points by target class.
import umap.umap_ as umap
from scipy.sparse.csgraph import connected_components

fig, ax = plt.subplots(figsize=(10,10))

i = 10  # n_neighbors

ax.set_title(f"n_neighbors: {i}")

# The embedding has to be fitted here: otherwise `feature` still holds whatever an
# earlier cell left in it and the plot would not show a UMAP projection at all.
# random_state makes the result reproducible (SEED is the notebook-wide seed
# defined in the clustering cells above).
model_umap = umap.UMAP(n_components=2, n_neighbors=i, random_state=SEED)
feature = model_umap.fit_transform(dfs.select_dtypes(exclude='object'))

# x / y are passed by keyword: recent seaborn releases no longer accept them positionally.
sns.scatterplot(x=feature[:, 0], y=feature[:, 1], alpha=0.8, hue='target', data=pd.DataFrame(train_y), ax=ax)
# plt.show()


In [None]:
# ! python ../code/generate_feature.py

## 次元削減