<a href="https://colab.research.google.com/github/Naoki0424/TPS_202110/blob/develop/tensorflow_decision_forests.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Environmental Setting

In [None]:
!pip3 install tensorflow_decision_forests --upgrade

In [None]:
!pip install wandb -qqq

# Library Import

In [None]:
import os
import wandb
import logging
import datetime
import warnings
import gc

import numpy as np
import pandas as pd


from tqdm import tqdm_notebook
import matplotlib.pyplot as plt
import seaborn as sns
color = sns.color_palette()
%matplotlib inline

import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls

from sklearn.metrics import mean_squared_error
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.metrics import classification_report
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split

warnings.filterwarnings('ignore')

import tensorflow_decision_forests as tfdf

# Global seaborn look for every plot drawn after this cell.
sns.set_style('whitegrid')

# Two-colour palette, packaged as keyword arguments so plotting calls can
# splat it in with **sns_params.
_two_tone_palette = sns.color_palette(["#2a9d8f", "#e9c46a"])
sns_params = dict(palette=_two_tone_palette)

# W&B Setting

In [None]:
# Label of the Kaggle secret that holds the W&B API key. The help message
# below is built from this constant so the two can never disagree.
WANDB_SECRET_LABEL = "api_key"

try:
    # Presumably only importable inside a Kaggle kernel; anywhere else the
    # import fails and we fall through to anonymous W&B logging below.
    from kaggle_secrets import UserSecretsClient
    user_secrets = UserSecretsClient()
    secret_value_0 = user_secrets.get_secret(WANDB_SECRET_LABEL)
    wandb.login(key=secret_value_0)
    anony = None
except Exception:
    # Best-effort fallback: any failure to obtain or use the key switches to
    # anonymous mode. `Exception` (not a bare `except`) is caught so that
    # KeyboardInterrupt / SystemExit still propagate.
    anony = "must"
    print(f'If you want to use your W&B account, go to Add-ons -> Secrets and provide your W&B access token. Use the Label name as {WANDB_SECRET_LABEL}. \nGet your W&B access token from here: https://wandb.ai/authorize')

# Run metadata passed to every wandb.init(...) call in this notebook.
CONFIG = dict(competition = 'TPSOctober',_wandb_kernel = 'tensorgirl')

# Load Data

In [None]:
from google.colab import drive

# Mount Google Drive; the CSV paths read in the next cell point below this
# mount ("drive/My Drive/...").
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

In [None]:
# Both competition CSVs sit in the same Drive folder. Only the first
# `_row_limit` rows of each file are read, so every statistic and model below
# is computed on this sample, not on the full data.
_data_dir = 'drive/My Drive/Colab Notebooks/TPS_202110/data'
_row_limit = 3000

train = pd.read_csv(_data_dir + '/train.csv', nrows=_row_limit)
test = pd.read_csv(_data_dir + '/test.csv', nrows=_row_limit)

In [None]:
# Display (rows, columns) of the loaded test frame as the cell output.
test.shape

# Observations

There are no missing values in either the train or the test dataset.
The train set consists of 1,000,000 rows, and the test set consists of 500,000 rows.
The binary features are f22, f43, and f242–f284; the rest of the features are continuous.


Source : https://www.kaggle.com/subinium/tps-oct-simple-eda

In [None]:
# Summary statistics for the feature columns f0..f284 (label-based slice, both
# ends inclusive), rendered with a colour gradient for quick scanning.
feature_summary = train.loc[:, 'f0':'f284'].describe()
feature_summary.style.background_gradient(cmap='Pastel1')

In [None]:
# Filled density estimate of the target column on a wide canvas.
target_kde_figure = plt.figure(figsize=(15, 7))
sns.kdeplot(data=train, x="target", fill=True, color="#2a9d8f", ax=target_kde_figure.gca())


In [None]:
# Class balance of the target, computed from the frame that was actually
# loaded instead of hard-coded counts, so the chart stays correct when the
# number of rows read from train.csv changes.
target_counts = train["target"].value_counts().sort_index()

plt.figure(figsize=(15, 7))
plt.pie(
    target_counts.values,
    labels=[str(label) for label in target_counts.index],
    autopct='%1.1f%%',
    colors=["#2a9d8f", "#e9c46a"],
)

In [None]:
#code copied from https://www.kaggle.com/subinium/tps-oct-simple-eda

# 11 x 11 grid: one small density panel for each of the features f0..f120.
fig, axes = plt.subplots(11,11,figsize=(12, 12))
axes = axes.flatten()
# Sets the default colour cycle; this also affects plots drawn by later cells.
sns.set_palette(sns.color_palette(["#2a9d8f", "#e9c46a"]))

for idx, ax in enumerate(axes):
    # `palette` only takes effect together with `hue`; with a single curve the
    # line colour has to be given through `color`.
    sns.kdeplot(data=train, x=f'f{idx}', ax=ax, color="#2a9d8f")
    # Strip ticks and axis labels: the panels are too small for them, the
    # feature name goes into the title instead.
    ax.set(xticks=[], yticks=[], xlabel='', ylabel='')
    ax.spines['left'].set_visible(False)
    ax.set_title(f'f{idx}', loc='right', weight='bold', fontsize=10)

fig.tight_layout()
plt.show()

In [None]:
#code copied from https://www.kaggle.com/craigmthomas/tps-oct-2021-eda

# Binary features: f22, f43 and f242..f284 (45 columns in total).
cat_features = ["f22", "f43"]
cat_features.extend(["f{}".format(x) for x in range(242, 285)])

# Size the grid from the feature list (ceiling division) so that no feature is
# silently left out; a fixed 11 x 4 grid only has room for 44 of the 45.
n_cols = 4
n_rows = -(-len(cat_features) // n_cols)
fig, axs = plt.subplots(n_rows, n_cols, figsize=(n_cols * 4, n_rows * 3), squeeze=False, sharey=True)
flat_axes = axs.flatten()

for ax, feature in zip(flat_axes, cat_features):
    # Count rows per (feature value, target) pair. reset_index(name=...) names
    # the count column explicitly instead of relying on the default column
    # name produced by to_frame(), which differs between pandas versions.
    counts = (
        train[[feature, "target"]]
        .value_counts()
        .sort_index()
        .reset_index(name="# of Samples")
    )
    sns.barplot(x=feature, y="# of Samples", hue="target", data=counts, ax=ax, **sns_params)
    # Label the panel being drawn; plt.xlabel would label the current axes,
    # which is not this panel.
    ax.set_xlabel(feature)

# Hide the grid cells left over after the last feature.
for unused_ax in flat_axes[len(cat_features):]:
    unused_ax.set_visible(False)

fig.tight_layout()
plt.show()

_ = gc.collect()

# W & B Artifacts

In [None]:
# Save train data to W&B Artifacts
# run = wandb.init(project='TPSOctober', name='training_data', anonymous=anony,config=CONFIG) 
# artifact = wandb.Artifact(name='training_data',type='dataset')
# artifact.add_file("drive/My Drive/Colab Notebooks/TPS_202110/data/train.csv")

# wandb.log_artifact(artifact)
# wandb.finish()

# Logging to W & B environment

In [None]:
# Log the target distribution plot to W&B.
title = "Distribution of Target Feature"
run = wandb.init(project='TPSOctober', name=title,anonymous=anony,config=CONFIG)
try:
    # sns.kdeplot returns the matplotlib Axes it drew on; that object is what
    # gets logged under the run title.
    fig = sns.kdeplot(train["target"] , color = "#E4916C")
    wandb.log({title: fig})
finally:
    # Always close the run, even if plotting or logging fails, so a
    # half-finished run is not left active in the kernel.
    wandb.finish()

# Tensorflow Decision Forests

In [None]:
# Split the labelled data into a training part and a hold-out part.

# Peek at the data
# train.head(1)

# Number of columns in the raw frame
print('列数：{}'.format(len(train.columns)))

# Target variable
y = train['target']
# Explanatory variables: every column except the label. Selecting by name
# instead of a fixed positional slice guarantees the label can never end up in
# X, whatever the column count or order of the CSV.
# NOTE(review): if the frame has an `id` column it is still part of X and will
# be used as a feature - confirm that this is intended.
X = train.drop(columns=['target'])

# 70 / 30 split with a fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=111)

In [None]:
# Sanity-check the split: number of feature columns and number of training rows.
print('列数：{}'.format(len(X_train.columns)))
# len(y_train.shape) is the number of dimensions (always 1 for a Series), not
# a count of anything useful; report the number of training rows instead.
print('件数：{}'.format(len(y_train)))

In [None]:
# Training data: features and label side by side in one frame, with the label
# column identified by name.
labelled_train_df = pd.concat([X_train, y_train], axis=1)
train_ds = tfdf.keras.pd_dataframe_to_tf_dataset(labelled_train_df, label="target")

# Hold-out data: features only, no label column.
test_ds = tfdf.keras.pd_dataframe_to_tf_dataset(X_test)

# Build a random forest with the library's default settings and train it.
model = tfdf.keras.RandomForestModel()
model.fit(train_ds)

# Predict on the hold-out features.
output = model.predict(test_ds)

In [None]:
# np.round(output).astype('int')

In [None]:
# Check how well the model did on the hold-out set.

# Turn the raw model output into hard integer labels by rounding, flattened to
# 1-D so it lines up with y_test.
y_pred = np.round(output).astype('int').ravel()

# classification_report takes (y_true, y_pred) in that order; passing them the
# other way round swaps precision and recall in the printed table.
print(classification_report(y_test, y_pred))

In [None]:
# Histogram of the raw model outputs, with the legend suppressed.
output_frame = pd.DataFrame(output)
sns.histplot(output_frame, legend=False)