In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory


# Any results you write to the current directory are saved as output.

In [None]:
# Root directory of the competition data set.
BASE_PATH = '../input/prostate-cancer-grade-assessment'

# Sub-directories holding the training images and their label masks.
data_dir = '/'.join((BASE_PATH, 'train_images'))
mask_dir = '/'.join((BASE_PATH, 'train_label_masks'))

In [None]:
# Load the CSV tables shipped with the data set.
# The training table is keyed by image_id; the other two keep their default integer index.
train = pd.read_csv(BASE_PATH + '/train.csv')
train = train.set_index('image_id')
test = pd.read_csv(BASE_PATH + '/test.csv')
submission = pd.read_csv(BASE_PATH + '/sample_submission.csv')

In [None]:
# Preview the first rows of the training table, then print its shape and column cardinalities.
display(train.head())
print("Shape of training data :", train.shape)
# nunique(dropna=False) counts a missing value as its own category, i.e. the same number as len(unique()).
print("unique data provider :", train['data_provider'].nunique(dropna=False))
print("unique isup_grade(target) :", train['isup_grade'].nunique(dropna=False))
print("unique gleason_score :", train['gleason_score'].nunique(dropna=False))

In [None]:
# Number of missing values in each column of the training table.
train.isnull().sum(axis=0)

In [None]:
# Preview the first rows of the test table, then print its shape and provider cardinality.
display(test.head())
# The label names the test frame, because that is the frame whose shape is printed here.
print("Shape of test data :", test.shape)
print("unique data provider :", len(test.data_provider.unique()))

In [None]:
def plot_count(df, feature, title='', size=2):
    """Draw a count plot of one column and label every bar with its share of rows.

    Bars are ordered by descending frequency (the order of
    ``df[feature].value_counts()``). Above each bar the function writes the
    bar height as a percentage of ``len(df)``, formatted with two decimals.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing the column to plot.
    feature : str
        Name of the column whose values are counted.
    title : str, optional
        Title placed on the axes.
    size : int or float, optional
        Scale factor; the figure is created with ``figsize=(4*size, 3*size)``.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    # f is the Figure object, ax is the Axes object.
    f, ax = plt.subplots(1, 1, figsize=(4*size, 3*size))

    # Total number of rows; denominator for the percentages.
    total = float(len(df))

    # Build the chart. The column is passed by keyword (x=/data=) because
    # passing it positionally is deprecated in seaborn and later releases
    # reserve the first positional slot for `data`. ax=ax ties the plot to the
    # axes created above instead of relying on the "current axes" state.
    sns.countplot(x=feature, data=df,
                  order=df[feature].value_counts().index,
                  palette='Set2', ax=ax)
    ax.set_title(title)

    # ax.patches is the list of Rectangle objects (the bars of the chart).
    # The loop writes each bar's percentage into the figure.
    for p in ax.patches:
        height = p.get_height()
        # Skip patches with no data (NaN or zero height): they would be
        # labelled 'nan%' / '0.00%' and a NaN height gives a non-finite
        # text position.
        if not height > 0:
            continue
        ax.text(p.get_x() + p.get_width()/2.,
                height + 3,
                '{:1.2f}%'.format(100*height/total),
                ha="center")
    plt.show()

In [None]:
# Distribution of the data providers.
plot_count(
    df=train,
    feature='data_provider',
    title='data_provider count and %age plot',
)

In [None]:
# Distribution of the ISUP grade.
plot_count(
    df=train,
    feature='isup_grade',
    title='isup_grade count and %age plot',
)

In [None]:
# Distribution of the Gleason score (drawn on a larger figure, size=3).
plot_count(
    df=train,
    feature='gleason_score',
    title='gleason_score count and %age plot',
    size=3,
)

In [None]:
def plot_relative_distribution(df, feature, hue, title='', size=2):
    """Draw a grouped count plot of ``feature`` split by ``hue`` with percentage labels.

    Every bar is annotated with its height as a percentage of ``len(df)``
    (the whole table, not the individual ``feature`` group), formatted with
    two decimals.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing both columns.
    feature : str
        Column shown on the x axis.
    hue : str
        Column used to split each x-axis category into coloured bars.
    title : str, optional
        Title placed on the axes.
    size : int or float, optional
        Scale factor; the figure is created with ``figsize=(4*size, 3*size)``.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    f, ax = plt.subplots(1, 1, figsize=(4*size, 3*size))
    total = float(len(df))

    # hue names the column used to subdivide each x-axis category.
    # ax=ax ties the plot to the axes created above instead of relying on
    # the "current axes" state.
    sns.countplot(x=feature, hue=hue, data=df, palette='Set2', ax=ax)
    ax.set_title(title)

    for p in ax.patches:
        height = p.get_height()
        # A feature/hue combination with no rows can yield a patch whose
        # height is NaN (or zero). Labelling it would print 'nan%' / '0.00%'
        # and, for NaN, place text at a non-finite position, so skip it.
        if not height > 0:
            continue
        ax.text(p.get_x() + p.get_width()/2.,
                height + 3,
                '{:1.2f}%'.format(100*height/total),
                ha="center")
    plt.show()

In [None]:
# Relationship between the ISUP grade and the data provider.
plot_relative_distribution(
    df=train,
    feature='isup_grade',
    hue='data_provider',
    title='relative count plot of isup_grade with data_provider',
    size=2,
)

In [None]:
# Relationship between the Gleason score and the data provider.
plot_relative_distribution(
    df=train,
    feature='gleason_score',
    hue='data_provider',
    title='relative count plot of gleason_score with data_provider',
    size=3,
)


In [None]:
# Relationship between the ISUP grade and the Gleason score.
plot_relative_distribution(
    df=train,
    feature='isup_grade',
    hue='gleason_score',
    title='relative count plot of isup_grade with gleason_score',
    size=3,
)
