## 구글 드라이브 연동

In [None]:
# Mount Google Drive into the Colab runtime at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

# data handling

## 데이터 프레임 타입별로 분리 요약

In [None]:
# Group the column labels of df by dtype, then print one line per dtype
# followed by the labels that have it.
column_labels = df.columns.to_series()
a = column_labels.groupby(df.dtypes).groups
for dtype, labels in a.items():
    print(dtype, labels)

In [None]:
def resumetable(df):
    """Print the shape of ``df`` and return a one-row-per-column summary table.

    Args:
        df: DataFrame to summarise.

    Returns:
        DataFrame with one row per column of ``df`` and these columns:
        '피처' (column label), '데이터타입' (dtype), '결측치 개수' (number of
        nulls), '고윳값 개수' (``df.nunique()`` result) and '첫 번째 값' /
        '두 번째 값' / '세 번째 값' (values of the first three rows by
        position; None when ``df`` has fewer rows than that).
    """
    print(df.shape)

    # Build the table directly from arrays so the '피처' column is always
    # present, even when df.columns carries its own name.
    summary = pd.DataFrame({
        '피처': df.columns,
        '데이터타입': df.dtypes.values,
        '결측치 개수': df.isnull().sum().values,
        '고윳값 개수': df.nunique().values,
    })

    # Sample rows by position (iloc), not by label: df.loc[0] raises KeyError
    # whenever the index is not the default 0..n-1 range (e.g. filtered data).
    sample_columns = ('첫 번째 값', '두 번째 값', '세 번째 값')
    for position, column in enumerate(sample_columns):
        if position < len(df):
            summary[column] = df.iloc[position].values
        else:
            summary[column] = None

    return summary

## read_data

In [None]:
import glob
import os

# NOTE(review): Drive is mounted at '/content/drive' earlier in this notebook,
# but this path starts with '/content/content' - confirm the directory exists.
path = '/content/content/MyDrive/data/bike-sharing-demand'
files = glob.glob(os.path.join(path, '*.csv'))

# Route each CSV to its role. Only the file name is inspected, so a directory
# whose name happens to contain 'train' or 'test' cannot misroute every file.
# The key order keeps the original precedence: train, then test, then sample.
loaded = {}
for file in files:
    file_name = os.path.basename(file)
    for role in ('train', 'test', 'sample'):
        if role in file_name:
            loaded[role] = pd.read_csv(file)
            break

# Fail with a clear message instead of a NameError further down.
missing = [role for role in ('train', 'test', 'sample') if role not in loaded]
if missing:
    raise FileNotFoundError(
        f"No CSV found in {path!r} for: {', '.join(missing)}")

train, test, sub = loaded['train'], loaded['test'], loaded['sample']
train.shape, test.shape, sub.shape

# 시각화

## target에 대한 정규성 검증 qqplot 함수

In [None]:
# Shapiro-Wilk normality test
from scipy import stats
#qqplot 
from scipy.stats import probplot

def shapiro_qq_plot(df):
    '''
    Draw Q-Q plots with Shapiro test results for four versions of a target.

    The values are plotted as given (RAW) and after np.sqrt, np.log and
    np.log1p on a 2x2 grid. Each panel is titled with the transformation
    name; its x-label shows the Shapiro statistic and p-value rounded to
    three decimals.

    Args:
        df: the target values, e.g. df['target'].
            NOTE: np.log is applied as-is, so zero or negative values give
            -inf/NaN in the LOG panel.

    Returns:
        None. The figure is displayed with plt.show().
    '''
    import warnings

    # Silence warnings only while this function runs; the previous global
    # filterwarnings('ignore') kept every later warning in the session hidden.
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')

        f, axs = plt.subplots(2,2,figsize=(10,10))
        f.suptitle('shapiro_qq_plot', fontsize=25)

        # (values, panel title) for each transformation, in panel order.
        df_list = [(df,'RAW'), (np.sqrt(df),'SQRT'), (np.log(df),'LOG'), (np.log1p(df),'LOG1P')]

        for (values, label), ax in zip(df_list, axs.ravel()):
            # Q-Q plot against the standard normal distribution.
            probplot(values, dist=stats.norm(), plot=ax)

            # Panel title is the transformation name.
            ax.set_title(label, size=20)

            # Shapiro statistic and p-value, rounded, shown as the x-label.
            statics = np.round(stats.shapiro(values)[:],3)
            ax.set_xlabel(f'static:{statics[0]}, p-value: {statics[1]}',size=15)

        plt.tight_layout()
        plt.show()

## bar그래프에 수치 넣기 함수

In [None]:
def write_percent(ax, total_size):
    """Label each bar of ``ax`` with its height as a percentage of ``total_size``.

    Args:
        ax: Axes whose ``patches`` are the bars to annotate.
        total_size: Denominator of the percentage.

    Each label is centred horizontally on its bar and placed slightly above
    the bar top. Patches with a NaN height are skipped: they would otherwise
    produce a 'nan%' label at an undefined position.
    """
    from math import isnan

    # Small vertical gap between the bar top and its label.
    label_offset = total_size * 0.001

    for patch in ax.patches:
        height = patch.get_height()
        if isnan(height):
            continue

        width = patch.get_width()
        left_coord = patch.get_x()
        percent = height / total_size * 100

        ax.text(x = left_coord + width / 2.0,
                y = height + label_offset,
                s = f'{percent:1.1f}%',
                ha = 'center')

## gridspec 사용법

In [None]:
import matplotlib.gridspec as gridspec

# Prepare the canvas: font size, a 3x2 grid and the figure with its spacing.
mpl.rc('font', size = 12)
grid = gridspec.GridSpec(3, 2)
plt.figure(figsize=(10,16))
plt.subplots_adjust(wspace=0.4, hspace=0.3)

# Draw one subplot per feature.
# Named bin_features rather than `bin` so the builtin bin() is not shadowed.
bin_features = ['bin_0','bin_1','bin_2','bin_3','bin_4']

for idx, feature in enumerate(bin_features):
    ax = plt.subplot(grid[idx])

    # Count of each feature value, split by target.
    sns.countplot(x = feature,
                  data = df,
                  hue = 'target',
                  palette = 'pastel',
                  ax = ax)
    ax.set_title(f'{feature} Distribution by Target')
    # Annotate every bar with its share of all rows.
    write_percent(ax, len(df))

## pointplot target 비율

In [None]:
def get_crosstab(df, feature):
    """Return, per value of ``feature``, the percentage of rows in each target class.

    Args:
        df: DataFrame containing ``feature`` and a 'target' column.
        feature: Name of the column to cross-tabulate against 'target'.

    Returns:
        DataFrame with a ``feature`` column followed by one column per target
        value; each row's target columns are row-normalised and scaled to 0-100.
    """
    ratios = pd.crosstab(index=df[feature],
                         columns=df['target'],
                         normalize='index')
    return ratios.mul(100).reset_index()

def plot_pointplot(ax, feature, crosstab):
    """Overlay a point plot of crosstab column 1 on a secondary y-axis of ``ax``.

    Args:
        ax: Axes to overlay; a twin axes sharing its x-axis is created.
        feature: Name of the crosstab column used for the x positions.
        crosstab: Table with a ``feature`` column and a column labelled 1
            (presumably the target-1 ratio in percent, as built by
            get_crosstab - confirm against the caller).
    """
    ax2 = ax.twinx()  # new axes sharing the x-axis but with its own y-axis

    # Pass the twin axes explicitly instead of relying on pyplot's notion of
    # the "current" axes, which any intervening pyplot call could change.
    sns.pointplot(x=feature, y=1, data=crosstab,
                  order=crosstab[feature].values,  # x order follows the crosstab rows
                  color='black',
                  legend=False,                    # no legend
                  ax=ax2)
    ax2.set_ylim(crosstab[1].min()-5, crosstab[1].max()*1.1)  # y-axis range
    ax2.set_ylabel('Target 1 Ratio(%)')

def write_percent(ax, total_size):
    """Label each bar of ``ax`` with its height as a percentage of ``total_size``.

    Args:
        ax: Axes whose ``patches`` are the bars to annotate.
        total_size: Denominator of the percentage.

    Each label is centred horizontally on its bar and placed slightly above
    the bar top. Patches with a NaN height are skipped: they would otherwise
    produce a 'nan%' label at an undefined position.
    """
    from math import isnan

    # Small vertical gap between the bar top and its label.
    label_offset = total_size * 0.001

    for patch in ax.patches:
        height = patch.get_height()
        if isnan(height):
            continue

        width = patch.get_width()
        left_coord = patch.get_x()
        percent = height / total_size * 100

        ax.text(x = left_coord + width / 2.0,
                y = height + label_offset,
                s = f'{percent:1.1f}%',
                ha = 'center')

def plot_cat_dist_with_true_ratio(df, features, num_rows, num_cols,
                                  size=(15, 20)):
    """Plot, for each feature, a count plot with percent labels and a ratio overlay.

    Args:
        df: DataFrame holding the features (and the 'target' column that
            get_crosstab reads).
        features: Column names to plot, one subplot each.
        num_rows: Number of rows in the subplot grid.
        num_cols: Number of columns in the subplot grid.
        size: Figure size passed to plt.figure.
    """
    # Figure, subplot layout and spacing between subplots.
    plt.figure(figsize=size)
    layout = gridspec.GridSpec(num_rows, num_cols)
    plt.subplots_adjust(wspace=0.45, hspace=0.3)

    row_count = len(df)

    for slot, feature in enumerate(features):
        bar_ax = plt.subplot(layout[slot])

        # Cross-tabulation of the feature against the target.
        ratio_table = get_crosstab(df, feature)
        level_order = ratio_table[feature].values

        # Count plot of the feature, bars ordered like the crosstab rows.
        sns.countplot(x=feature,
                      data=df,
                      order=level_order,
                      color='skyblue',
                      ax=bar_ax)

        # Percent label on every bar, then the point plot on a twin axis.
        write_percent(bar_ax, row_count)
        plot_pointplot(bar_ax, feature, ratio_table)

        bar_ax.set_title(f'{feature} Distribution')