### 시각화 코드 실행 및 테스트

- 모든 모델 시각화 관련 코드 수행
- 출력된 시각화 결과는 `./data/images`에 저장한 뒤, README를 작성할 때 삽입

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from matplotlib import rcParams
import platform

import modules.DataSelect as DataSelect
import modules.DataAnalysis as DataAnalysis
import modules.ModelAnalysis as ModelAnalysis
import modules.DataModify as DataModify
from modules.DataModify import DataPreprocessing

import modules.Models as Models


In [2]:
import warnings

# Silence every warning category for the rest of the session to keep cell output clean.
# NOTE(review): this also hides deprecation/runtime warnings from pandas and matplotlib.
warnings.filterwarnings('ignore')

In [3]:
### Fix broken Korean glyphs and minus signs in matplotlib

# Font family to use per operating system; any other platform keeps the current font.
korean_fonts = {
    "Windows": 'Malgun Gothic',     # Windows
    "Darwin": 'AppleGothic',        # macOS
}

system = platform.system()

if system in korean_fonts:
    rcParams['font.family'] = korean_fonts[system]

# Draw the minus sign with the ASCII hyphen instead of the Unicode minus glyph
rcParams['axes.unicode_minus'] = False

In [4]:
### Load data

# The dataset is stored as two CSV parts
input_file_path1, input_file_path2 = './data/2022Data_part1.csv', './data/2022Data_part2.csv'

df_part1, df_part2 = (
    pd.read_csv(csv_path) for csv_path in (input_file_path1, input_file_path2)
)

# Stack the two parts row-wise and renumber the index from 0
df = pd.concat([df_part1, df_part2], ignore_index=True)

---

### 데이터 분석

In [5]:
# Summarize the combined frame: row count, column names, non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 528403 entries, 0 to 528402
Data columns (total 26 columns):
 #   Column                                                       Non-Null Count   Dtype 
---  ------                                                       --------------   ----- 
 0   Patient ID                                                   528403 non-null  int64 
 1   Age recode with <1 year olds and 90+                         528403 non-null  object
 2   Sex                                                          528403 non-null  object
 3   Year of diagnosis                                            528403 non-null  int64 
 4   Year of follow-up recode                                     528403 non-null  int64 
 5   Race recode (W, B, AI, API)                                  528403 non-null  object
 6   Site recode ICD-O-3/WHO 2008                                 528403 non-null  object
 7   Primary Site                                                 528403 non-nu

In [6]:
# Let pandas print up to 100 rows before truncating the output
pd.set_option('display.max_rows', 100)

# Print per-column value counts.
# NOTE(review): presumably columns with more than `boundary` distinct values are reported
# as "continuous" instead of being listed — confirm against DataAnalysis.show_value_counts.
DataAnalysis.show_value_counts(df, boundary=100)

Patient ID
continuous
--------------------
Age recode with <1 year olds and 90+
65-69 years    88255
70-74 years    79677
60-64 years    76845
75-79 years    59940
55-59 years    59814
50-54 years    45671
80-84 years    36673
45-49 years    24381
85-89 years    17826
40-44 years    13739
35-39 years     7615
90+ years       6887
30-34 years     4463
25-29 years     2543
20-24 years     1497
15-19 years     1066
10-14 years      539
01-04 years      509
00 years         241
05-09 years      222
Name: count, dtype: int64
--------------------
Sex
Female    268082
Male      260321
Name: count, dtype: int64
--------------------
Year of diagnosis
2022    92262
2021    67451
2020    44168
2019    41631
2018    34840
2017    30350
2016    26561
2015    23472
2014    20637
2013    17888
2012    16358
2011    14732
2010    13635
2009    12708
2008    11665
2007    10610
2006     9660
2005     8643
2004     7634
2003     6844
2002     6180
2001     5526
2000     4948
Name: count, dtype: int64
--

In [11]:
# Encode categorical data
def category_encoding(df, categories, encoding='label') :
    """Return an encoded copy of ``df`` and record the mapping in ``categories``.

    Parameters
    ----------
    df : pandas.DataFrame
        Source table; it is copied and never modified.
    categories : dict
        Output argument, filled in place. After the call it holds
        ``'encoding_type'`` plus one entry per encoded column:
        ``{original_value: label}`` for label encoding, or the list of
        generated dummy column names for one-hot encoding.
    encoding : str
        ``'label'`` for label encoding, ``'onehot'`` for one-hot encoding.

    Returns
    -------
    pandas.DataFrame
        The encoded table.

    Raises
    ------
    ValueError
        If ``encoding`` is neither ``'label'`` nor ``'onehot'``.
    """
    # Columns to encode are whatever DataSelect.return_cols reports as 'categorical'
    # with boundary=100.
    # NOTE(review): presumably "at most 100 distinct values" — confirm in DataSelect.
    target_cols = DataSelect.return_cols(df, 'categorical', boundary=100)

    result = df.copy()

    # Reject unsupported encodings before touching `categories`
    if encoding != 'label' and encoding != 'onehot':
        raise ValueError(f"알 수 없는 encoding_type: {encoding}")

    if encoding == 'label':
        categories['encoding_type'] = 'label'

        for column in target_cols:
            # Labels are assigned in order of first appearance in the column
            distinct = result[column].unique()
            value_to_label = dict(zip(distinct, range(len(distinct))))
            result[column] = result[column].map(value_to_label)
            categories[column] = value_to_label

        return result

    categories['encoding_type'] = 'onehot'

    for column in target_cols:
        # Replace the source column with its dummy columns, appended on the right
        indicator_frame = pd.get_dummies(result[column], prefix=column)
        remaining = result.drop(columns=[column])
        result = pd.concat([remaining, indicator_frame], axis=1)
        categories[column] = indicator_frame.columns.tolist()

    return result

# Decode data that was encoded by category_encoding
def category_decoding(df=None, categories=None):
    """Restore the original categorical values of an encoded DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        Encoded table. It is copied; the input is never modified.
    categories : dict
        Mapping recorded at encoding time. ``categories['encoding_type']`` is
        ``'label'`` or ``'onehot'``; every other key is a column name whose
        value is ``{original_value: label}`` (label encoding) or the list of
        dummy column names generated for that column (one-hot encoding).

    Returns
    -------
    pandas.DataFrame
        Decoded copy of ``df``. For one-hot data the dummy columns are removed
        and the restored column is appended; its values are strings taken from
        the dummy column names, or ``None`` for rows where no dummy is set.

    Raises
    ------
    ValueError
        If ``df`` or ``categories`` is ``None``, or if the recorded
        ``encoding_type`` is not ``'label'`` or ``'onehot'``.
    """
    # Fail with a clear message instead of an AttributeError on None
    if df is None:
        raise ValueError("디코딩할 DataFrame(df)이 필요합니다.")
    if categories is None:
        raise ValueError("디코딩에 사용할 categories 딕셔너리가 필요합니다.")

    df_decoded = df.copy()

    # The encoding type stored in `categories` selects the decoding strategy
    encoding_type = categories.get('encoding_type', None)
    column_specs = {
        col: spec for col, spec in categories.items() if col != 'encoding_type'
    }

    if encoding_type == 'label':    # label encoding
        for col, mapping in column_specs.items():
            # {original value: label} -> {label: original value}
            reverse_map = {label: original for original, label in mapping.items()}
            df_decoded[col] = df_decoded[col].map(reverse_map)

    elif encoding_type == 'onehot': # one-hot encoding
        for col, dummy_cols in column_specs.items():
            prefix = f"{col}_"
            # Strip only the leading "<col>_" prefix; a value that itself contains
            # "<col>_" must survive unchanged.
            labels = np.array(
                [name[len(prefix):] if name.startswith(prefix) else name
                 for name in dummy_cols],
                dtype=object,
            )

            # Vectorised lookup: for each row pick the first dummy column equal to 1
            hits = df_decoded[dummy_cols].to_numpy() == 1
            decoded = np.full(len(df_decoded), None, dtype=object)
            if hits.size:
                matched = hits.any(axis=1)
                decoded[matched] = labels[hits.argmax(axis=1)[matched]]

            df_decoded[col] = pd.Series(decoded, index=df_decoded.index, dtype=object)
            df_decoded = df_decoded.drop(columns=dummy_cols)

    else:
        raise ValueError(f"알 수 없는 encoding_type: {encoding_type}")

    return df_decoded

In [12]:
# Filled in place by category_encoding with the encoding type and the per-column mappings
categories={}

# Label-encode the categorical columns; `encoded` is a new DataFrame and `df` is left unchanged
encoded = category_encoding(df, categories=categories, encoding='label')

In [13]:
# Render the label-encoded frame to check the result visually
display(encoded)

Unnamed: 0,Patient ID,Age recode with <1 year olds and 90+,Sex,Year of diagnosis,Year of follow-up recode,"Race recode (W, B, AI, API)",Site recode ICD-O-3/WHO 2008,Primary Site,Primary Site - labeled,Derived Summary Grade 2018 (2018+),...,COD to site recode,Sequence number,Median household income inflation adj to 2023,Number of Cores Positive Recode (2010+),Number of Cores Examined Recode (2010+),EOD Primary Tumor Recode (2018+),PRCDA 2020,Survival months flag,Survival months,Vital status recode (study cutoff used)
0,671,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0059,0
1,812,0,1,1,0,0,1,1,1,1,...,0,1,0,0,0,1,0,0,0007,0
2,878,1,0,2,0,0,2,2,2,2,...,0,0,1,0,0,2,0,0,0218,0
3,1018,2,1,3,0,0,3,3,3,2,...,0,2,2,0,0,2,0,0,0134,0
4,1269,3,1,4,0,0,4,4,4,2,...,0,1,2,0,0,2,0,0,0187,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
528398,63452555,7,1,1,0,0,0,10,10,3,...,2,1,12,0,0,3,0,0,0002,1
528399,63452558,2,0,1,0,2,0,10,10,3,...,2,1,13,0,0,3,0,0,0002,1
528400,63452564,7,0,1,0,0,11,14,14,3,...,5,1,7,0,0,3,0,0,0002,1
528401,63452565,11,1,1,0,0,11,14,14,3,...,5,1,3,0,0,3,0,0,0000,1


In [14]:
# Inspect the encoding type and per-column mappings recorded during encoding
print(categories)

{'encoding_type': 'label', 'Age recode with <1 year olds and 90+': {'65-69 years': 0, '60-64 years': 1, '70-74 years': 2, '40-44 years': 3, '45-49 years': 4, '85-89 years': 5, '55-59 years': 6, '75-79 years': 7, '30-34 years': 8, '80-84 years': 9, '50-54 years': 10, '90+ years': 11, '35-39 years': 12, '20-24 years': 13, '01-04 years': 14, '10-14 years': 15, '15-19 years': 16, '25-29 years': 17, '00 years': 18, '05-09 years': 19}, 'Sex': {'Male': 0, 'Female': 1}, 'Year of diagnosis': {np.int64(2018): 0, np.int64(2022): 1, np.int64(2004): 2, np.int64(2011): 3, np.int64(2007): 4, np.int64(2020): 5, np.int64(2019): 6, np.int64(2021): 7, np.int64(2016): 8, np.int64(2010): 9, np.int64(2015): 10, np.int64(2003): 11, np.int64(2012): 12, np.int64(2005): 13, np.int64(2013): 14, np.int64(2017): 15, np.int64(2002): 16, np.int64(2009): 17, np.int64(2006): 18, np.int64(2014): 19, np.int64(2000): 20, np.int64(2001): 21, np.int64(2008): 22}, 'Year of follow-up recode': {np.int64(2022): 0}, 'Race recod

---

### 모델 분석