In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.impute import SimpleImputer

# Load the dataset
data = pd.read_csv('colon_cancer_data.csv')

# Print the first few rows as a quick sanity check
print(data.head())

# Use 'recurrence' as the target variable and every other column as a feature
X = data.drop('recurrence', axis=1)
y = data['recurrence']

# Split the feature columns by dtype.
# NOTE: columns of any other dtype (e.g. bool, category, int32) match neither
# selection and are dropped by ColumnTransformer's default remainder='drop'.
numerical_features = X.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X.select_dtypes(include=['object']).columns

# Numerical preprocessing: mean imputation followed by standardization
numerical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

# Categorical preprocessing: most-frequent imputation followed by one-hot
# encoding; categories unseen during fit are ignored at predict time
categorical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combine the numerical and categorical pipelines
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_pipeline, numerical_features),
        ('cat', categorical_pipeline, categorical_features)
    ])

# Chain preprocessing and the classifier so both are fitted on training data only
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42))
])

# Hold out 20% of the rows as the test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the model
model.fit(X_train, y_train)

# Predict on the held-out test set
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

print(f'정확도: {accuracy}')
print('혼동 행렬:')
print(conf_matrix)
print('분류 보고서:')
print(class_report)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

import tensorflow as tf
from tensorflow import keras

# data(.csv) load

In [None]:
# Case 1: a single sheet (all variables of each patient are stored in one sheet)
# If an encoding error occurs, open the CSV file and check its encoding.
csv_path = "C:/Users/nkm11/Desktop/AI_class/Gil/K-CURE대장암_샘플.csv"
df = pd.read_csv(csv_path, index_col=0)
# Template: df = pd.read_csv("./<directory>/<file_name>.csv", index_col=0)

In [None]:
# case2 : sheet 2개 이상일 경우


In [None]:
# Case 3: unlikely, but the data may be stored in a whitespace-separated text file
file_path = "./파일명.txt"
column_names = ["변수1 ", "변수2 "]
# The file has no header row, so the column names are supplied explicitly.
# The separator is a raw string: '\s' is not a valid escape sequence in a plain
# string literal and triggers a warning on recent Python versions.
df = pd.read_csv(file_path, header=None, names=column_names, sep=r'\s+')

In [None]:
# Inspect the loaded DataFrame
df
# Alternative: print(df.head())

# data info 확인

In [None]:
# Check the column names
df.columns

In [None]:
# Draw one box plot per numeric column.
# Only numeric columns are plotted because plt.boxplot cannot draw text columns.
numeric_columns = df.select_dtypes(include="number").columns

# Size the subplot grid from the number of columns instead of hard-coding 3x4,
# which fails as soon as there are more than 12 columns. The minimum of 3 rows
# keeps the original 3x4 / 20x16 layout for frames with up to 12 columns.
n_cols = 4
n_rows = max(3, -(-len(numeric_columns) // n_cols))  # ceiling division

plt.figure(figsize=(20, 16 * n_rows / 3))
for i, column in enumerate(numeric_columns):
    plt.subplot(n_rows, n_cols, i + 1)
    # Missing values have not been removed yet at this point; drop them per
    # column so the box statistics are not turned into NaN.
    plt.boxplot(df[column].dropna())
    plt.title(column)

plt.tight_layout()
plt.show()

In [None]:
# Check the number of samples per label (bar chart)
ax = sns.countplot(x="label", data=df)
ax.set_xlabel("Cancer or Not")
ax.set_ylabel("Count")
ax.set_title("Label")

In [None]:
# Check the number of samples per label (as numbers)
label_counts = df['변수명'].value_counts()
label_counts

In [None]:
# Count missing values per column.
# isnull must be called: `df.isnull.sum()` accesses `.sum` on the bound method
# itself and raises AttributeError.
df.isnull().sum()

In [None]:
# Remove every row that contains at least one missing value,
# then re-count missing values per column to confirm none remain
df = df.dropna(axis=0, how='any')
df.isna().sum()

# standardscaler

In [None]:
# Standardization: rescale every feature column with StandardScaler
scaler = StandardScaler()

# Every column except '변수' is treated as a feature
feature_df = df.drop('변수', axis=1)
scaled_data_standard = scaler.fit_transform(feature_df)

# Rebuild a DataFrame using the columns and index of the frame that was actually
# scaled. `df.columns[:-1]` is only correct when '변수' is the last column;
# otherwise the scaled values are silently given the wrong column names.
scaled_df_standard = pd.DataFrame(
    scaled_data_standard,
    columns=feature_df.columns,
    index=feature_df.index,
)
scaled_df_standard

In [None]:
# Feature matrix: the standardized features as a NumPy array
x = scaled_df_standard
X = x.values

# Target vector taken from the same frame (and therefore the same rows) as X.
# Without this, the later train/test split would pick up the `y` left over from
# the first cell, which comes from a different DataFrame (`data`).
# NOTE(review): assumes '변수' (the column excluded from scaling) is the target
# column - confirm and replace the placeholder name if needed.
y = df['변수'].values

X

# Prediction model

In [None]:
import tensorflow as tf

from tensorflow import keras
from keras.layers import Dense
from keras.models import Sequential
from keras.optimizers import SGD,Adam
from tensorflow.keras.layers import Input
from tensorflow.keras.layers import Conv1D
from keras.layers import Dense, Flatten, Dropout
from sklearn.model_selection import train_test_split

In [None]:
# Split the data into training (80%) and test (20%) subsets with a fixed seed
split_parts = train_test_split(X, y, random_state=42, test_size=0.2)
X_train, X_test, y_train, y_test = split_parts

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

In [None]:
from sklearn.svm import SVC

# Fit a support vector classifier (fit returns the estimator itself)
clf_svm = SVC(random_state=0).fit(X_train, y_train)

# Predict the test set and report accuracy and the confusion matrix
pred_svm = clf_svm.predict(X_test)

print("\n--- SVM Classifier ---")
print(
    accuracy_score(y_test, pred_svm),
    confusion_matrix(y_test, pred_svm),
    sep="\n",
)

In [None]:
# Fit a logistic regression classifier (fit returns the estimator itself)
clf_lr = LogisticRegression(random_state=0).fit(X_train, y_train)

# Predict the test set and report accuracy and the confusion matrix
pred_lr = clf_lr.predict(X_test)

print("\n--- Logistic Regression Classifier ---")
print(
    accuracy_score(y_test, pred_lr),
    confusion_matrix(y_test, pred_lr),
    sep="\n",
)

In [None]:
# Fit a decision tree classifier (fit returns the estimator itself)
clf_dt = DecisionTreeClassifier(random_state=0).fit(X_train, y_train)

# Predict the test set and report accuracy and the confusion matrix
pred_dt = clf_dt.predict(X_test)

print("\n--- Decision Tree Classifier ---")
print(
    accuracy_score(y_test, pred_dt),
    confusion_matrix(y_test, pred_dt),
    sep="\n",
)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

# Fit a random forest classifier, predict the test set, and report accuracy
# and the confusion matrix. The header previously read "Radom Forest".
print("\n--- Random Forest ---")
rf_clf = RandomForestClassifier(random_state=0)
rf_clf.fit(X_train, y_train)
pred = rf_clf.predict(X_test)
print(accuracy_score(y_test, pred))
print(confusion_matrix(y_test, pred))

In [None]:
# Fit a multi-layer perceptron classifier (fit returns the estimator itself)
clf_nn = MLPClassifier(random_state=0).fit(X_train, y_train)

# Predict the test set and report accuracy and the confusion matrix
pred_nn = clf_nn.predict(X_test)

print("\n--- Neural Network Classifier ---")
print(
    accuracy_score(y_test, pred_nn),
    confusion_matrix(y_test, pred_nn),
    sep="\n",
)