<a href="https://colab.research.google.com/github/cindyshin2211/Website_Fingerprinting_MLB/blob/%EC%8B%A0%EC%84%B1%ED%98%84/DecisionTree_RandomForest(monunmon)_final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 1. Decision Tree

In [1]:
# 필요한 라이브러리 불러오기
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, accuracy_score, f1_score
from sklearn.tree import DecisionTreeClassifier

# 데이터 로드
data = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/EWHA_machine_ learning/monunmon.csv")  # CSV 파일 경로

# 레이블 이진 분류를 위해 변환 (0~94 -> 1, -1 -> 0)
data['Label'] = data['Label'].apply(lambda x: 1 if x >= 0 else 0)

# 변환 결과 확인
print("Label Distribution:")
print(data['Label'].value_counts())

# 특성과 레이블 분리
X = data.drop(columns=['Label'])  # 피처 데이터
y = data['Label']  # 레이블 데이터

# Train, Validation, Test 데이터 나누기
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp)

# 데이터 정규화 (Logistic Regression은 정규화된 데이터에서 더 잘 작동)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)
X_test = scaler.transform(X_test)

# Decision Tree 모델 생성 및 학습
model = DecisionTreeClassifier(random_state=42)
model.fit(X_train, y_train)

# 검증 데이터 평가
y_val_pred = model.predict(X_val)
val_accuracy = accuracy_score(y_val, y_val_pred)
val_f1 = f1_score(y_val, y_val_pred)

print(f"Validation Accuracy: {val_accuracy:.2f}")
print(f"Validation F1 Score: {val_f1:.2f}")

# 테스트 데이터 평가
y_test_pred = model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_test_pred)
test_f1 = f1_score(y_test, y_test_pred)

print(f"Test Accuracy: {test_accuracy:.2f}")
print(f"Test F1 Score: {test_f1:.2f}")

# 분류 보고서 출력
print("\nClassification Report on Test Data:")
print(classification_report(y_test, y_test_pred))

Label Distribution:
Label
1    19000
0     3000
Name: count, dtype: int64
Validation Accuracy: 0.87
Validation F1 Score: 0.92
Test Accuracy: 0.87
Test F1 Score: 0.92

Classification Report on Test Data:
              precision    recall  f1-score   support

           0       0.52      0.49      0.50       450
           1       0.92      0.93      0.92      2850

    accuracy                           0.87      3300
   macro avg       0.72      0.71      0.71      3300
weighted avg       0.86      0.87      0.87      3300



In [4]:
#필요한 라이브러리 불러오기
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, accuracy_score, f1_score
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

# 데이터 로드
data = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/EWHA_machine_ learning/monunmon.csv")  # 데이터셋 경로 입력
data['Label'] = data['Label'].apply(lambda x: 1 if x >= 0 else 0)  # 이진 레이블 변환

# 특성과 레이블 분리
X = data.drop(columns=['Label'])  # 피처 데이터
y = data['Label']  # 레이블 데이터

# Train, Validation, Test 데이터 나누기
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp)

# 데이터 정규화
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)
X_test = scaler.transform(X_test)

### 방법 1: 클래스 가중치 적용 ###
print("=== DecisionTreeClassifier with Class Weights ===")
model_weighted =DecisionTreeClassifier(random_state=42, class_weight='balanced')
model_weighted.fit(X_train, y_train)

# 테스트 데이터 평가
y_test_pred = model_weighted.predict(X_test)
print("\nClassification Report on Test Data (Class Weights):")
print(classification_report(y_test, y_test_pred))

### 방법 2: SMOTE를 이용한 오버샘플링 ###
print("\n=== DecisionTreeClassifier with SMOTE ===")
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

print("Resampled Training Set Size (SMOTE):", X_train_smote.shape)

model_smote =DecisionTreeClassifier(random_state=42)
model_smote.fit(X_train_smote, y_train_smote)

# 테스트 데이터 평가
y_test_pred_smote = model_smote.predict(X_test)
print("\nClassification Report on Test Data (SMOTE):")
print(classification_report(y_test, y_test_pred_smote))

### 방법 3: 언더샘플링 ###
print("\n=== DecisionTreeClassifier with Undersampling ===")
undersampler = RandomUnderSampler(random_state=42)
X_train_under, y_train_under = undersampler.fit_resample(X_train, y_train)

print("Resampled Training Set Size (Undersampling):", X_train_under.shape)

model_under = DecisionTreeClassifier(random_state=42)
model_under.fit(X_train_under, y_train_under)

# 테스트 데이터 평가
y_test_pred_under = model_under.predict(X_test)
print("\nClassification Report on Test Data (Undersampling):")
print(classification_report(y_test, y_test_pred_under))

=== DecisionTreeClassifier with Class Weights ===

Classification Report on Test Data (Class Weights):
              precision    recall  f1-score   support

           0       0.51      0.45      0.48       450
           1       0.91      0.93      0.92      2850

    accuracy                           0.87      3300
   macro avg       0.71      0.69      0.70      3300
weighted avg       0.86      0.87      0.86      3300


=== DecisionTreeClassifier with SMOTE ===
Resampled Training Set Size (SMOTE): (26600, 15)

Classification Report on Test Data (SMOTE):
              precision    recall  f1-score   support

           0       0.42      0.57      0.48       450
           1       0.93      0.88      0.90      2850

    accuracy                           0.83      3300
   macro avg       0.67      0.72      0.69      3300
weighted avg       0.86      0.83      0.84      3300


=== DecisionTreeClassifier with Undersampling ===
Resampled Training Set Size (Undersampling): (4200, 15)

# Random Forest

In [5]:
# 필요한 라이브러리 불러오기
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, accuracy_score, f1_score
from sklearn.ensemble import RandomForestClassifier

# 데이터 로드
data = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/EWHA_machine_ learning/monunmon.csv")  # CSV 파일 경로

# 레이블 이진 분류를 위해 변환 (0~94 -> 1, -1 -> 0)
data['Label'] = data['Label'].apply(lambda x: 1 if x >= 0 else 0)

# 변환 결과 확인
print("Label Distribution:")
print(data['Label'].value_counts())

# 특성과 레이블 분리
X = data.drop(columns=['Label'])  # 피처 데이터
y = data['Label']  # 레이블 데이터

# Train, Validation, Test 데이터 나누기
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp)

# 데이터 정규화 (Logistic Regression은 정규화된 데이터에서 더 잘 작동)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)
X_test = scaler.transform(X_test)

# Random Forest 모델 생성 및 학습
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# 검증 데이터 평가
y_val_pred = model.predict(X_val)
val_accuracy = accuracy_score(y_val, y_val_pred)
val_f1 = f1_score(y_val, y_val_pred)

print(f"Validation Accuracy: {val_accuracy:.2f}")
print(f"Validation F1 Score: {val_f1:.2f}")

# 테스트 데이터 평가
y_test_pred = model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_test_pred)
test_f1 = f1_score(y_test, y_test_pred)

print(f"Test Accuracy: {test_accuracy:.2f}")
print(f"Test F1 Score: {test_f1:.2f}")

# 분류 보고서 출력
print("\nClassification Report on Test Data:")
print(classification_report(y_test, y_test_pred))

Label Distribution:
Label
1    19000
0     3000
Name: count, dtype: int64
Validation Accuracy: 0.91
Validation F1 Score: 0.95
Test Accuracy: 0.91
Test F1 Score: 0.95

Classification Report on Test Data:
              precision    recall  f1-score   support

           0       0.85      0.38      0.52       450
           1       0.91      0.99      0.95      2850

    accuracy                           0.91      3300
   macro avg       0.88      0.68      0.74      3300
weighted avg       0.90      0.91      0.89      3300



In [6]:
#필요한 라이브러리 불러오기
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, f1_score
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

# 데이터 로드
data = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/EWHA_machine_ learning/monunmon.csv")  # 데이터셋 경로 입력
data['Label'] = data['Label'].apply(lambda x: 1 if x >= 0 else 0)  # 이진 레이블 변환

# 특성과 레이블 분리
X = data.drop(columns=['Label'])  # 피처 데이터
y = data['Label']  # 레이블 데이터

# Train, Validation, Test 데이터 나누기
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp)

# 데이터 정규화
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)
X_test = scaler.transform(X_test)

### 방법 1: 클래스 가중치 적용 ###
print("=== RandomForest with Class Weights ===")
model_weighted = RandomForestClassifier(random_state=42, class_weight="balanced")
model_weighted.fit(X_train, y_train)

# 테스트 데이터 평가
y_test_pred = model_weighted.predict(X_test)
print("\nClassification Report on Test Data (Class Weights):")
print(classification_report(y_test, y_test_pred))

### 방법 2: SMOTE를 이용한 오버샘플링 ###
print("\n=== RandomForest with SMOTE ===")
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

print("Resampled Training Set Size (SMOTE):", X_train_smote.shape)

model_smote = RandomForestClassifier(random_state=42)
model_smote.fit(X_train_smote, y_train_smote)

# 테스트 데이터 평가
y_test_pred_smote = model_smote.predict(X_test)
print("\nClassification Report on Test Data (SMOTE):")
print(classification_report(y_test, y_test_pred_smote))

### 방법 3: 언더샘플링 ###
print("\n=== RandomForest with Undersampling ===")
undersampler = RandomUnderSampler(random_state=42)
X_train_under, y_train_under = undersampler.fit_resample(X_train, y_train)

print("Resampled Training Set Size (Undersampling):", X_train_under.shape)

model_under =  RandomForestClassifier(random_state=42)
model_under.fit(X_train_under, y_train_under)

# 테스트 데이터 평가
y_test_pred_under = model_under.predict(X_test)
print("\nClassification Report on Test Data (Undersampling):")
print(classification_report(y_test, y_test_pred_under))

=== RandomForest with Class Weights ===

Classification Report on Test Data (Class Weights):
              precision    recall  f1-score   support

           0       0.84      0.37      0.52       450
           1       0.91      0.99      0.95      2850

    accuracy                           0.90      3300
   macro avg       0.87      0.68      0.73      3300
weighted avg       0.90      0.90      0.89      3300


=== RandomForest with SMOTE ===
Resampled Training Set Size (SMOTE): (26600, 15)

Classification Report on Test Data (SMOTE):
              precision    recall  f1-score   support

           0       0.59      0.64      0.62       450
           1       0.94      0.93      0.94      2850

    accuracy                           0.89      3300
   macro avg       0.77      0.79      0.78      3300
weighted avg       0.89      0.89      0.89      3300


=== RandomForest with Undersampling ===
Resampled Training Set Size (Undersampling): (4200, 15)

Classification Report on Tes