In [2]:
print('Hello')

Hello


In [4]:
import pandas as pd

path = '/Users/whovi/OneDrive/Documents/mon.csv'
df = pd.read_csv(path)

print(df.head)

<bound method NDFrame.head of        Total Packets  Incoming Packets  Outgoing Packets  Incoming Ratio  \
0             1421.0             121.0            1300.0        0.085151   
1              518.0              80.0             438.0        0.154440   
2             1358.0             118.0            1240.0        0.086892   
3             1446.0             122.0            1324.0        0.084371   
4             1406.0             115.0            1291.0        0.081792   
...              ...               ...               ...             ...   
18995         9434.0             619.0            8815.0        0.065614   
18996         9956.0             552.0            9404.0        0.055444   
18997         9952.0             579.0            9373.0        0.058179   
18998         9926.0             690.0            9236.0        0.069514   
18999         9925.0             757.0            9168.0        0.076272   

       Outgoing Ratio  Outgoing Std  Outgoing Mean  Packe

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score

# 특징과 라벨 분리
X = df.drop(columns=['Label'])
y = df['Label']

# 데이터 분할
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# 모델 초기화 및 학습
model = GradientBoostingClassifier(random_state=42)
model.fit(X_train, y_train)

# 검증 데이터 예측 및 정확도 측정
y_val_pred = model.predict(X_val)
val_accuracy = accuracy_score(y_val, y_val_pred)
print(f"Validation Accuracy: {val_accuracy:.4f}")

# 테스트 데이터 예측 및 정확도 측정
y_test_pred = model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_test_pred)
print(f"Test Accuracy: {test_accuracy:.4f}")

Validation Accuracy: 0.6621
Test Accuracy: 0.6611


In [8]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import GradientBoostingClassifier
from scipy.stats import randint

# 탐색할 하이퍼파라미터 범위 설정
param_distributions = {
    'n_estimators': [100, 200],
    'learning_rate': [0.05, 0.1],
    'max_depth': [3, 5],
    'min_samples_split': randint(2, 11),
    'min_samples_leaf': randint(1, 5)
}

# RandomizedSearchCV 설정 
random_search = RandomizedSearchCV(
    estimator=GradientBoostingClassifier(random_state=42),
    param_distributions=param_distributions,
    n_iter=10,  # 시도할 조합 수 설정
    cv=3,
    scoring='accuracy',
    n_jobs=-1,
    verbose=2,
    random_state=42
)

# 학습 및 결과 출력
random_search.fit(X_train, y_train)
print("Best Parameters:", random_search.best_params_)
print("Best Validation Accuracy:", random_search.best_score_)


Fitting 3 folds for each of 10 candidates, totalling 30 fits
Best Parameters: {'learning_rate': 0.05, 'max_depth': 3, 'min_samples_leaf': 4, 'min_samples_split': 6, 'n_estimators': 200}
Best Validation Accuracy: 0.6875197035614339


## 중요도 높은 순서 뽑아서 진행

In [11]:
# 특성 중요도 확인
importances = random_search.best_estimator_.feature_importances_
feature_importance_df = pd.DataFrame({
    'Feature': X_train.columns,
    'Importance': importances
}).sort_values(by='Importance', ascending=False)
print(feature_importance_df)

               Feature  Importance
6        Outgoing Mean    0.130847
0        Total Packets    0.106775
5         Outgoing Std    0.106460
14     Alternative Sum    0.092265
1     Incoming Packets    0.092206
13   Concentration Std    0.083499
11   Inter-arrival Std    0.073148
2     Outgoing Packets    0.055949
8    First 30 Incoming    0.041715
9    First 30 Outgoing    0.041237
12  Concentration Mean    0.038721
4       Outgoing Ratio    0.037895
7   Packets per Second    0.036640
10  Inter-arrival Mean    0.035309
3       Incoming Ratio    0.027334


In [14]:
# 중요도가 0.05 이상인 특성만 선택
important_features = feature_importance_df[feature_importance_df['Importance'] >= 0.04]['Feature']
X_train_selected = X_train[important_features]
X_test_selected = X_test[important_features]

params_distributions = {
    'n_estimators' : [100,200,300],
    'learning_rate' : [0.01, 0.05, 0.1],
    'max_depth': [3, 5, 7],
    'min_samples_split': randint(2, 20),
    'min_samples_leaf': randint(1, 10)
}

random_search = RandomizedSearchCV(
    estimator=GradientBoostingClassifier(random_state=42),
    param_distributions=param_distributions,
    n_iter=10,  # 시도할 조합 수 설정
    cv=3,
    scoring='accuracy',
    n_jobs=-1,
    verbose=2,
    random_state=42
)

# 하이퍼파라미터 튜닝 수행
random_search.fit(X_train_selected, y_train)

# 최적의 하이퍼파라미터 및 성능 출력
print("Best Parameters with Selected Features:", random_search.best_params_)
print("Best Validation Accuracy with Selected Features:", random_search.best_score_)

# 최적의 모델로 테스트 정확도 확인
best_model = random_search.best_estimator_
y_test_pred = best_model.predict(X_test_selected)
test_accuracy = accuracy_score(y_test, y_test_pred)
print(f"Test Accuracy with Best Parameters and Selected Features: {test_accuracy:.4f}")


Fitting 3 folds for each of 10 candidates, totalling 30 fits
Best Parameters with Selected Features: {'learning_rate': 0.05, 'max_depth': 5, 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 100}
Best Validation Accuracy with Selected Features: 0.6895394709965291
Test Accuracy with Best Parameters and Selected Features: 0.7105


## Randomforest Basic

In [10]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# 특징과 라벨 분리
X = df.drop(columns=['Label'])
y = df['Label']

# 데이터 분할 (train과 test로 80:20)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# RandomForest 모델 초기화 및 학습
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# 예측 및 정확도 측정
y_test_pred = model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_test_pred)
print(f"Test Accuracy: {test_accuracy:.4f}")

Test Accuracy: 0.7739


## n_iter를 늘리면 정확도 올라가는 추이

In [19]:
# 탐색할 하이퍼파라미터 범위 설정
param_distributions = {
    'n_estimators': [100, 200, 300, 400],
    'max_depth': [None, 10, 20, 30, 40],
    'min_samples_split': randint(2, 20),
    'min_samples_leaf': randint(1, 10),
    'max_features': ['sqrt', 'log2', None]
}

# RandomizedSearchCV 설정
random_search = RandomizedSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_distributions=param_distributions,
    n_iter=30,  # 시도할 조합 수 설정
    cv=3,
    scoring='accuracy',
    n_jobs=-1,
    verbose=2,
    random_state=42
)

# 학습
random_search.fit(X_train, y_train)

# 최적의 하이퍼파라미터와 검증 점수 출력
print("Best Parameters:", random_search.best_params_)
print("Best Validation Accuracy:", random_search.best_score_)

# 최적의 모델로 테스트 정확도 확인
best_model = random_search.best_estimator_
y_test_pred = best_model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_test_pred)
print(f"Test Accuracy with Best Parameters: {test_accuracy:.4f}")


Fitting 3 folds for each of 30 candidates, totalling 90 fits
Best Parameters: {'max_depth': 20, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 6, 'n_estimators': 200}
Best Validation Accuracy: 0.7540787374695594
Test Accuracy with Best Parameters: 0.7595
