In [None]:
# ================================ final_cleaned.csv ===================================================

In [1]:
# ✅ 1. 필수 라이브러리 설치 (Colab 첫 실행 시)
!pip install lightgbm scikit-learn joblib



In [2]:
# ✅ 2. 라이브러리 임포트
import pandas as pd
import numpy as np
import re
import math
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from lightgbm import LGBMClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import joblib
import os

In [3]:
# ✅ 3. 데이터 불러오기
df = pd.read_csv('final_cleaned.csv')

In [4]:
# ✅ 4. 기본 피처 생성
df['url_length'] = df['url'].apply(lambda x: len(str(x)))
df['count_dots'] = df['url'].apply(lambda x: str(x).count('.'))
df['count_digits'] = df['url'].apply(lambda x: sum(c.isdigit() for c in str(x)))
df['count_specials'] = df['url'].apply(lambda x: len(re.findall(r'[-_%=]', str(x))))

In [5]:
# ✅ 5. Shannon Entropy 계산 함수
def shannon_entropy(s):
    prob = [freq / len(s) for freq in Counter(s).values()]
    return -sum(p * math.log2(p) for p in prob) if len(s) > 0 else 0

df['url_entropy'] = df['url'].apply(shannon_entropy)

In [6]:
# ✅ 6. HTML 응답 여부 판단
df['url_available'] = (df['url_length'] > df['url_length'].quantile(0.1)).astype(int)

In [7]:
# ✅ 7. HTML 응답 가능한 URL만 필터링
df = df[df['url_available'] == 1].copy()

In [8]:
# ✅ 8. HTML 관련 피처 생성
df['html_entropy'] = df['url_entropy']
df['html_special_ratio'] = df['count_specials'] / df['url_length']
df['html_upper_ratio'] = df['url'].apply(lambda x: sum(c.isupper() for c in str(x)) / len(str(x)))

In [9]:
# ✅ 9. file_ext 인코딩
df['file_ext'] = df['url'].str.extract(r'\.([a-zA-Z0-9]+)$').fillna('none')
le = LabelEncoder()
df['file_ext'] = le.fit_transform(df['file_ext'])

In [10]:
# ✅ 10. 인코더 저장
os.makedirs('model', exist_ok=True)
joblib.dump(le, 'model/encoder.pkl')

['model/encoder.pkl']

In [11]:
# ✅ 11. 피처/레이블 정의
features = [
    'url_length', 'count_dots', 'count_digits', 'count_specials', 'file_ext',
    'url_entropy', 'html_entropy', 'html_special_ratio', 'html_upper_ratio'
]
X = df[features]
y = df['label']

In [12]:
# ✅ 12. 학습/테스트 분할
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=42
)

In [13]:

# ✅ 13. LightGBM 모델 정의 및 학습
model = LGBMClassifier(
    num_leaves=64,
    max_depth=10,
    learning_rate=0.05,
    n_estimators=500,
    min_child_samples=30,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_alpha=0.1,
    reg_lambda=0.1,
    random_state=42
)
model.fit(X_train, y_train)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.081668 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1722
[LightGBM] [Info] Number of data points in the train set: 446459, number of used features: 9
[LightGBM] [Info] Start training from score -0.743927
[LightGBM] [Info] Start training from score -1.672223
[LightGBM] [Info] Start training from score -1.087890


In [14]:
# ✅ 14. 평가
y_pred = model.predict(X_test)

print("📊 Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

print("\n📈 Classification Report:")
print(classification_report(y_test, y_pred, digits=4))

accuracy = accuracy_score(y_test, y_pred)
print(f"\n✅ Accuracy: {accuracy:.4f}")

📊 Confusion Matrix:
[[46836    76  6133]
 [  368 19807   789]
 [ 6529   435 30642]]

📈 Classification Report:
              precision    recall  f1-score   support

         0.0     0.8716    0.8829    0.8773     53045
         1.0     0.9748    0.9448    0.9596     20964
         2.0     0.8157    0.8148    0.8153     37606

    accuracy                         0.8716    111615
   macro avg     0.8874    0.8809    0.8840    111615
weighted avg     0.8722    0.8716    0.8718    111615


✅ Accuracy: 0.8716


In [None]:
# ================================ real_final.csv===================================================

In [15]:
# ✅ 1. 필수 라이브러리 설치 (Colab 첫 실행 시)
!pip install lightgbm scikit-learn joblib



In [16]:
# ✅ 2. 라이브러리 임포트
import pandas as pd
import numpy as np
import re
import math
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from lightgbm import LGBMClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import joblib
import os

In [17]:
# ✅ 3. 데이터 불러오기
df = pd.read_csv('real_final.csv')

  df = pd.read_csv('real_final.csv')


In [18]:
# ✅ 4. 기본 피처 생성
df['url_length'] = df['url'].apply(lambda x: len(str(x)))
df['count_dots'] = df['url'].apply(lambda x: str(x).count('.'))
df['count_digits'] = df['url'].apply(lambda x: sum(c.isdigit() for c in str(x)))
df['count_specials'] = df['url'].apply(lambda x: len(re.findall(r'[-_%=]', str(x))))

In [20]:
# ✅ 5. Shannon Entropy 계산 함수
def shannon_entropy(s):
    s = str(s) if pd.notnull(s) else ''
    prob = [freq / len(s) for freq in Counter(s).values()]
    return -sum(p * math.log2(p) for p in prob) if len(s) > 0 else 0


In [21]:
# ✅ 6. HTML 응답 여부 판단
df['url_available'] = (df['url_length'] > df['url_length'].quantile(0.1)).astype(int)

In [22]:
# ✅ 7. HTML 응답 가능한 URL만 필터링
df = df[df['url_available'] == 1].copy()

In [23]:
# ✅ 8. HTML 관련 피처 생성
df['html_entropy'] = df['url_entropy']
df['html_special_ratio'] = df['count_specials'] / df['url_length']
df['html_upper_ratio'] = df['url'].apply(lambda x: sum(c.isupper() for c in str(x)) / len(str(x)))

In [24]:
# ✅ 9. file_ext 인코딩
df['file_ext'] = df['url'].str.extract(r'\.([a-zA-Z0-9]+)$').fillna('none')
le = LabelEncoder()
df['file_ext'] = le.fit_transform(df['file_ext'])

In [25]:
# ✅ 10. 인코더 저장
os.makedirs('model', exist_ok=True)
joblib.dump(le, 'model/encoder.pkl')

['model/encoder.pkl']

In [26]:
# ✅ 11. 피처/레이블 정의
features = [
    'url_length', 'count_dots', 'count_digits', 'count_specials', 'file_ext',
    'url_entropy', 'html_entropy', 'html_special_ratio', 'html_upper_ratio'
]
X = df[features]
y = df['label']

In [27]:
# ✅ 12. 학습/테스트 분할
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=42
)

In [28]:

# ✅ 13. LightGBM 모델 정의 및 학습
model = LGBMClassifier(
    num_leaves=64,
    max_depth=10,
    learning_rate=0.05,
    n_estimators=500,
    min_child_samples=30,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_alpha=0.1,
    reg_lambda=0.1,
    random_state=42
)
model.fit(X_train, y_train)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.047515 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1709
[LightGBM] [Info] Number of data points in the train set: 490756, number of used features: 9
[LightGBM] [Info] Start training from score -0.658601
[LightGBM] [Info] Start training from score -1.765678
[LightGBM] [Info] Start training from score -1.166824


In [29]:
# ✅ 14. 평가
y_pred = model.predict(X_test)

print("📊 Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

print("\n📈 Classification Report:")
print(classification_report(y_test, y_pred, digits=4))

accuracy = accuracy_score(y_test, y_pred)
print(f"\n✅ Accuracy: {accuracy:.4f}")

📊 Confusion Matrix:
[[57267    59  6174]
 [  447 19771   771]
 [ 7133   462 30605]]

📈 Classification Report:
              precision    recall  f1-score   support

         0.0     0.8831    0.9018    0.8924     63500
         1.0     0.9743    0.9420    0.9579     20989
         2.0     0.8150    0.8012    0.8081     38200

    accuracy                         0.8774    122689
   macro avg     0.8908    0.8817    0.8861    122689
weighted avg     0.8775    0.8774    0.8773    122689


✅ Accuracy: 0.8774
