# 웹 서버 로그 기반 악성 요청 탐지를 위한 분류 모델 구축

In [17]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report

In [2]:
# Load the web-server log records from the CSV file into a DataFrame.
df = pd.read_csv(filepath_or_buffer="web_server_logs_2.csv")
# Preview the first five rows as the cell output.
df.head(n=5)

Unnamed: 0,ip,timestamp,method,status_code,size,label
0,192.168.1.138,2024-11-30 09:26:01,OPTIONS,301,541,0
1,192.168.1.130,2024-11-30 12:00:34,OPTIONS,200,5239,0
2,192.168.1.75,2024-12-01 00:22:12,POST,200,5778,0
3,192.168.1.176,2024-11-30 19:33:26,DELETE,200,3911,0
4,10.0.0.113,2024-11-30 05:37:26,GET,200,7765,0


In [3]:
# Preprocessing: derive the extra columns used as model features.

# Parse the textual timestamps once, then keep only the hour of day.
parsed_timestamps = pd.to_datetime(df['timestamp'])
df['hour'] = parsed_timestamps.dt.hour

# Flag error responses: a status_code of 400 or above counts as an error
# (1 = error, 0 = not an error).
error_mask = df['status_code'].ge(400)
df['is_error'] = error_mask.astype(int)

# Preview the first five rows, now including the two derived columns.
df.head(n=5)

Unnamed: 0,ip,timestamp,method,status_code,size,label,hour,is_error
0,192.168.1.138,2024-11-30 09:26:01,OPTIONS,301,541,0,9,0
1,192.168.1.130,2024-11-30 12:00:34,OPTIONS,200,5239,0,12,0
2,192.168.1.75,2024-12-01 00:22:12,POST,200,5778,0,0,0
3,192.168.1.176,2024-11-30 19:33:26,DELETE,200,3911,0,19,0
4,10.0.0.113,2024-11-30 05:37:26,GET,200,7765,0,5,0


In [4]:
# Columns fed to the classifier as input features.
features = [
    'hour',
    'status_code',
    'size',
    'is_error',
]

# Separate the feature matrix from the target labels.
X = df.loc[:, features]
y = df['label']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
# NOTE(review): the split is not stratified on `label` -- consider whether
# stratify=y is wanted if the class balance matters.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [5]:
# Train a logistic-regression classifier on the training split, allowing the
# solver up to 1000 iterations. fit() returns the fitted estimator itself, so
# construction and training are chained.
model = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# Predict labels for the held-out test rows.
y_pred = model.predict(X_test)

In [19]:
# Evaluate the model: score the test-set predictions with each metric, in
# the order accuracy, precision, recall, F1.
accuracy, precision, recall, f1 = (
    scorer(y_test, y_pred)
    for scorer in (accuracy_score, precision_score, recall_score, f1_score)
)

# Report each summary metric on its own line.
print("정확도(Accuracy):", accuracy)
print("정밀도(Precision):", precision)
print("재현율(Recall):", recall)
print("F1_Score:", f1)

# Per-class precision / recall / F1 / support table.
report_text = classification_report(y_test, y_pred)
print("\n분류 보고서 (Classification Report):\n", report_text)

정확도(Accuracy): 0.8933333333333333
정밀도(Precision): 0.8181818181818182
재현율(Recall): 0.8526315789473684
F1_Score: 0.8350515463917526

분류 보고서 (Classification Report):
               precision    recall  f1-score   support

           0       0.93      0.91      0.92       205
           1       0.82      0.85      0.84        95

    accuracy                           0.89       300
   macro avg       0.87      0.88      0.88       300
weighted avg       0.89      0.89      0.89       300

