# Bot Detection ML Pipeline

This notebook analyzes server logs to detect bot traffic patterns and trains a machine learning model to distinguish bots from humans.

In [None]:
import pandas as pd
import numpy as np
import re
import os
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

# Load the sample log file, or fall back to a reproducible synthetic dataset.
log_path = 'data/sample_log.txt'

# Apache common/combined access-log line:
#   ip - - [timestamp] "METHOD endpoint HTTP/x.y" status bytes ["referer"] "user-agent"
# The referer field is optional: when two quoted fields follow the byte count
# the first one (the referer) is skipped, so the captured group is always the
# user agent rather than whichever quoted field happens to come first.
LOG_LINE_RE = re.compile(
    r'(\S+) - - \[(.*?)\] "(\S+) (.*?) HTTP/\d\.\d" (\d+) \S+(?: "[^"]*")? "([^"]*)"'
)


def parse_log_line(line):
    """Parse one access-log line into a row of the ``data`` frame.

    Args:
        line: A single raw line from the log file.

    Returns:
        ``[ip, method, status, user_agent, endpoint, timestamp]`` with
        ``status`` converted to ``int`` and every other field left as the
        matched string, or ``None`` when the line does not match
        ``LOG_LINE_RE``.
    """
    match = LOG_LINE_RE.match(line)
    if match is None:
        return None
    ip, ts, method, endpoint, status, agent = match.groups()
    return [ip, method, int(status), agent, endpoint, ts]


if not os.path.exists(log_path):
    print("Sample log not found, generating synthetic dataset...")
    n_rows = 1000
    # Seeded generator so the fallback data is identical on every run.
    rng = np.random.default_rng(42)
    data = pd.DataFrame({
        'ip': rng.choice(['35.185.0.156', '45.133.1.23', '194.168.1.45', '185.220.1.10', '66.249.66.1'], n_rows),
        'method': rng.choice(['GET', 'POST', 'PUT', 'DELETE', 'PATCH'], n_rows, p=[0.6, 0.2, 0.05, 0.05, 0.1]),
        'status': rng.choice([200, 404, 429, 500], n_rows, p=[0.7, 0.15, 0.1, 0.05]),
        'user_agent': rng.choice(['Mozilla/5.0', 'python-requests', 'curl/7.68', 'Googlebot', 'Safari/537.36'], n_rows),
        'endpoint': rng.choice(['/', '/api', '/login', '/admin', '/search'], n_rows),
        'timestamp': pd.date_range(start='2023-01-01', periods=n_rows, freq='min'),
    })
else:
    # Basic regex parsing for Apache-like logs (customize LOG_LINE_RE as needed).
    # errors='replace' keeps one stray non-UTF-8 byte from aborting the load.
    with open(log_path, 'r', encoding='utf-8', errors='replace') as f:
        # Lines that do not match the pattern are skipped.
        rows = [row for row in map(parse_log_line, f) if row is not None]
    # NOTE(review): 'timestamp' stays a raw string here, while the synthetic
    # branch produces datetimes; nothing in this cell converts it.
    data = pd.DataFrame(rows, columns=['ip', 'method', 'status', 'user_agent', 'endpoint', 'timestamp'])

print("Loaded data shape:", data.shape)
data.head()

In [None]:
# Feature engineering: collapse the request log into one row per client IP.
def _distinct_count(values):
    """Return how many distinct entries the group contains."""
    return len(set(values))


def _not_found_count(statuses):
    """Return how many entries in the group equal 404."""
    return (statuses == 404).sum()


per_ip = data.groupby('ip')
features = per_ip.agg(**{
    'unique_methods': ('method', _distinct_count),
    '404_errors': ('status', _not_found_count),
    'unique_endpoints': ('endpoint', 'nunique'),
    'unique_agents': ('user_agent', 'nunique'),
    # Requests per IP, counted over the endpoint column.
    'request_count': ('endpoint', 'count'),
})

# Heuristic labeling: an IP is marked as a bot (1) when ANY rule fires,
# otherwise as human (0).
heavy_traffic = features['request_count'] > 200
many_not_found = features['404_errors'] > 50
# NOTE(review): this rule flags every IP that only ever sent one user agent,
# which goes beyond "high errors or requests" -- confirm it is intended.
single_agent = features['unique_agents'] == 1
features['label'] = np.where(heavy_traffic | many_not_found | single_agent, 1, 0)

features.head()

In [None]:
# Train/test split: hold out 30% of the per-IP rows for evaluation.
# NOTE(review): 'label' is computed from request_count, 404_errors and
# unique_agents, all of which remain in X, so the scores below measure how
# well the forest reproduces that rule rather than real bot detection.
y = features['label']
X = features.drop(columns='label')
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# fit() returns the estimator, so construction and training are chained.
model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)
y_pred = model.predict(X_test)

# Text report of per-class metrics first, then the confusion matrix.
report = classification_report(y_test, y_pred)
print(report)
matrix = confusion_matrix(y_test, y_pred)
print(matrix)

In [None]:
# Feature importance: one horizontal bar per input column of the fitted
# forest, sorted ascending so the largest importance is drawn last.
importances = pd.Series(model.feature_importances_, index=X.columns)
ranked = importances.sort_values()
ax = ranked.plot.barh(figsize=(8, 6))
ax.set_title("Feature Importance for Bot Detection")
plt.show()