In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df = pd.read_excel(r"C:\Users\User\Downloads\parsed_log_data.xlsx")
df.head()

Unnamed: 0,IP,Date_Time,Request_Type,API,Status_Code,Byte,Referrer,UA_String,Response_Time
0,233.223.117.90,27/Dec/2037:12:00:00 +0530,DELETE,/usr/admin,502,4963,-,Mozilla/5.0 (Windows NT 10.0; Win64; x64) Appl...,45
1,162.253.4.179,27/Dec/2037:12:00:00 +0530,GET,/usr/admin/developer,200,5041,http://www.parker-miller.org/tag/list/list/pri...,Mozilla/5.0 (Windows NT 10.0; Win64; x64) Appl...,3885
2,252.156.232.172,27/Dec/2037:12:00:00 +0530,POST,/usr/register,404,5028,-,Mozilla/5.0 (Windows NT 10.0; Win64; x64) Appl...,3350
3,182.215.249.159,27/Dec/2037:12:00:00 +0530,PUT,/usr/register,304,4936,http://www.parker-miller.org/tag/list/list/pri...,Mozilla/5.0 (Android 10; Mobile; rv:84.0) Geck...,767
4,160.36.208.51,27/Dec/2037:12:00:00 +0530,POST,/usr,304,4979,http://www.parker-miller.org/tag/list/list/pri...,Mozilla/5.0 (Linux; Android 10; ONEPLUS A6000)...,84


In [3]:
df.isna().sum()

IP               0
Date_Time        0
Request_Type     0
API              0
Status_Code      0
Byte             0
Referrer         0
UA_String        0
Response_Time    0
dtype: int64

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 407327 entries, 0 to 407326
Data columns (total 9 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   IP             407327 non-null  object
 1   Date_Time      407327 non-null  object
 2   Request_Type   407327 non-null  object
 3   API            407327 non-null  object
 4   Status_Code    407327 non-null  int64 
 5   Byte           407327 non-null  int64 
 6   Referrer       407327 non-null  object
 7   UA_String      407327 non-null  object
 8   Response_Time  407327 non-null  int64 
dtypes: int64(3), object(6)
memory usage: 28.0+ MB


In [5]:
df.nunique()

IP               407299
Date_Time             1
Request_Type          4
API                   5
Status_Code           7
Byte                423
Referrer              2
UA_String            10
Response_Time      5000
dtype: int64

In [6]:
df.describe()

Unnamed: 0,Status_Code,Byte,Response_Time
count,407327.0,407327.0,407327.0
mean,373.753181,4999.465503,2499.562455
std,103.039086,49.963237,1440.750265
min,200.0,4764.0,1.0
25%,303.0,4966.0,1255.0
50%,403.0,5000.0,2502.0
75%,500.0,5033.0,3745.0
max,502.0,5233.0,5000.0


Common criteria for identifying suspicious activities in web server logs might include:

Unusually high frequency of requests from a single IP address.
Requests to APIs that are not commonly accessed.
Abnormally high or low byte sizes.
Uncommon referrers or user agents.
High response times.
Unusual status codes (e.g., a large number of 500-series errors).

In [7]:
datetime_format = '%d/%b/%Y:%H:%M:%S %z'
df['Date_Time'] = pd.to_datetime(df['Date_Time'], format=datetime_format, errors='coerce')

Date_Time
2037-12-27 12:00:00+05:30    407327
Name: count, dtype: int64

In [8]:
df['request_count'] = df.groupby('IP')['IP'].transform('count')
df['request_count'].value_counts()

request_count
1    407271
2        56
Name: count, dtype: int64

In [9]:
api_counts = df['API'].value_counts()
df['api_frequency'] = df['API'].map(api_counts)


In [10]:

response_time_threshold = df['Response_Time'].quantile(0.99)
df['high_response_time'] = df['Response_Time'] > response_time_threshold
byte_99th = df['Byte'].quantile(0.95)
df['abnormal_byte'] = df['Byte'] > byte_99th
error_status_codes = [400, 401, 403, 404, 500, 502, 503, 504]
df['error_status_code'] = df['Status_Code'].isin(error_status_codes)

In [11]:
common_referrers = df['Referrer'].value_counts().index[:10]  # Top 10 referrers
common_user_agents = df['UA_String'].value_counts().index[:10]  # Top 10 user agents

df['uncommon_referrer'] = ~df['Referrer'].isin(common_referrers)
df['uncommon_user_agent'] = ~df['UA_String'].isin(common_user_agents)

In [12]:
# Suspcious activities by response time and bytes
# df['suspicious'] = 0
# df.loc[(df['Response_Time'] > response_time_threshold) | (df['Byte'] > byte_99th), 'suspicious'] = 1

# df['suspicious'].value_counts()

In [13]:
# using STD
# Calculate the mean and standard deviation of the response time
# response_time_mean = df['Response_Time'].mean()
# response_time_std = df['Response_Time'].std()

# # Set the threshold as mean + 2 standard deviations
# response_time_threshold = response_time_mean + 2 * response_time_std
# df['suspicious'] = 0
# df.loc[df['Response_Time'] > response_time_threshold, 'suspicious'] = 1

# (df['suspicious'].value_counts())

In [14]:
# print(f"High request count: {(df['request_count'] > response_time_threshold).sum()}")
# print(f"Low API frequency: {(df['api_frequency'] < df['api_frequency'].quantile(0.01)).sum()}")
# print(f"Abnormal byte size: {df['abnormal_byte'].sum()}")
# print(f"Uncommon referrer: {df['uncommon_referrer'].sum()}")
# print(f"Uncommon user agent: {df['uncommon_user_agent'].sum()}")
# print(f"High response time: {df['high_response_time'].sum()}")
# print(f"Error status code: {df['error_status_code'].sum()}")

In [15]:
df['suspicious'] = (
    (df['request_count'] > df['request_count'].quantile(0.99)) &
    (df['api_frequency'] < df['api_frequency'].quantile(0.01)) |
    
    (df['uncommon_referrer']) &
    (df['uncommon_user_agent']) |
    (df['abnormal_byte']) & (df['high_response_time']) |
    (df['error_status_code']) & (df['high_response_time']) 
).astype(int)
df['suspicious'].value_counts()

suspicious
0    404872
1      2455
Name: count, dtype: int64

## Data Training and Model creation

### Random Forest Model creation

In [23]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE

In [17]:
# Select features and target
X = df.drop(['suspicious', 'Date_Time'], axis=1)
y = df['suspicious']

In [18]:
numerical_features = X.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X.select_dtypes(include=['object']).columns

In [19]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ])

In [20]:
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('classifier', RandomForestClassifier(random_state=42, n_estimators=100, max_depth=10, n_jobs=-1))])

In [21]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model
pipeline.fit(X_train, y_train)

# Predict and evaluate the model
y_pred = pipeline.predict(X_test)
classification_report(y_test, y_pred)

              precision    recall  f1-score   support

           0       0.99      1.00      1.00     80938
           1       0.00      0.00      0.00       528

    accuracy                           0.99     81466
   macro avg       0.50      0.50      0.50     81466
weighted avg       0.99      0.99      0.99     81466



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


### Balancing Unbalanced Code

In [25]:
# Select features and target
X = df.drop(['suspicious', 'Date_Time'], axis=1)
y = df['suspicious']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Preprocess categorical features with OneHotEncoder
categorical_features = X.select_dtypes(include=['object']).columns
numerical_features = X.select_dtypes(include=['int64', 'float64']).columns

# Preprocess the data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ])

# Fit and transform the training data
X_train_encoded = preprocessor.fit_transform(X_train)
X_test_encoded = preprocessor.transform(X_test)

# Apply SMOTE to oversample the minority class
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_encoded, y_train)

# Create a RandomForest classifier pipeline
pipeline = Pipeline(steps=[
    ('classifier', RandomForestClassifier(random_state=42, n_estimators=100, max_depth=10, class_weight='balanced', n_jobs=-1))
])

# Train the model with the resampled data
pipeline.fit(X_train_resampled, y_train_resampled)

# Predict and evaluate the model
y_pred = pipeline.predict(X_test_encoded)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     80938
           1       0.74      0.99      0.85       528

    accuracy                           1.00     81466
   macro avg       0.87      1.00      0.92     81466
weighted avg       1.00      1.00      1.00     81466



### XGBOOST ALGORITHM


In [27]:
import xgboost as xgb

In [30]:
X_train_encoded = preprocessor.fit_transform(X_train)
X_test_encoded = preprocessor.transform(X_test)

# Apply SMOTE to oversample the minority class
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_encoded, y_train)

# Create an XGBoost classifier pipeline
pipeline = Pipeline(steps=[
    ('classifier', xgb.XGBClassifier(random_state=42, scale_pos_weight=(y_train_resampled.value_counts()[0] / y_train_resampled.value_counts()[1])))
])

# Train the model with the resampled data
pipeline.fit(X_train_resampled, y_train_resampled)

# Predict and evaluate the model
y_pred = pipeline.predict(X_test_encoded)
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     80938
           1       0.99      0.98      0.99       528

    accuracy                           1.00     81466
   macro avg       1.00      0.99      0.99     81466
weighted avg       1.00      1.00      1.00     81466

