In [None]:
import pandas as pd

# --- 1. Load all datasets ---
# Single place to edit if the Drive folder moves; every input file below lives in it.
DATA_DIR = '/content/drive/MyDrive/LLM4Sec/Week3'

logs = pd.read_csv(f'{DATA_DIR}/apache_logs_labelled.csv')
ip_entropy = pd.read_csv(f'{DATA_DIR}/ip_entropy_features.csv')
geolocation = pd.read_csv(f'{DATA_DIR}/geolocation_enriched.csv')
session_stats = pd.read_csv(f'{DATA_DIR}/session_stats.csv')
# Time-based and sequence features are loaded here but not merged into `logs`
# in this cell; `time_based` is plotted on its own in a later cell.
time_based = pd.read_csv(f'{DATA_DIR}/time_based_features.csv')
sequence_patterns = pd.read_csv(f'{DATA_DIR}/sequence_patterns_features.csv')

# --- 2. Merge IP-based features ---
# Both lookup tables are reduced to one row per IP before joining, so the left
# merges cannot multiply log rows. `validate='many_to_one'` makes pandas raise a
# MergeError if the right-hand side ever contains a repeated IP.

# Add IP entropy metrics (first row seen per IP wins, same rule as geolocation below)
ip_entropy_unique = ip_entropy.drop_duplicates('ip')
logs = logs.merge(ip_entropy_unique, on='ip', how='left', validate='many_to_one')

# Add geolocation features (pick only unique IP info, drop duplicates for safe join)
# NOTE(review): 'country_x' looks like a leftover merge suffix in the geolocation
# file — confirm this is the intended column name.
geo_cols = ['ip', 'country_x', 'region', 'city', 'asn', 'reverse_dns']
geo_info = geolocation[geo_cols].drop_duplicates('ip')
logs = logs.merge(geo_info, on='ip', how='left', validate='many_to_one')

# --- 3. Merge Session-level features ---
# Keep one session row per IP: the last row of each IP after ordering by end_time
# (swap for an aggregate if a per-row session_id becomes available).
# NOTE(review): end_time is read from CSV without date parsing, so this ordering is
# only chronological if the stored values happen to sort that way — confirm.
_session_key_cols = ('ip', 'start_time', 'end_time')
session_cols = [name for name in session_stats.columns if name not in _session_key_cols]
session_latest = (
    session_stats
    .sort_values('end_time')
    .groupby('ip')
    .tail(1)
)
logs = logs.merge(session_latest.loc[:, ['ip', *session_cols]], on='ip', how='left')

# --- 4. (Optional) Merge time-based/sequence features ---
# Not done here: joining the time-based features requires aligning timestamps first
# (e.g. rounding both sides to the same interval) so that the join keys match.
# logs = logs.merge(time_based, ...)

# --- 5. Save unified DataFrame for analysis and modeling ---
# Written to the same absolute path the correlation cell reads back, so the
# hand-off does not depend on the kernel's current working directory.
logs.to_csv('/content/logs_features_unified.csv', index=False)

print("Unified feature set shape:", logs.shape)
# Last expression of the cell: rendered with the notebook's rich table display.
logs.head()


In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Reload the merged feature table written by the merge cell
logs = pd.read_csv('/content/logs_features_unified.csv')

# Correlations are computed over the numeric columns only
numeric_frame = logs.select_dtypes(include=['number'])
numeric_cols = numeric_frame.columns
corr_matrix = numeric_frame.corr()

# Draw the annotated heatmap on an explicit figure/axes pair
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(
    corr_matrix,
    annot=True,
    cmap='coolwarm',
    fmt='.2f',
    linewidths=0.5,
    ax=ax,
)
ax.set_title('Correlation Heatmap of Numeric Features (Merged Logs)')
fig.tight_layout()
plt.show()

In [None]:
# Index the time-based features by timestamp so the plot gets a time axis.
# Guarded so the cell can be re-run: once 'datetime' has become the index it is
# no longer a column, and converting it again would raise KeyError.
if 'datetime' in time_based.columns:
    time_based = time_based.assign(
        datetime=pd.to_datetime(time_based['datetime'])
    ).set_index('datetime')

# Line plot of the request_count_mean_1h column over the datetime index
ax = time_based['request_count_mean_1h'].plot(
    figsize=(15, 5),
    title='Request Rate per Hour - Detecting Spikes',
)
ax.set_ylabel('Request Count Mean (1h)')
plt.show()


In [None]:
# Show the column index of `logs` as the cell output.
logs.columns