# In [None]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from keras.models import Sequential
from keras.layers import LSTM, Dense

# --------------------------
# STEP 1: Load and Preprocess Logs
# --------------------------
proxy_log_path = "C:/Users/Vasu/Downloads/cleaned_proxy_logs - Copy.csv"
df = pd.read_csv(proxy_log_path)

# The event time may be stored under either 'timestamp' or '@timestamp';
# normalise it into a datetime column named 'timestamp'.
if 'timestamp' in df.columns:
    timestamp_column = 'timestamp'
elif '@timestamp' in df.columns:
    timestamp_column = '@timestamp'
else:
    raise KeyError(
        "proxy log CSV has neither a 'timestamp' nor an '@timestamp' column"
    )
df['timestamp'] = pd.to_datetime(df[timestamp_column])

df = df.rename(columns={
    'src_ip': 'source_ip',
    'domain': 'destination_domain'
})

# Keep only the columns used downstream and drop incomplete rows. The explicit
# .copy() makes the subset an independent frame, so adding 'date' below does
# not raise pandas' SettingWithCopyWarning.
df = df[['timestamp', 'source_ip', 'destination_domain', 'bytes_sent']].dropna().copy()

# Hour bucket of each log line (timestamp floored to the hour).
df['date'] = df['timestamp'].dt.floor('h')

# --------------------------
# STEP 2: Aggregate Traffic per IP
# --------------------------
# One row per (source_ip, hour bucket) holding the summed bytes_sent for that
# pair, exposed as the 'total_bytes' column.
traffic_df = (
    df.groupby(['source_ip', 'date'])
    .agg(total_bytes=('bytes_sent', 'sum'))
    .reset_index()
)

# --------------------------
# STEP 3: Load Malicious Domains List
# --------------------------
malicious_domain_path = 'C:/Users/swaya/OneDrive/Desktop/PROXY_Jan_to_May 2025_Malicious_Domain.txt'

# One domain per line, normalised to lower case with surrounding whitespace
# removed. Blank lines are skipped so the empty string never ends up in the set.
# NOTE(review): assumes the list is UTF-8 (optionally with a BOM, which
# 'utf-8-sig' strips so the first entry is not corrupted) -- confirm against
# the tool that produces this file.
with open(malicious_domain_path, 'r', encoding='utf-8-sig') as f:
    malicious_domains = {
        domain
        for domain in (line.strip().lower() for line in f)
        if domain
    }

# --------------------------
# STEP 4: LSTM Anomaly Detection
# --------------------------
def create_dataset(dataset, look_back=24):
    """Slice a sequence into sliding windows and their next-step targets.

    Each sample is ``look_back`` consecutive entries of ``dataset``; its
    target is the entry immediately following that window.

    Returns a ``(windows, targets)`` pair of NumPy arrays. Both are empty
    when ``dataset`` has no more than ``look_back`` entries.
    """
    n_samples = len(dataset) - look_back
    windows = [dataset[start:start + look_back] for start in range(n_samples)]
    targets = [dataset[start + look_back] for start in range(n_samples)]
    return np.array(windows), np.array(targets)

def train_lstm_on_ip(ip_df, look_back=24, threshold_sigma=1.0):
    """Fit a small LSTM on one IP's hourly byte counts and flag outlier hours.

    Args:
        ip_df: Rows for a single IP with at least a ``date`` column (hour
            bucket) and a ``total_bytes`` column.
        look_back: Number of consecutive hours fed to the model to predict
            the following hour.
        threshold_sigma: An hour is flagged when its absolute prediction
            error exceeds ``mean + threshold_sigma * std`` of all errors.

    Returns:
        A 5-tuple ``(anomaly_count, threshold, residuals, hourly_series,
        anomaly_timestamps)``. ``hourly_series`` is the resampled hourly
        frame with ``date`` restored as a column. All five entries are
        ``None`` when the input is empty or spans no more than ``look_back``
        hourly rows, because no training window can be built.
    """
    no_result = (None, None, None, None, None)
    if ip_df.empty:
        return no_result

    # Resample only the numeric column. Summing the whole frame would also
    # "sum" the string source_ip column, which is wasted work and puts a
    # meaningless column into the returned series. Hours without traffic
    # become 0.
    ip_df = ip_df.set_index('date')[['total_bytes']].resample('h').sum().fillna(0)
    if len(ip_df) <= look_back:
        return no_result

    scaler = MinMaxScaler()
    scaled_data = scaler.fit_transform(ip_df[['total_bytes']])

    X, y = create_dataset(scaled_data, look_back)
    if len(X) == 0:
        return no_result

    # Shape the windows as (samples, timesteps, features) for the LSTM layer.
    X = X.reshape((X.shape[0], X.shape[1], 1))

    model = Sequential()
    model.add(LSTM(50, input_shape=(look_back, 1)))
    model.add(Dense(1))
    model.compile(loss='mean_squared_error', optimizer='adam')
    model.fit(X, y, epochs=15, batch_size=8, verbose=0)

    # NOTE(review): predictions are made on the same windows the model was
    # trained on, so the residuals are in-sample errors.
    # verbose=0 matches fit() and avoids one progress bar per IP.
    predicted = model.predict(X, verbose=0)
    predicted = scaler.inverse_transform(predicted)
    actual = scaler.inverse_transform(y.reshape(-1, 1))

    residuals = np.abs(actual - predicted)
    threshold = np.mean(residuals) + threshold_sigma * np.std(residuals)
    anomalies = residuals > threshold

    # residuals has one column, so the row indices of the True entries are
    # the sample indices. Sample i predicts hour i + look_back.
    anomaly_indices = np.where(anomalies)[0]
    anomaly_timestamps = ip_df.index[look_back:][anomaly_indices]

    return anomalies.sum(), threshold, residuals, ip_df.reset_index(), anomaly_timestamps

# --------------------------
# STEP 5: Detect Suspicious IPs Based on Data Spike + Domain Check
# --------------------------
# An IP is reported only when BOTH conditions hold: the LSTM flagged at least
# one anomalous hour for it, and during those hours it contacted a domain from
# the malicious-domain list.
suspicious_summary = []
all_ips = traffic_df['source_ip'].unique()
reason = "data spike + domain"

print("\n[+] Suspicious IPs (data spike + known malicious domain):\n")
for ip in all_ips:
    ip_df = traffic_df[traffic_df['source_ip'] == ip]
    anomaly_count, threshold, residuals, full_series, anomaly_times = train_lstm_on_ip(ip_df)

    # None: not enough history to train on. 0: nothing exceeded the threshold.
    if anomaly_count is None or anomaly_count == 0:
        continue

    # Domains accessed by IP during anomaly time
    relevant_logs = df[(df['source_ip'] == ip) & (df['date'].isin(anomaly_times))]
    contacted_domains = set(relevant_logs['destination_domain'].str.lower().unique())

    # Sorted so the printed output and the exported report are identical from
    # run to run; plain set iteration order varies between interpreter runs.
    matched_malicious = sorted(contacted_domains.intersection(malicious_domains))

    if matched_malicious:
        suspicious_summary.append({
            "IP": ip,
            "Anomaly_Timestamps": ', '.join(anomaly_times.astype(str)),
            "Matched_Domains": ', '.join(matched_malicious),
            "Reason": reason
        })

        print(f"IP: {ip}")
        print(f"  Anomaly Timestamps: {anomaly_times.tolist()}")
        print(f"  Matched Malicious Domains: {matched_malicious}")
        print(f"  Reason: {reason}\n")

# --------------------------
# STEP 6: Export Final Report
# --------------------------
# Single source of truth for the output path, so the confirmation message
# always names the file that was actually written.
report_path = "ips_report_filtered.csv"

# Explicit columns keep the header row in the CSV even when nothing was flagged.
summary_df = pd.DataFrame(
    suspicious_summary,
    columns=["IP", "Anomaly_Timestamps", "Matched_Domains", "Reason"],
)
summary_df.to_csv(report_path, index=False)
print(f"\n[+] Final report saved to '{report_path}'")