<a href="https://colab.research.google.com/github/Khalidhussainn/Final_Year_Project/blob/main/ML/SLA/Suspicious_Login_Detection(Final).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Suspicious Login Detection in Windows (Using Wazuh Archive Logs)
---
## Step-by-Step Guide for Beginners

### Step 0: Mount Google Drive (Colab Only)

In [None]:
# Colab-only setup: expose Google Drive inside the runtime at /content/drive.
# The dataset path used later in this notebook lives under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

### Step 1: Import Libraries

### Step 2: Load Data

In [None]:

# Location of the exported Wazuh archive logs on the mounted Google Drive.
file_path = "/content/drive/MyDrive/Colab Notebooks/SLA/archive_logs_for_SLA.csv"

# Read the whole CSV into memory and report its (rows, columns) size.
df = pd.read_csv(file_path)
print(f"Shape of dataset: {df.shape}")

# Preview the first rows (rendered as the cell output).
df.head()


### Step 3: Basic Data Exploration & Cleaning

In [None]:

# Structural overview: column names, dtypes / non-null counts, and nulls per column.
print(f"Columns: {df.columns.tolist()}")
df.info()
print(df.isna().sum())

# Count fully identical rows, then keep only the first occurrence of each.
duplicate_counts = df.duplicated().sum()
print(f"Total duplicate rows: {duplicate_counts}")

df = df.drop_duplicates()
print(f"Shape after removing duplicates: {df.shape}")


### Step 4: Feature Selection

In [None]:

# Columns used by the rest of the notebook: event time, Windows event ID,
# reporting agent, and the Windows logon type.
cols_to_keep = [
    "@timestamp", "data.win.system.eventID", "agent.name", "data.win.eventdata.logonType"
]

# Fail early with a readable message if the export does not have the expected schema.
missing_cols = [name for name in cols_to_keep if name not in df.columns]
if missing_cols:
    raise KeyError(f"Expected columns missing from the dataset: {missing_cols}")

# .copy() makes the subset an independent frame, so the columns added in later
# cells are written to this frame and do not trigger SettingWithCopyWarning.
df = df[cols_to_keep].copy()
print("Shape after column selection:", df.shape)
df.head()


### Step 5: Value Counts for Categorical Columns

In [None]:

# Frequency table for every selected column except the timestamp (first entry).
categorical_cols = cols_to_keep[1:]
for name in categorical_cols:
    counts = df[name].value_counts()
    print(f"\nUnique value counts for {name}:\n", counts)


### Step 6: Data Visualization

In [None]:


# Figure 1: how often each logon type occurs.
sns.countplot(x="data.win.eventdata.logonType", data=df)
plt.title("Frequency of Logon Types")
plt.show()

# Figure 2: the ten agents with the most events.
top_agents = df["agent.name"].value_counts().nlargest(10)
# seaborn >= 0.13 deprecates `palette` without `hue`; assigning the category
# variable to `hue` and hiding the redundant legend keeps the same colouring
# and matches the countplot calls used later in this notebook.
sns.barplot(
    x=top_agents.values,
    y=top_agents.index,
    hue=top_agents.index,
    palette="magma",
    legend=False,
)
plt.title("Top 10 Active Agents")
plt.xlabel("Logon Event Count")
plt.ylabel("Agent Name")
plt.show()


### Step 7: Timestamp Processing & Feature Engineering

In [None]:

# Parse '@timestamp' and derive calendar/time features from it.
timestamps = df['@timestamp']

# Only clean and parse while the column is still raw text. Without this guard,
# re-running the cell fails because `.str` is not available on a column that
# has already been converted to datetime.
if not pd.api.types.is_datetime64_any_dtype(timestamps):
    # Drop the ' @ ' separator between date and time so pandas can parse the value.
    timestamps = timestamps.str.replace(' @ ', ' ', regex=False)
    # Unparseable values become NaT instead of raising.
    timestamps = pd.to_datetime(timestamps, errors='coerce')

# Build a fresh, independent frame: replace the column, drop rows whose
# timestamp could not be parsed, and copy so the column assignments below
# do not write into a derived frame (SettingWithCopyWarning).
df = df.assign(**{'@timestamp': timestamps}).dropna(subset=['@timestamp']).copy()

# One integer column per datetime component, in the order used as model features.
stamp = df['@timestamp'].dt
for part in ('year', 'month', 'day', 'hour', 'minute', 'second'):
    df[part] = getattr(stamp, part)
df['day_of_week'] = stamp.dayofweek  # Monday=0 ... Sunday=6

df.head()


### Step 8: Label Encoding

In [None]:

# Categorical columns to convert into integer codes.
label_cols = ["agent.name", "data.win.eventdata.logonType", "data.win.system.eventID"]

# One encoder per column, kept by column name so they can be exported later.
label_encoders = {name: LabelEncoder() for name in label_cols}

# Fit each encoder on its column and overwrite the column with the integer codes.
for name, encoder in label_encoders.items():
    df[name] = encoder.fit_transform(df[name])

df.head()


### Step 9: Prepare Features

In [None]:

# Model inputs: the three encoded categoricals plus the derived time components.
feature_columns = [
    "data.win.system.eventID",
    "agent.name",
    "data.win.eventdata.logonType",
    "year",
    "month",
    "day",
    "hour",
    "minute",
    "second",
    "day_of_week",
]
X = df.loc[:, feature_columns]

# Standardise every feature; the fitted scaler is kept for export.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)


### Step 10: Train Isolation Forest

In [None]:

# Fit an Isolation Forest that flags 1.5% of the rows as outliers (seeded).
iso_forest = IsolationForest(
    n_estimators=100,
    contamination=0.015,
    random_state=42,
).fit(X_scaled)

# predict() yields 1 for inliers and -1 for outliers; recode to 0 = normal, 1 = anomaly.
raw_labels = iso_forest.predict(X_scaled)
df['Anomaly'] = pd.Series(raw_labels, index=df.index).map({1: 0, -1: 1})

print(df['Anomaly'].value_counts())


### Step 11: Anomaly Visualization

In [None]:
# Bar chart of normal (0, green) versus anomalous (1, red) rows.
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(
    data=df,
    x="Anomaly",
    hue="Anomaly",
    palette={0: "green", 1: "red"},
    legend=False,
    ax=ax,
)

# Replace the numeric tick labels with readable names.
ax.set_xticks([0, 1])
ax.set_xticklabels(["Normal", "Anomaly"])
ax.set_title("Normal vs Suspicious (Anomaly) Logins")
ax.set_xlabel("Login Type")
ax.set_ylabel("Count")
plt.show()


In [None]:
# Scatter plot: hour of day vs. encoded logon type, coloured by anomaly flag.
plt.figure(figsize=(10, 6))
ax = sns.scatterplot(
    x=df['hour'],
    y=df['data.win.eventdata.logonType'],
    hue=df['Anomaly'],
    palette={0: "green", 1: "red"},
    alpha=0.6
)
plt.title("Suspicious vs Normal Logins by Hour and Logon Type")
plt.xlabel("Hour of Day")
plt.ylabel("Logon Type (Encoded)")

# Rename the entries of the legend seaborn already built instead of calling
# plt.legend(labels=[...]): that form rebuilds the legend from the axes'
# artists in drawing order, so labels can end up on the wrong colour or an
# entry can disappear. Renaming keeps each label tied to its own handle.
legend = ax.get_legend()
if legend is not None:
    legend.set_title("Login Type")
    display_names = {"0": "Normal", "1": "Anomaly"}
    for entry in legend.get_texts():
        entry.set_text(display_names.get(entry.get_text(), entry.get_text()))
plt.show()


### Step 12: PCA Visualization

In [None]:
# PCA plot: project all scaled features onto two components for a 2-D view.
# random_state pins the result when scikit-learn selects a randomized solver.
pca = PCA(n_components=2, random_state=42)
X_pca = pca.fit_transform(X_scaled)

df['PCA1'] = X_pca[:, 0]
df['PCA2'] = X_pca[:, 1]

plt.figure(figsize=(10, 6))
ax = sns.scatterplot(x="PCA1", y="PCA2", hue="Anomaly", data=df, palette={0: "green", 1: "red"}, alpha=0.6)
plt.title("PCA: Normal vs Suspicious Logins")

# Rename the entries of the legend seaborn already built instead of calling
# plt.legend(labels=[...]): that form pairs labels with artists by drawing
# order, which can attach a label to the wrong colour or drop an entry.
legend = ax.get_legend()
if legend is not None:
    legend.set_title("Login Type")
    display_names = {"0": "Normal", "1": "Anomaly"}
    for entry in legend.get_texts():
        entry.set_text(display_names.get(entry.get_text(), entry.get_text()))
plt.show()

### Apply Local Outlier Factor (LOF)




In [None]:
from sklearn.neighbors import LocalOutlierFactor

# 'Anomaly' is reused for the LOF result below, which would otherwise discard
# the Isolation Forest verdict. Store it under its own column first; it is
# recomputed from the fitted model so re-running this cell stays correct.
df['Anomaly_IF'] = (iso_forest.predict(X_scaled) == -1).astype(int)

# Apply LOF with contamination level set to 1.5%
lof = LocalOutlierFactor(n_neighbors=20, contamination=0.015)
y_pred = lof.fit_predict(X_scaled)

# LOF returns -1 for outliers, 1 for inliers; recode to 1 = anomaly, 0 = normal.
df['Anomaly_LOF'] = (y_pred == -1).astype(int)

# The plotting cells that follow read 'Anomaly', so point it at the LOF result.
df['Anomaly'] = df['Anomaly_LOF']

# View anomaly distribution
print(df['Anomaly'].value_counts())

# Agreement between the two detectors (rows: Isolation Forest, columns: LOF).
print(pd.crosstab(df['Anomaly_IF'], df['Anomaly_LOF']))


In [None]:
# Bar chart of normal (0, green) versus anomalous (1, red) rows according to LOF.
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(
    data=df,
    x="Anomaly",
    hue="Anomaly",
    palette={0: "green", 1: "red"},
    legend=False,
    ax=ax,
)

# Replace the numeric tick labels with readable names.
ax.set_xticks([0, 1])
ax.set_xticklabels(["Normal", "Anomaly"])
# Tag the title with the detector so this figure can be told apart from the
# Isolation Forest chart, matching the "(LOF)" tag on the PCA plot that follows.
ax.set_title("Normal vs Suspicious (Anomaly) Logins (LOF)")
ax.set_xlabel("Login Type")
ax.set_ylabel("Count")
plt.show()



In [None]:
# Project the scaled features onto two principal components and colour the
# points by the current 'Anomaly' column.
# random_state pins the result when scikit-learn selects a randomized solver.
pca = PCA(n_components=2, random_state=42)
X_pca = pca.fit_transform(X_scaled)
df['PCA1'] = X_pca[:, 0]
df['PCA2'] = X_pca[:, 1]

plt.figure(figsize=(10, 6))
ax = sns.scatterplot(x="PCA1", y="PCA2", hue="Anomaly", data=df, palette={0: "green", 1: "red"}, alpha=0.6)
plt.title("PCA: Normal vs Suspicious Logins (LOF)")

# Rename the entries of the legend seaborn already built instead of calling
# plt.legend(labels=[...]): that form pairs labels with artists by drawing
# order, which can attach a label to the wrong colour or drop an entry.
legend = ax.get_legend()
if legend is not None:
    legend.set_title("Login Type")
    display_names = {"0": "Normal", "1": "Anomaly"}
    for entry in legend.get_texts():
        entry.set_text(display_names.get(entry.get_text(), entry.get_text()))
plt.show()


In [None]:
# Persist the fitted model and preprocessing objects to Google Drive (optional).
export_dir = '/content/drive/MyDrive/Colab Notebooks/SLA/Data_exports'
os.makedirs(export_dir, exist_ok=True)

# Human-readable names for the two Windows logon event IDs.
event_id_mapping = {
    "4624": "Successful Logon",
    "4625": "Failed Logon"
}

# File name -> object to dump, listed in the order the files are written.
artifacts = {
    'iso_forest_model.pkl': iso_forest,
    'scaler.pkl': scaler,
    **{f"label_encoder_{name}.pkl": encoder for name, encoder in label_encoders.items()},
    'feature_columns.pkl': feature_columns,
    'event_id_mapping.pkl': event_id_mapping,
}
for filename, artifact in artifacts.items():
    joblib.dump(artifact, f"{export_dir}/{filename}")

print(f"All models/artifacts exported to: {export_dir}")