<a href="https://colab.research.google.com/github/Jamieren/ML-with-sentinel/blob/main/tryingwithSentinel.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Ten hand-written sample sign-in records (event IDs 4624 / 4625), kept as a
# list of CSV lines: the column header first, then one line per event.
sample_lines = [
    "TimeGenerated,Computer,EventID,EventType,LogonType,AccountName,IpAddress,Status",
    "2024-05-20T12:34:56.789Z,DESKTOP01,4624,Security,2,JohnDoe,192.168.1.100,Success",
    "2024-05-20T13:45:12.345Z,LAPTOP02,4624,Security,2,AliceSmith,10.0.0.5,Success",
    "2024-05-20T14:56:23.678Z,SERVER01,4624,Security,3,BobJohnson,172.16.0.10,Success",
    "2024-05-20T15:07:34.912Z,DESKTOP03,4625,Security,3,EveAnderson,192.168.0.20,Failure",
    "2024-05-20T16:18:45.123Z,DESKTOP04,4624,Security,2,CharlieBrown,192.168.1.50,Success",
    "2024-05-20T17:29:56.234Z,LAPTOP05,4624,Security,2,LucyWilliams,10.0.0.15,Success",
    "2024-05-20T18:40:12.345Z,DESKTOP06,4625,Security,3,DavidLee,192.168.1.30,Failure",
    "2024-05-20T19:51:23.456Z,SERVER02,4624,Security,2,SarahMiller,172.16.0.20,Success",
    "2024-05-20T20:02:34.567Z,DESKTOP07,4624,Security,2,MikeTaylor,192.168.1.70,Success",
    "2024-05-20T21:13:45.678Z,LAPTOP08,4624,Security,2,EmilyBrown,10.0.0.25,Success",
]

# Newline-joined text with no trailing newline after the last record.
csv_data = "\n".join(sample_lines)

# Persist the sample so it can be loaded from disk later.
with open('user_sign_in_activities.csv', 'w') as f:
    f.write(csv_data)


In [10]:
import pandas as pd
import numpy as np
import random
from datetime import datetime, timedelta

# Build a synthetic sign-in log and save it as CSV.
#
# Every draw goes through a dedicated, seeded generator so that
# Restart & Run All reproduces the same dataset (and therefore the same
# plots and model metrics further down). Using a private `random.Random`
# instance also leaves the module-level `random` state untouched.
N_EVENTS = 100  # number of synthetic sign-in records to generate
SEED = 42       # fixed seed -> identical dataset on every run
rng = random.Random(SEED)

# Generate random timestamps within a range (both endpoints inclusive)
start_date = datetime(2024, 5, 1)
end_date = datetime(2024, 5, 31)
range_seconds = int((end_date - start_date).total_seconds())  # computed once, not per draw
timestamps = [start_date + timedelta(seconds=rng.randint(0, range_seconds)) for _ in range(N_EVENTS)]

# Generate random computer names
computers = ['DESKTOP01', 'LAPTOP02', 'SERVER01', 'DESKTOP03', 'DESKTOP04', 'LAPTOP05', 'DESKTOP06', 'SERVER02', 'DESKTOP07', 'LAPTOP08']
computer_names = [rng.choice(computers) for _ in range(N_EVENTS)]

# Generate random account names.
# The pool of candidates gets its own name so it is not overwritten by the sample.
account_pool = ['JohnDoe', 'AliceSmith', 'BobJohnson', 'EveAnderson', 'CharlieBrown', 'LucyWilliams', 'DavidLee', 'SarahMiller', 'MikeTaylor', 'EmilyBrown']
account_names = [rng.choice(account_pool) for _ in range(N_EVENTS)]

# Generate random event types
event_type_pool = ['Security', 'Login', 'Logout', 'Error']
event_types = [rng.choice(event_type_pool) for _ in range(N_EVENTS)]

# Generate random success/failure statuses (drawn independently of every other column)
status_pool = ['Success', 'Failure']
statuses = [rng.choice(status_pool) for _ in range(N_EVENTS)]

# Create DataFrame: one row per synthetic event
data = pd.DataFrame({
    'TimeGenerated': timestamps,
    'Computer': computer_names,
    'EventType': event_types,
    'AccountName': account_names,
    'Status': statuses
})

# Save DataFrame to CSV (no index column)
data.to_csv('user_sign_in_activities_100.csv', index=False)


In [13]:
# Load the generated sign-in log back from disk; after the CSV round trip
# 'TimeGenerated' is no longer a datetime column and must be re-parsed.
data = pd.read_csv(filepath_or_buffer='user_sign_in_activities_100.csv')


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Use seaborn's white-grid theme for every figure below
sns.set_style("whitegrid")

# --- Time series: number of events per calendar day ---
# The column is parsed to datetimes in place so it can serve as a resampling index.
data['TimeGenerated'] = pd.to_datetime(data['TimeGenerated'])
daily_counts = data.set_index('TimeGenerated').resample('D').size()

fig, ax = plt.subplots(figsize=(10, 6))
daily_counts.plot(ax=ax, label='Daily Sign-in Activities', marker='o')
ax.set_title('Number of Sign-in Activities Over Time')
ax.set_xlabel('Date')
ax.set_ylabel('Number of Sign-in Activities')
ax.legend()
plt.show()

# --- Bar plot: events per computer, busiest machine first ---
computers_by_volume = data['Computer'].value_counts().index

fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(data=data, x='Computer', order=computers_by_volume, ax=ax)
ax.set_title('Sign-in Activities by Computer')
ax.set_xlabel('Computer')
ax.set_ylabel('Number of Sign-in Activities')
ax.tick_params(axis='x', labelrotation=45)
plt.show()

# --- Pie chart: share of each event type ---
event_type_counts = data['EventType'].value_counts()

fig, ax = plt.subplots(figsize=(8, 8))
event_type_counts.plot.pie(ax=ax, autopct='%1.1f%%')
ax.set_title('Distribution of Sign-in Activities by Event Type')
ax.set_ylabel('')  # drop the automatic y-label that pandas adds to pie charts
plt.show()

# --- Count plot: successes vs. failures ---
fig, ax = plt.subplots(figsize=(8, 6))
sns.countplot(data=data, x='Status', ax=ax)
ax.set_title('Count of Sign-in Activities by Status')
ax.set_xlabel('Status')
ax.set_ylabel('Count')
plt.show()


In [None]:
# Explore which features are associated with the sign-in status

import matplotlib.pyplot as plt
import seaborn as sns

# Use seaborn's white-grid theme for every figure below
sns.set_style("whitegrid")

# One grouped bar chart per categorical feature, split by Status.
# Each entry: (column, figure size, title, x-axis label, tick rotation, bar order).
# A bar order of None keeps seaborn's default ordering; accounts are sorted by
# how often they appear.
status_breakdowns = [
    ('Computer', (10, 6), 'Sign-in Activities by Computer and Status', 'Computer', 45, None),
    ('EventType', (8, 6), 'Sign-in Activities by Event Type and Status', 'Event Type', 45, None),
    ('AccountName', (12, 6), 'Sign-in Activities by Account Name and Status', 'Account Name', 90,
     data['AccountName'].value_counts().index),
]

for column, fig_size, chart_title, x_label, tick_rotation, bar_order in status_breakdowns:
    plt.figure(figsize=fig_size)
    sns.countplot(data=data, x=column, hue='Status', order=bar_order)
    plt.title(chart_title)
    plt.xlabel(x_label)
    plt.ylabel('Count')
    plt.xticks(rotation=tick_rotation)
    plt.legend(title='Status')
    plt.show()


In [None]:
# Trying a random forest classifier

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Train and evaluate a random forest that predicts sign-in Status
# (1 = Success, 0 = Failure) from the timestamp and the one-hot encoded
# Computer / EventType / AccountName columns.
#
# All feature preparation happens on the full frame BEFORE the train/test
# split, so this cell runs on a fresh kernel and does not rely on X_train /
# X_test left over from an earlier execution.

# Read the CSV file into a DataFrame
data = pd.read_csv('user_sign_in_activities_100.csv')

# Parse the timestamp strings written by to_csv back into datetimes
data['TimeGenerated'] = pd.to_datetime(data['TimeGenerated'])

# Encode categorical variables (drop_first removes one redundant dummy per feature)
data_encoded = pd.get_dummies(data, columns=['Computer', 'EventType', 'AccountName'], drop_first=True)

# Splitting the dataset into features and target variable.
# The timestamp becomes seconds since 1970-01-01 so the scaler and the model
# receive a plain numeric column; subtracting the epoch works regardless of
# the datetime resolution pandas chose when parsing.
epoch = pd.Timestamp('1970-01-01')
X = data_encoded.drop(columns=['Status']).assign(
    TimeGenerated=lambda frame: (frame['TimeGenerated'] - epoch).dt.total_seconds()
)
y = data_encoded['Status'].eq('Success').astype(int)

# Splitting the dataset into training and testing sets (80 / 20, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Normalizing the features: fit on the training split only, then apply the
# same transformation to the test split to avoid leaking test statistics.
scaler = StandardScaler()
X_train_normalized = scaler.fit_transform(X_train)
X_test_normalized = scaler.transform(X_test)

# Model Training
random_forest = RandomForestClassifier(random_state=42)
random_forest.fit(X_train_normalized, y_train)

# Model Testing
y_pred_rf = random_forest.predict(X_test_normalized)

# Evaluate accuracy
accuracy_rf = accuracy_score(y_test, y_pred_rf)
report_rf = classification_report(y_test, y_pred_rf)

print(f"Random Forest Accuracy: {accuracy_rf:.4f}")
print("Classification Report:")
print(report_rf)

In [None]:

# Display Predictions
# Show the held-out feature rows next to the true and predicted status.
# Both status columns are decoded to text labels so they can be compared
# directly (the target was encoded as 1 = Success, 0 = Failure).
STATUS_LABELS = {1: 'Success', 0: 'Failure'}

predictions_df = X_test.copy()
predictions_df['TrueStatus'] = y_test.map(STATUS_LABELS)
# y_pred_rf is positional output from predict(); re-attach X_test's index before mapping
predictions_df['PredictedStatus'] = pd.Series(y_pred_rf, index=predictions_df.index).map(STATUS_LABELS)

# Last expression of the cell -> rich table display of the first 10 predictions
predictions_df.head(10)


In [None]:
def label_of(flag):
    """Decode the binary target: 1 becomes 'Success', anything else 'Failure'."""
    return 'Success' if flag == 1 else 'Failure'


# Save Predictions to CSV
# Rebuild the prediction table from the test split, then decode both status
# columns to text before writing.
predictions_df = X_test.copy()
predictions_df['TrueStatus'] = y_test
predictions_df['PredictedStatus'] = y_pred_rf
for status_column in ('PredictedStatus', 'TrueStatus'):
    predictions_df[status_column] = predictions_df[status_column].apply(label_of)

predictions_df.to_csv('predictions.csv', index=False)

# Generate Plots
# True vs. predicted: one group of bars per true status, coloured by prediction
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(data=predictions_df, x='TrueStatus', hue='PredictedStatus', ax=ax)
ax.set_title('True vs. Predicted Statuses')
ax.set_xlabel('True Status')
ax.set_ylabel('Count')
ax.legend(title='Predicted Status')
plt.show()

# How often each status was predicted
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(data=predictions_df, x='PredictedStatus', ax=ax)
ax.set_title('Distribution of Predicted Statuses')
ax.set_xlabel('Predicted Status')
ax.set_ylabel('Count')
plt.show()

# How often each status actually occurs in the test split
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(data=predictions_df, x='TrueStatus', ax=ax)
ax.set_title('Distribution of True Statuses')
ax.set_xlabel('True Status')
ax.set_ylabel('Count')
plt.show()