# Individuals Cyber Attacks Analysis

In [None]:
# Cell 1: Import libraries and load the dataset
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Read the transformed individuals cyber-attack CSV into a DataFrame.
# The path is relative, so it is resolved against the current working directory.
DATA_PATH = '../Data Warehousing ETL/Transformed_Data/individuals_cyber_attacks_europe.csv'
data = pd.read_csv(DATA_PATH)

In [None]:
# Show the column labels of the loaded DataFrame (displayed as the cell output).
data.columns

## General Analysis

In [None]:
# Count the attack records that fall in each year.
year_groups = data.groupby('Year')
attacks_per_year = year_groups.size()

# Report the yearly totals under a heading.
print("\nTotal Attacks per Year:", attacks_per_year, sep="\n")


In [None]:
# Tally attack records for every country.
attacks_per_country = data.groupby('Country').size()

# Rank the tallies from highest to lowest and keep the first ten entries.
ranked_countries = attacks_per_country.sort_values(ascending=False)
top_countries_attacks = ranked_countries.iloc[:10]

# Report the ten most attacked countries.
print("Top Countries with the Most Attacks:", top_countries_attacks, sep="\n")


In [None]:
# Mean of 'Packet_Length' within each year.
packet_length_by_year = data.groupby('Year')['Packet_Length']
avg_packet_length_per_year = packet_length_by_year.mean()

# Report the yearly averages under a heading.
print("\nAverage Packet Length per Year:", avg_packet_length_per_year, sep="\n")


In [None]:
# Mean of 'Packet_Length' within each protocol.
packet_length_by_protocol = data.groupby('Protocol')['Packet_Length']
avg_packet_length_per_protocol = packet_length_by_protocol.mean()

# Report the per-protocol averages under a heading.
print("\nAverage Packet Length per Protocol:", avg_packet_length_per_protocol, sep="\n")


In [None]:
# Rank attack types by how often they occur and keep the ten most frequent.
attack_type_counts = data.groupby('Attack_Type').size()
common_attack_types = attack_type_counts.sort_values(ascending=False).iloc[:10]
print("Most Common Attack Types:", common_attack_types, sep="\n")


In [None]:
# Number of attack records at each severity level.
severity_groups = data.groupby('Severity_Level')
severity_distribution = severity_groups.size()
print("\nSeverity Distribution:", severity_distribution, sep="\n")


In [None]:
# Count attack records per device type, most frequently targeted first.
device_type_counts = data.groupby('Device_Type').size()
common_device_types = device_type_counts.sort_values(ascending=False)
print("\nMost Common Device Types:", common_device_types, sep="\n")


In [None]:
# Count attack records for each value of 'Hour_of_Day' to expose active periods.
hour_groups = data.groupby('Hour_of_Day')
attacks_per_hour = hour_groups.size()
print("Attacks per Hour of the Day:", attacks_per_hour, sep="\n")


In [None]:
# Count attack records for each value of 'Month' to look for seasonal trends.
month_groups = data.groupby('Month')
attacks_per_month = month_groups.size()
print("\nAttacks per Month:", attacks_per_month, sep="\n")


In [None]:
# Cross-tabulate attack types (rows) against severity levels (columns).
# Combinations that never occur come out of unstack() as NaN and are set to 0.
attack_severity_counts = data.groupby(['Attack_Type', 'Severity_Level']).size()
severity_vs_attack_type = attack_severity_counts.unstack().fillna(0)
print("\nSeverity Levels per Attack Type:", severity_vs_attack_type, sep="\n")


In [None]:
# Frequency of each distinct value in 'Malware_Indicators' (most frequent first).
malware_indicators_count = data['Malware_Indicators'].value_counts()

# Only the first ten entries are printed, for brevity.
print("Malware Indicators Count:", malware_indicators_count.iloc[:10], sep="\n")


In [None]:
# Frequency of each distinct value in 'Firewall_Logs' (most frequent first).
firewall_log_column = data['Firewall_Logs']
firewall_logs_count = firewall_log_column.value_counts()

# Report the full set of counts under a heading.
print("\nFirewall Logs Count:", firewall_logs_count, sep="\n")


In [None]:
# Frequency of each distinct value in 'IDS_IPS_Alerts' (most frequent first).
ids_ips_column = data['IDS_IPS_Alerts']
ids_ips_alerts_count = ids_ips_column.value_counts()

# Report the full set of counts under a heading.
print("\nIDS/IPS Alerts Count:", ids_ips_alerts_count, sep="\n")


In [None]:
# Optional: turn the earlier summaries into DataFrames.
# The two count Series get their values in a 'Count' column; the severity table
# just has its index moved into a regular column.
attacks_per_hour_data = attacks_per_hour.reset_index(name='Count')
attacks_per_month_data = attacks_per_month.reset_index(name='Count')
severity_vs_attack_type_data = severity_vs_attack_type.reset_index()

# Print each DataFrame after its heading.
for heading, frame in (
    ("\nAttacks per Hour DataFrame:\n", attacks_per_hour_data),
    ("\nAttacks per Month DataFrame:\n", attacks_per_month_data),
    ("\nSeverity vs Attack Type DataFrame:\n", severity_vs_attack_type_data),
):
    print(heading, frame)


## Distribution Plots

In [None]:
# Attack counts per country, most frequent first.
geo_attack_data = data['Country'].value_counts()

# Bar chart of the ten most targeted countries.
plt.figure(figsize=(10, 6))
ax = geo_attack_data.iloc[:10].plot.bar(color='lightblue')
ax.set_title('Top 10 Countries by Cyber Attack Frequency')
ax.set_xlabel('Country')
ax.set_ylabel('Number of Attacks')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()


In [None]:
# Five most frequent attack types.
attack_type_counts = data['Attack_Type'].value_counts()
attack_type_distribution = attack_type_counts.nlargest(5)

# Bar chart on a fixed-size figure.
plt.figure(figsize=(8, 5), dpi=100)
ax = attack_type_distribution.plot.bar(color='red')
ax.set_title('Top 5 Attack Types')
ax.set_xlabel('Attack Type')
ax.set_ylabel('Number of Attacks')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()


In [None]:
# Five most frequent values of 'Browser'.
browser_counts = data['Browser'].value_counts()
browser_distribution = browser_counts.nlargest(5)

# Bar chart on a fixed-size figure.
plt.figure(figsize=(8, 5), dpi=100)
ax = browser_distribution.plot.bar(color='orange')
ax.set_title('Top 5 Browsers')
ax.set_xlabel('Browser')
ax.set_ylabel('Number of Attacks')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()


In [None]:
# Five most frequent values of 'Operating_System'.
os_counts = data['Operating_System'].value_counts()
os_distribution = os_counts.nlargest(5)

# Bar chart on a fixed-size figure.
plt.figure(figsize=(8, 5), dpi=100)
ax = os_distribution.plot.bar(color='green')
ax.set_title('Top 5 Operating Systems')
ax.set_xlabel('Operating System')
ax.set_ylabel('Number of Attacks')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()


In [None]:
# Mean 'Anomaly_Scores' for each attack type (a per-group average, not a
# correlation coefficient).
anomaly_by_attack_type = data.groupby('Attack_Type')['Anomaly_Scores']
attack_type_anomaly_corr = anomaly_by_attack_type.mean()

# Bar chart of the per-type averages.
plt.figure(figsize=(10, 6))
ax = attack_type_anomaly_corr.plot.bar()
ax.set_title('Average Anomaly Scores by Attack Type')
ax.set_xlabel('Attack Type')
ax.set_ylabel('Average Anomaly Scores')
plt.xticks(rotation=45)
plt.tight_layout()

# Render the figure.
plt.show()


In [None]:
# Ten least frequent packet types (the ascending sort puts the rarest first).
# NOTE(review): the variable name mentions attack types, but the data is grouped
# by 'Packet_Type'; the name is kept so any other cell using it is unaffected.
# If attack types were intended, group by 'Attack_Type' instead - confirm.
less_common_attack_types = data.groupby('Packet_Type').size().sort_values(ascending=True).head(10)
plt.figure(figsize=(10, 6))
less_common_attack_types.plot(kind='bar', color='purple')
# The title names what is plotted: packet types, matching the x-axis label.
plt.title('Least Common Packet Types')
plt.xlabel('Packet Type')
plt.ylabel('Number of Attacks')

# Show the plot
plt.show()


In [None]:
# Five most frequent values of 'Firewall_Logs' and of 'IDS_IPS_Alerts'.
# Only the firewall counts are plotted in this cell.
firewall_logs, ids_ips_alerts = (
    data[column].value_counts().nlargest(5)
    for column in ('Firewall_Logs', 'IDS_IPS_Alerts')
)

# Bar chart of the firewall log counts.
plt.figure(figsize=(8, 5))
ax = firewall_logs.plot.bar(color='red')
ax.set_title('Top 5 Firewall Logs')
ax.set_xlabel('Log Type')
ax.set_ylabel('Count')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()


In [None]:
# Bar chart of the IDS/IPS alert counts held in `ids_ips_alerts`.
plt.figure(figsize=(8, 5))
ax = ids_ips_alerts.plot.bar(color='orange')
ax.set_title('Top 5 IDS/IPS Alerts')
ax.set_xlabel('Alert Type')
ax.set_ylabel('Count')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()


In [None]:
# Five most frequent values of 'Traffic_Type' and of 'Payload_Data'.
# Only the traffic-type counts are plotted; the payload counts are computed but
# not used in this cell.
traffic_type_distribution, payload_data_distribution = (
    data[column].value_counts().nlargest(5)
    for column in ('Traffic_Type', 'Payload_Data')
)

# Bar chart of the traffic-type counts.
plt.figure(figsize=(8, 5))
ax = traffic_type_distribution.plot.bar(color='purple')
ax.set_title('Top 5 Traffic Types')
ax.set_xlabel('Traffic Type')
ax.set_ylabel('Count')
plt.tight_layout()
plt.show()



In [None]:
# Five most frequent values of 'Network_Segment' and of 'Proxy_Information'.
# Only the network-segment counts are plotted; the proxy counts are computed but
# not used in this cell.
network_segment_distribution, proxy_info_distribution = (
    data[column].value_counts().nlargest(5)
    for column in ('Network_Segment', 'Proxy_Information')
)

# Bar chart of the network-segment counts.
plt.figure(figsize=(8, 5))
ax = network_segment_distribution.plot.bar(color='blue')
ax.set_title('Top 5 Network Segments')
ax.set_xlabel('Network Segment')
ax.set_ylabel('Count')
plt.tight_layout()
plt.show()


In [None]:
# Box plot of 'Packet_Length', one box per severity level.
plt.figure(figsize=(10, 6))
ax = sns.boxplot(data=data, x='Severity_Level', y='Packet_Length')
ax.set_title('Distribution of Packet Length by Severity Level')
plt.tight_layout()

# Render the figure.
plt.show()


## Stacked Plots

In [None]:
# Stacked bars: one bar per attack type, segmented by severity level.
# DataFrame.plot() opens its own figure unless it is handed an Axes, so the Axes
# is created explicitly; this applies the figure size and avoids leaving a stray
# empty figure behind.
fig, ax = plt.subplots(figsize=(10, 6))
severity_vs_attack_type.plot(
    kind='bar',
    stacked=True,
    color=['#ADD8E6', '#87CEEB', '#4682B4', '#1E90FF', '#4169E1', '#000080'],
    ax=ax,
)
ax.set_title('Severity Levels per Attack Type')
ax.set_xlabel('Attack Type')
ax.set_ylabel('Number of Attacks')
plt.xticks(rotation=45)

# Show the plot
plt.show()


In [None]:
# Cross-tabulate protocols (rows) against severity levels (columns); combinations
# that never occur are filled with 0.
protocol_vs_severity = data.groupby(['Protocol', 'Severity_Level']).size().unstack().fillna(0)

# Stacked bars: one bar per protocol, segmented by severity level.
# The Axes is created explicitly and passed to DataFrame.plot(), which would
# otherwise open its own figure, ignore this size and leave an empty figure.
# NOTE(review): the three colours are assigned in column order (the sorted
# severity labels) - confirm the colour-to-level mapping is the intended one.
fig, ax = plt.subplots(figsize=(10, 6))
protocol_vs_severity.plot(kind='bar', stacked=True, color=['red', 'yellow', 'green'], ax=ax)
ax.set_title('Protocol vs Severity Levels')
ax.set_xlabel('Protocol')
ax.set_ylabel('Number of Attacks')

# Show the plot
plt.show()


In [None]:
# Cross-tabulate device types (rows) against attack types (columns); combinations
# that never occur are filled with 0.
device_vs_attack_type = data.groupby(['Device_Type', 'Attack_Type']).size().unstack().fillna(0)

# Stacked bars: one bar per device type, segmented by attack type.
# The Axes is created explicitly and passed to DataFrame.plot(), which would
# otherwise open its own figure, ignore this size and leave an empty figure.
fig, ax = plt.subplots(figsize=(10, 6))
device_vs_attack_type.plot(kind='bar', stacked=True, ax=ax)
ax.set_title('Device Type vs Attack Types')
ax.set_xlabel('Device Type')
ax.set_ylabel('Number of Attacks')
plt.xticks(rotation=0)

# Show the plot
plt.show()


In [None]:
# Cross-tabulate protocols against severity levels, then keep only the five
# protocols with the largest row totals.
protocol_vs_severity = data.groupby(['Protocol', 'Severity_Level']).size().unstack().fillna(0)
top_protocols = protocol_vs_severity.sum(axis=1).nlargest(5).index
protocol_vs_severity = protocol_vs_severity.loc[top_protocols]

# The Axes is created explicitly and passed to DataFrame.plot(), which would
# otherwise open its own figure, ignore this size/DPI and leave an empty figure.
fig, ax = plt.subplots(figsize=(8, 5), dpi=100)
protocol_vs_severity.plot(kind='bar', stacked=True, ax=ax)
ax.set_title('Top 5 Protocols vs Severity Levels')
ax.set_xlabel('Protocol')
ax.set_ylabel('Number of Attacks')
plt.tight_layout()
plt.show()


In [None]:
# Count records for every (attack type, anomaly score) pair; each distinct
# 'Anomaly_Scores' value becomes a column, missing pairs are filled with 0.
# NOTE(review): if 'Anomaly_Scores' is continuous this table is very wide and the
# "top 5 scores" are just five individual values - consider binning; confirm.
anomalies_vs_attack_type = data.groupby(['Attack_Type', 'Anomaly_Scores']).size().unstack().fillna(0)

# Keep the five attack types with the largest row totals.
top_attack_types = anomalies_vs_attack_type.sum(axis=1).nlargest(5).index
anomalies_vs_attack_type = anomalies_vs_attack_type.loc[top_attack_types]

# Among those rows, keep the five anomaly-score columns with the largest totals.
top_anomaly_scores = anomalies_vs_attack_type.sum(axis=0).nlargest(5).index
anomalies_vs_attack_type = anomalies_vs_attack_type[top_anomaly_scores]

# The Axes is created explicitly and passed to DataFrame.plot(), which would
# otherwise open its own figure, ignore this size/DPI and leave an empty figure.
fig, ax = plt.subplots(figsize=(8, 5), dpi=100)
anomalies_vs_attack_type.plot(kind='bar', stacked=True, ax=ax)
ax.set_title('Top 5 Anomalies vs Attack Types')
ax.set_xlabel('Attack Type')
ax.set_ylabel('Anomalies Count')

# Adjust layout to fit
plt.tight_layout()
plt.show()


In [None]:
# Cross-tabulate 'User_Information' (rows) against attack types (columns), then
# keep only the five rows with the largest totals.
user_vs_attack_type = data.groupby(['User_Information', 'Attack_Type']).size().unstack().fillna(0)
top_users = user_vs_attack_type.sum(axis=1).nlargest(5).index
user_vs_attack_type = user_vs_attack_type.loc[top_users]

# The Axes is created explicitly and passed to DataFrame.plot(), which would
# otherwise open its own figure, ignore this size/DPI and leave an empty figure.
fig, ax = plt.subplots(figsize=(8, 5), dpi=100)
user_vs_attack_type.plot(kind='bar', stacked=True, ax=ax)
ax.set_title('Top 5 User Information vs Attack Types')
ax.set_xlabel('User Information')
ax.set_ylabel('Number of Attacks')
plt.tight_layout()
plt.show()


In [None]:
# Cross-tabulate network segments (rows) against attack types (columns), then
# keep only the five segments with the largest totals.
network_segment_vs_attack_type = data.groupby(['Network_Segment', 'Attack_Type']).size().unstack().fillna(0)
top_segments = network_segment_vs_attack_type.sum(axis=1).nlargest(5).index
network_segment_vs_attack_type = network_segment_vs_attack_type.loc[top_segments]

# The Axes is created explicitly and passed to DataFrame.plot(), which would
# otherwise open its own figure, ignore this size/DPI and leave an empty figure.
fig, ax = plt.subplots(figsize=(8, 5), dpi=100)
network_segment_vs_attack_type.plot(kind='bar', stacked=True, ax=ax)
ax.set_title('Top 5 Network Segments vs Attack Types')
ax.set_xlabel('Network Segment')
ax.set_ylabel('Number of Attacks')
plt.tight_layout()
plt.show()


In [None]:
# Cross-tabulate severity levels (rows) against actions taken (columns);
# combinations that never occur are filled with 0.
severity_action_counts = data.groupby(['Severity_Level', 'Action_Taken']).size()
severity_action_data = severity_action_counts.unstack().fillna(0)

# Stacked bars: one bar per severity level, segmented by action taken.
ax = severity_action_data.plot.bar(stacked=True, figsize=(10, 6))
ax.set_title('Distribution of Severity Levels by Actions Taken')
ax.set_xlabel('Severity Level')
ax.set_ylabel('Number of Attacks')
plt.tight_layout()
plt.show()


In [None]:
# Cross-tabulate countries (rows) against attack types (columns); combinations
# that never occur are filled with 0.
geo_attack_type_data = data.groupby(['Country', 'Attack_Type']).size().unstack().fillna(0)

# Pick the ten countries with the most attacks overall. The groupby result is
# ordered by country label, so head(10) would return the first ten labels in
# sort order rather than the ten most attacked countries.
top_geo_countries = geo_attack_type_data.sum(axis=1).nlargest(10).index

# Stacked bars: one bar per country, segmented by attack type.
# The Axes is created explicitly and passed to DataFrame.plot(), which would
# otherwise open its own figure, ignore this size and leave an empty figure.
fig, ax = plt.subplots(figsize=(12, 6))
geo_attack_type_data.loc[top_geo_countries].plot(kind='bar', stacked=True, ax=ax)
ax.set_title('Attack Types by Country (Top 10 Countries)')
ax.set_xlabel('Country')
ax.set_ylabel('Number of Attacks')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

# For geographical heatmaps, you would need to install and use folium or geopandas for mapping the attack locations.

## Time Series

In [None]:
# Parse 'Timestamp' as datetimes and derive calendar components from it.
# NOTE: earlier cells already group by 'Month' and 'Year', so those two columns
# exist before this cell and are overwritten here with timestamp-derived values.
data['Timestamp'] = pd.to_datetime(data['Timestamp'])
timestamp_parts = data['Timestamp'].dt
data['Hour'] = timestamp_parts.hour
data['Day'] = timestamp_parts.day
data['Month'] = timestamp_parts.month
data['Year'] = timestamp_parts.year

In [None]:
# Count attack records per hour, day of month, month and year.
# NOTE: attacks_per_hour, attacks_per_month and attacks_per_year are also assigned
# by earlier cells; this rebinds them to the counts computed here.
attacks_per_hour, attacks_per_day, attacks_per_month, attacks_per_year = (
    data.groupby(component).size()
    for component in ('Hour', 'Day', 'Month', 'Year')
)


In [None]:
# Make sure 'Timestamp' is datetime, parsing with the explicit format when needed.
TIMESTAMP_FORMAT = '%Y-%m-%d %H:%M:%S'
data['Timestamp'] = pd.to_datetime(data['Timestamp'], format=TIMESTAMP_FORMAT)

# Bucket every record into its calendar month and count records per bucket.
data['YearMonth'] = data['Timestamp'].dt.to_period('M')
year_month_groups = data.groupby('YearMonth')
time_series_data = year_month_groups.size()

# Line chart of the monthly attack counts.
plt.figure(figsize=(10, 5), dpi=100)
ax = time_series_data.plot()
ax.set_title('Cyber Attacks Over Time (Monthly Frequency)')
ax.set_xlabel('Month')
ax.set_ylabel('Number of Attacks')
plt.xticks(rotation=90)
plt.tight_layout()
plt.show()


In [None]:
# Line chart (with point markers) of the attack counts in `attacks_per_month`.
plt.figure(figsize=(10, 4))
ax = attacks_per_month.plot.line(marker='o', color='green')
ax.set_title('Attacks per Month')
ax.set_xlabel('Month')
ax.set_ylabel('Number of Attacks')

# Render the figure.
plt.show()


In [None]:
# Bucket every record into its weekly period and count records per week.
data['Week'] = data['Timestamp'].dt.to_period('W')
week_groups = data.groupby('Week')
weekly_attack_data = week_groups.size()

# Line chart of the weekly attack counts.
plt.figure(figsize=(10, 5))
ax = weekly_attack_data.plot()
ax.set_title('Cyber Attacks Over Time (Weekly Frequency)')
ax.set_xlabel('Week')
ax.set_ylabel('Number of Attacks')
plt.xticks(rotation=90)
plt.tight_layout()

# Render the figure.
plt.show()


In [None]:
# Count records per (calendar month, attack type); months become rows and attack
# types become columns, with missing combinations filled with 0.
attack_type_trend = data.groupby([data['Timestamp'].dt.to_period('M'), 'Attack_Type']).size().unstack().fillna(0)

# One line per attack type over the monthly periods.
# The Axes is created explicitly and passed to DataFrame.plot(), which would
# otherwise open its own figure, ignore this size and leave an empty figure.
fig, ax = plt.subplots(figsize=(12, 6))
attack_type_trend.plot(ax=ax)
ax.set_title('Attack Types Over Time (Monthly)')
ax.set_xlabel('Month')
ax.set_ylabel('Number of Attacks')
plt.xticks(rotation=90)
plt.tight_layout()

# Show the plot
plt.show()


In [None]:
# Bar chart of the attack counts in `attacks_per_year`.
plt.figure(figsize=(10, 4), dpi=100)
ax = attacks_per_year.plot.bar(color='purple')
ax.set_title('Attacks by Year')
ax.set_xlabel('Year')
ax.set_ylabel('Number of Attacks')
plt.tight_layout()

# Render the figure.
plt.show()


In [None]:
# Bar chart of the attack counts in `attacks_per_month`.
plt.figure(figsize=(10, 4), dpi=100)
ax = attacks_per_month.plot.bar(color='green')
ax.set_title('Attacks by Month')
ax.set_xlabel('Month')
ax.set_ylabel('Number of Attacks')
plt.tight_layout()

# Render the figure.
plt.show()


In [None]:
# Bar chart of the attack counts in `attacks_per_day` (grouped by day of month).
plt.figure(figsize=(10, 4), dpi=100)
ax = attacks_per_day.plot.bar(color='blue')
ax.set_title('Attacks by Day of Month')
ax.set_xlabel('Day')
ax.set_ylabel('Number of Attacks')
plt.tight_layout()

# Render the figure.
plt.show()


In [None]:

# Fixed figure size for readability.
plt.figure(figsize=(10, 4))

# Bar chart of the attack counts in `attacks_per_hour`.
ax = attacks_per_hour.plot.bar(color='skyblue')
ax.set_title('Attacks per Hour of the Day')
ax.set_xlabel('Hour of Day')
ax.set_ylabel('Number of Attacks')

# Render the figure.
plt.show()
