In [25]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns



In [26]:
df = pd.read_csv('Data breacher.csv')



In [27]:
df.head(20)


Unnamed: 0.1,Unnamed: 0,Entity,Year,Records,Organization type,Method,Sources
0,0,21st Century Oncology,2016,2200000,healthcare,hacked,[5][6]
1,1,500px,2020,14870304,social networking,hacked,[7]
2,2,Accendo Insurance Co.,2020,175350,healthcare,poor security,[8][9]
3,3,Adobe Systems Incorporated,2013,152000000,tech,hacked,[10]
4,4,Adobe Inc.,2019,7500000,tech,poor security,[11][12]
5,5,Advocate Medical Group,2017,4000000,healthcare,lost / stolen media,[13][14]
6,6,AerServ (subsidiary of InMobi),2018,75000,advertising,hacked,[15]
7,7,"Affinity Health Plan, Inc.",2013,344579,healthcare,lost / stolen media,[16][17]
8,8,Airtel,2019,320000000,telecommunications,poor security,[18]
9,9,Air Canada,2018,20000,transport,hacked,[19]


In [28]:
#check data type
print(df.dtypes)


Unnamed: 0            int64
Entity               object
Year                 object
Records              object
Organization type    object
Method               object
Sources              object
dtype: object


In [29]:
#Data cleaning 
df = df.dropna() #handle missing value 
df = df.drop_duplicates() #remove duplicates

In [30]:
# Clean the 'Year' column by extracting the numeric values
df['Year'] = df['Year'].str.extract('(\d+)', expand=False)
df['Year'] = pd.to_numeric(df['Year'])



In [31]:
# Count the frequency of breached organization types
organization_counts = df['Organization type'].value_counts()
organization_counts.head(20)


web               53
healthcare        47
financial         38
government        29
retail            27
tech              19
academic          13
telecoms          12
gaming            11
social network     8
hotel              8
transport          7
military           7
energy             4
media              3
restaurant         3
mobile carrier     2
social media       2
tech, retail       2
telecom            2
Name: Organization type, dtype: int64

In [32]:
# Count the frequency of breach methods
method_counts = df['Method'].value_counts()
print(method_counts)

hacked                                                  190
poor security                                            43
lost / stolen media                                      33
accidentally published                                   20
inside job                                               19
lost / stolen computer                                   16
unknown                                                   7
improper setting, hacked                                  2
poor security/inside job                                  2
intentionally lost                                        1
accidentally exposed                                      1
publicly accessible Amazon Web Services (AWS) server      1
hacked/misconfiguration                                   1
rogue contractor                                          1
ransomware hacked                                         1
unprotected api                                           1
zero-day vulnerabilities                

In [33]:
# Group the data by year and count the number of attacks for each year
attack_counts_by_year = df.groupby('Year').size()

# Sort the attack counts in descending order
sorted_attack_counts = attack_counts_by_year.sort_values(ascending=False)

# Print the number of attacks for each year in descending order
print("Number of Attacks by Year (Descending Order):")
print(sorted_attack_counts)




Number of Attacks by Year (Descending Order):
Year
2011    34
2020    31
2019    30
2015    28
2013    28
2014    26
2018    25
2012    23
2016    20
2010    19
2008    16
2009    13
2021    13
2007    12
2017     9
2006     7
2005     6
2022     5
2004     2
dtype: int64


In [34]:
# Group the data by year and method, and count the number of attacks for each combination
attack_counts_by_year_method = df.groupby(['Year', 'Method']).size()

# Reset the index to transform the grouped data into a DataFrame
attack_counts_by_year_method = attack_counts_by_year_method.reset_index(name='Attack Count')

# Sort the data by the highest year attack counts in descending order
sorted_attack_counts = attack_counts_by_year_method.sort_values(by='Attack Count', ascending=False)

# Print the methods used for attacks, sorted by the highest year attack counts
print("Methods Used for Attacks, Sorted by Highest Year Attack:")
print(sorted_attack_counts)


Methods Used for Attacks, Sorted by Highest Year Attack:
    Year                  Method  Attack Count
49  2015                  hacked            24
45  2014                  hacked            22
40  2013                  hacked            21
61  2018                  hacked            18
29  2011                  hacked            17
..   ...                     ...           ...
52  2015      social engineering             1
55  2016  lost / stolen computer             1
56  2016           poor security             1
57  2016  poor security / hacked             1
93  2022                 unknown             1

[94 rows x 3 columns]


In [35]:
# 4. Identify the top breached organizations

# Count the frequency of breached organizations
top_breached_organizations = df['Organization type'].value_counts().head(10)

# Display the top breached organizations
print("\nTop Breached Organizations:")
print(top_breached_organizations)


Top Breached Organizations:
web               53
healthcare        47
financial         38
government        29
retail            27
tech              19
academic          13
telecoms          12
gaming            11
social network     8
Name: Organization type, dtype: int64


In [36]:
# Group the data by year and calculate the count of breaches for each year
breaches_by_year = df.groupby('Year').size()

# Print the total number of breaches for each year
print("Total number of breaches by year:")
print(breaches_by_year)


Total number of breaches by year:
Year
2004     2
2005     6
2006     7
2007    12
2008    16
2009    13
2010    19
2011    34
2012    23
2013    28
2014    26
2015    28
2016    20
2017     9
2018    25
2019    30
2020    31
2021    13
2022     5
dtype: int64


In [43]:
# Group the data by date, method, and organization type, and count the number of breaches for each combination
grouped_data = df.groupby(['Year', 'Method', 'Organization type']).size().reset_index(name='Count')

# Get the top 5 breach methods
top_methods = grouped_data['Method'].value_counts().head(5).index.tolist()

# Get the top 10 organization types
top_organization_types = grouped_data['Organization type'].value_counts().head(10).index.tolist()

# Filter the data based on the top methods and organization types
filtered_data = grouped_data[grouped_data['Method'].isin(top_methods) & grouped_data['Organization type'].isin(top_organization_types)]

# Sort the filtered data by the count of breaches in descending order
sorted_data = filtered_data.sort_values(by='Count', ascending=False)

# Retrieve the top 5 dates with the highest count of breaches
top_dates = sorted_data['Year'].head(5).tolist()

# Print the top 5 dates
print("Top 5 years of Highest Recorded Breaches:")
for date in top_dates:
    print(date)


Top 5 years of Highest Recorded Breaches:
2013
2011
2015
2015
2014
