In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import warnings
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"

In [None]:
# Load the breach records from the CSV stored next to the notebook folder.
data_path = '../data/data_breaches.csv'
data_breaches = pd.read_csv(data_path)
data_breaches.head()

In [None]:
# Print the frame summary (columns, dtypes, non-null counts).
data_breaches.info()

In [None]:
# Number of missing values in each column.
data_breaches.isna().sum()

In [None]:
# Pairwise correlation between the numeric columns only.
data_breaches.corr(numeric_only=True)

In [None]:
# Annotated heatmap of the numeric-column correlations.
correlations = data_breaches.corr(numeric_only=True)
sns.heatmap(correlations, annot=True, cmap="coolwarm")


In [None]:
# Scatter of data sensitivity against the number of records lost.
data_breaches.plot.scatter(x='data sensitivity', y='records_lost', figsize=(12, 8))
plt.title('Data Sensitivity vs. Records Lost');

# Let's Explore The Records Lost Data

In [None]:
# Total of the records_lost column.
data_breaches.records_lost.sum()

In [None]:
# Arithmetic mean of the records_lost column.
data_breaches.records_lost.mean()

In [None]:
# Median of the records_lost column.
data_breaches.records_lost.median()

In [None]:
# Largest value in the records_lost column.
data_breaches.records_lost.max()

In [None]:
# Full row of the breach with the most records lost.
data_breaches.nlargest(n=1, columns='records_lost')

In [None]:
# Smallest value in the records_lost column.
data_breaches.records_lost.min()

In [None]:
# Full row of the breach with the fewest records lost.
data_breaches.nsmallest(n=1, columns='records_lost')

In [None]:
# Range of records_lost: maximum minus minimum.
records_lost = data_breaches['records_lost']
records_lost.max() - records_lost.min()

Variance and Standard Deviation (Records Lost)

In [None]:
# Deviation of each breach from the mean number of records lost.
mean_records_lost = data_breaches['records_lost'].mean()
data_breaches['rl_deviation'] = data_breaches['records_lost'] - mean_records_lost
data_breaches.head()

In [None]:
# Standard deviation with pandas' default ddof (sample estimate).
data_breaches['records_lost'].std()

In [None]:
# Mean of the deviations from the mean.
data_breaches.rl_deviation.mean()

In [None]:
# Squared deviation from the mean, the building block of the variance.
data_breaches['squared_rl_deviation'] = data_breaches['rl_deviation']**2
# Preview the first rows only rather than rendering the whole frame.
data_breaches.head()

Population Standard Deviation (Records Lost)

In [None]:
# Population standard deviation: square root of the mean squared deviation.
mean_squared_deviation = data_breaches['squared_rl_deviation'].mean()
np.sqrt(mean_squared_deviation)

In [None]:
# Population variance (ddof=0) of records_lost.
data_breaches['records_lost'].var(ddof=0)

In [None]:
# Population standard deviation (ddof=0) of records_lost.
data_breaches['records_lost'].std(ddof=0)

z-scores (Records Lost)

In [None]:
# z-score: deviation from the mean in units of the population standard deviation.
records_lost = data_breaches['records_lost']
data_breaches['rl_z-score'] = (records_lost - records_lost.mean()) / records_lost.std(ddof=0)


In [None]:
# Sanity check: the z-scores were scaled with the population standard deviation
# (ddof=0), so the same estimator must be used here for the result to be 1.
data_breaches['rl_z-score'].std(ddof = 0)


Quartiles and Quantiles/Percentiles (Records Lost)

In [None]:
# First quartile (25th percentile) of records_lost.
data_breaches['records_lost'].quantile(0.25)

In [None]:
# Second quartile (50th percentile) of records_lost.
data_breaches['records_lost'].quantile(0.5)

In [None]:
# Third quartile (75th percentile) of records_lost.
data_breaches['records_lost'].quantile(0.75)

In [None]:
# Summary statistics for records_lost in one table.
data_breaches['records_lost'].describe()

Interquartile Range (Records Lost)

In [None]:
# Interquartile range: third quartile minus first quartile.
upper_quartile = data_breaches['records_lost'].quantile(0.75)
lower_quartile = data_breaches['records_lost'].quantile(0.25)
upper_quartile - lower_quartile

Visualization of Records Lost

In [None]:
# Histogram of records lost, drawn on an explicitly created axes.
fig, ax = plt.subplots(figsize=(10, 6))

ax.hist(
    x='records_lost',
    data=data_breaches,
    edgecolor='black',
    linewidth=2,
)
ax.set_xlabel('Records Lost')
ax.set_ylabel('Numbers of Companies')
ax.set_title('Histogram of Records Lost Due To Data Breach');

# Organizations That Lost The Most Records

In [None]:
# Total records lost per organization, largest first (top ten shown).
# A groupby on its own is lazy and only shows an opaque object repr,
# so the groups are aggregated before display.
(
    data_breaches
    .groupby('organization')['records_lost']
    .sum()
    .sort_values(ascending=False)
    .head(10)
)

In [None]:
# Funnel of the ten organizations with the most records lost.
# The values are derived from data_breaches instead of hard-coded labels,
# so the chart always reflects the loaded dataset.
top_organizations = (
    data_breaches
    .groupby('organization')['records_lost']
    .sum()
    .nlargest(10)
)

fig = go.Figure(go.Funnel(
    y = top_organizations.index.to_list(),
    x = top_organizations.to_list()))
fig.update_layout(title='Organizations With The Most Records Lost')

fig.show()

In [None]:
# Line chart of the ten organizations with the most records lost.
# The plot works on an aggregate so that data_breaches itself is never
# reassigned; later cells keep reading the original frame.
records_by_organization = (
    data_breaches
    .groupby('organization', as_index=False)['records_lost']
    .sum()
    .nlargest(10, 'records_lost')
)
fig = px.line(records_by_organization, x="organization", y="records_lost", title='Organization With The Most Records Lost')
fig.show()

# Exploring Sector Data

In [None]:
# Number of breach rows per sector, as a two-column frame.
sector_counts = data_breaches['sector'].value_counts()
sector = pd.DataFrame(sector_counts.reset_index())
sector

In [None]:
# Give the two columns presentation-friendly names (assigned by position).
sector = sector.set_axis(['Sector', 'Companies Hacked'], axis='columns')
sector.head(3)

In [None]:
# Keep only the rows whose sector is one of the ten names listed here.
top_sector_names = ['web', 'government','health','finance','retail','telcoms','app','tech','transport','gaming']
top10_sectors = sector.loc[sector['Sector'].isin(top_sector_names)]

In [None]:
# Index by sector and order from most to fewest companies hacked.
top10_sectors = (
    top10_sectors
    .groupby(['Sector'])
    .sum()
    .sort_values(by=['Companies Hacked'], ascending=False)
)
top10_sectors

In [None]:
# Bar chart of the first ten rows / first column of the sector table.
px.bar(
    top10_sectors.iloc[:10, :1],
    text_auto=True,
    title='Top 10 Hacked Sectors',
)

Total Records Lost By Sector

In [None]:
# Treemap with one tile per sector, sized by records_lost.
px.treemap(
    data_breaches,
    path=['sector'],
    values='records_lost',
)

Types of data breaches by sector

In [None]:
# Count of rows for every (sector, method) combination.
sector_method_counts = data_breaches[['sector', 'method']].value_counts()
breach_type = pd.DataFrame(sector_method_counts.reset_index())


In [None]:
# Rename the three columns by position for display.
breach_type = breach_type.set_axis(['Sector', 'Method', 'Breaches'], axis='columns')
breach_type.head(3)

In [None]:
# Sunburst: sectors on the inner ring, methods on the outer ring,
# sized and coloured by the number of breaches.
nested_pie = breach_type.sort_values(by='Breaches', ascending=False).iloc[:]

fig = px.sunburst(
    nested_pie,
    path=['Sector', 'Method'],
    values='Breaches',
    color='Breaches',
    color_continuous_scale='rdbu',
    template="plotly_white",
)
fig.update_layout(height=650, title="Data Breaches By Sector", title_x=0.47)
fig.show()

# Exploring The Sources That Broke The Stories

In [None]:
# Number of breach rows per reporting source.
source_counts = data_breaches['source name'].value_counts()
source = pd.DataFrame(source_counts.reset_index())
source.head()

In [None]:
# Rename the two columns by position for display.
source = source.set_axis(['Source', 'Stories Broken'], axis='columns')
source.head(3)

In [None]:
# Keep only the rows whose source is one of the names listed here.
# NOTE(review): 'Tech Crunch' appears twice, so only nine distinct names are
# matched; confirm which tenth source was intended.
top_source_names = ['ZDNet', 'The Register','The Guardian','BBC News','Guardian','Bleeping Computer','Tech Crunch','Reuters','Tech Crunch','Data Breaches']
top10_sources = source.loc[source['Source'].isin(top_source_names)]

In [None]:
# Index by source and order from most to fewest stories.
top10_sources = (
    top10_sources
    .groupby(['Source'])
    .sum()
    .sort_values(by=['Stories Broken'], ascending=False)
)
top10_sources

In [None]:
# Bar chart of the first ten rows / first column of the source table.
px.bar(
    top10_sources.iloc[:10, :1],
    text_auto=True,
    title='Top 10 Sources For Data Breach Stories',
)

In [None]:
# Area chart of records lost per year, coloured by method with one line per sector.
# data_breaches must not be reassigned here: every later cell reads it, and the
# plotly sample dataset has none of the columns this chart asks for.
records_by_year = (
    data_breaches
    # Derive the year from 'date'; works whether the column holds strings or datetimes.
    .assign(year=pd.to_datetime(data_breaches['date'], errors='coerce').dt.year)
    # Rows with an unparseable date have no year and cannot be placed on the x axis.
    .dropna(subset=['year'])
    .groupby(['year', 'method', 'sector'], as_index=False)['records_lost']
    .sum()
    .sort_values('year')
)
fig = px.area(records_by_year, x="year", y="records_lost", color="method", line_group="sector")
fig.show()

# Exploring Data Sensitivity

Data Sensitivity By Sector

In [None]:
# Share of each sector within every data-sensitivity level (rows sum to 1).
# Both series come from data_breaches so the cross-tabulation is row-aligned.
pd.crosstab(data_breaches['data sensitivity'],
            data_breaches['sector'],
            normalize='index').plot(kind = 'bar',
                                    edgecolor = 'black',
                                    width = 0.75,
                                    stacked = True)
plt.ylabel('Proportion');

Data Sensitivity By Method

In [None]:
# Share of each breach method within every data-sensitivity level (rows sum to 1).
# Tabulates 'method' to match this section's heading.
pd.crosstab(data_breaches['data sensitivity'],
            data_breaches['method'],
            normalize='index').plot(kind = 'bar',
                                    edgecolor = 'black',
                                    width = 0.75,
                                    stacked = True)
plt.ylabel('Proportion');

# Data Breaches By Date

What Month Had The Most Data Breaches?

In [None]:
# Parse the date column in place (unparseable values become NaT) and keep the month number
data_breaches['date'] = pd.to_datetime(data_breaches['date'], errors='coerce')
data_breaches['month'] = data_breaches['date'].dt.month

# Breach count per calendar month, ordered January to December
breach_month = (
    data_breaches
    .groupby('month')
    .agg({'organization': 'count'})
    .reset_index()
    .sort_values('month')
)

# Bar chart with month abbreviations on the x axis
month_labels = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
plt.figure(figsize=(10,6))
plt.bar(breach_month['month'], breach_month['organization'], color='lightblue')
plt.xlabel('month')
plt.ylabel('Number of Data Breaches')
plt.title('Number of Data Breaches by month')
plt.xticks(ticks=range(1,13), labels=month_labels)
plt.show()

What Day of the Week Had the Most Breaches?

In [None]:
# Weekday number of each breach date (0 = Monday, 6 = Sunday)
data_breaches['Day of Week'] = data_breaches['date'].dt.dayofweek

# Breach count per weekday, ordered Monday to Sunday
data_day_of_week = (
    data_breaches
    .groupby('Day of Week')
    .agg({'organization': 'count'})
    .reset_index()
    .sort_values('Day of Week')
)

# Bar chart with weekday abbreviations on the x axis
weekday_labels = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
plt.figure(figsize=(10,6))
plt.bar(data_day_of_week['Day of Week'], data_day_of_week['organization'], color='lightgreen')
plt.xlabel('Day of the Week')
plt.ylabel('Number of Data Breaches')
plt.title('Number of Data Breaches by Day of the Week')
plt.xticks(ticks=range(0,7), labels=weekday_labels)
plt.show()

# Top 3 Companies With The Most Records Lost By Year

In [None]:
# Work on a copy so the year/month/day helper columns stay out of data_breaches.
breaches = data_breaches.copy()

# Parse with to_datetime rather than splitting strings: the month cell converts
# 'date' to datetime64 in place, and the .str accessor fails on that dtype.
# to_datetime accepts both the raw strings and already-parsed values.
breach_dates = pd.to_datetime(breaches['date'], errors='coerce')

# Rows whose date cannot be parsed have no year/month/day; drop them so the
# integer casts below cannot fail on missing values.
has_date = breach_dates.notna()
breaches = breaches[has_date].copy()
breach_dates = breach_dates[has_date]

breaches['year'] = breach_dates.dt.year.astype(int)
breaches['month'] = breach_dates.dt.month.astype(int)
breaches['day'] = breach_dates.dt.day.astype(int)


In [None]:
# Preview the first three rows of the enriched copy.
breaches.head(n=3)

In [None]:
# Optional export of the enriched frame to Excel. Off by default so that
# running the whole notebook does not write files; set the flag to True to export.
EXPORT_TO_EXCEL = False

if EXPORT_TO_EXCEL:
    # The context manager saves and closes the workbook even if to_excel raises.
    with pd.ExcelWriter('data_breaches.xlsx') as xlwriter:
        breaches.to_excel(xlwriter, sheet_name='data_breaches')

In [None]:
# Largest single breach in each year, years kept in order of first appearance.
breaches.groupby("year", sort=False).records_lost.max()

In [None]:
# Categorical plot of records lost per year.
# A dedicated name is used so the plot grid is not confused with the groupby
# object that is later stored under `g`.
records_by_year_plot = sns.catplot(x='year', y='records_lost', data=breaches)
# `figure` is the supported accessor; `fig` is a deprecated alias for it.
records_by_year_plot.figure.set_size_inches(10, 6)

In [None]:
# Largest records_lost value for every (year, organization) pair.
# The aggregation is named as a string: handing the builtin `max` to agg is
# deprecated in recent pandas releases and emits a FutureWarning. The resulting
# column is still called "max".
top_3_year_wise = breaches.groupby(['year','organization']).records_lost.agg(['max'])
top_3_year_wise

In [None]:
# Within each year keep the three largest per-organization maxima.
g = top_3_year_wise["max"].groupby(['year'], group_keys=False)


def _three_largest(values):
    """Return the three largest entries of ``values`` in descending order."""
    return values.sort_values(ascending=False).head(3)


top_3_year_wise2 = g.apply(_three_largest)
top_3_year_wise2

In [None]:
# Same selection using the groupby's built-in nlargest.
g.nlargest(n=3)


In [None]:
# Inspect the index labels and the raw values of the top-3 series separately.
top_3_year_wise2.index
top_3_year_wise2.values

In [None]:
# Start a flat frame from the raw values of the top-3 series.
top_3_year_wise3 = pd.DataFrame({"records_lost": top_3_year_wise2.values})
top_3_year_wise3

In [None]:
# Relabel the single column (by position) for display.
top_3_year_wise3 = top_3_year_wise3.set_axis(['Records Lost'], axis='columns')

In [None]:
# Split the (year, organization) index pairs into two plain columns.
index_pairs = list(top_3_year_wise2.index)
top_3_year_wise3["Year"] = [pair[0] for pair in index_pairs]
top_3_year_wise3["Organization"] = [pair[1] for pair in index_pairs]
top_3_year_wise3

In [None]:
# Stacked bar per year, one coloured segment per organization.
px.bar(
    top_3_year_wise3,
    x='Year',
    y='Records Lost',
    color='Organization',
    title='Top 3 Companies with the most Records Lost Year-wise',
    text_auto=True,
)


# Records Lost Per Year By Sector

In [None]:
# Total records lost for every (year, sector) pair.
records_lost_year_sector_wise = breaches.groupby(["year", "sector"])["records_lost"].sum()
records_lost_year_sector_wise.head()

In [None]:
# Number of distinct values in the sector column.
len(pd.unique(breaches['sector']))

In [None]:
# Split the (year, sector) index pairs into two parallel lists.
records_lost_year_sector_wise_year = [
    pair[0] for pair in records_lost_year_sector_wise.index
]
records_lost_year_sector_wise_sector = [
    pair[1] for pair in records_lost_year_sector_wise.index
]

In [None]:
# Flat frame with one row per (year, sector) total.
sector_year_columns = {
    "Year": records_lost_year_sector_wise_year,
    "Sector": records_lost_year_sector_wise_sector,
    "records_lost": records_lost_year_sector_wise.values,
}
records_lost_year_sector_wise2 = pd.DataFrame(sector_year_columns)

In [None]:
# Preview the first five rows.
records_lost_year_sector_wise2.head(n=5)

In [None]:
# Relabel the three columns (by position) for display.
records_lost_year_sector_wise2 = records_lost_year_sector_wise2.set_axis(
    ['Year', 'Sector', 'Records Lost'], axis='columns'
)

In [None]:
# Order by year, then by records lost, both descending.
records_lost_year_sector_wise2 = records_lost_year_sector_wise2.sort_values(
    by=["Year", "Records Lost"], ascending=False
)


In [None]:
# Stacked bar per year, one labelled segment per sector.
px.bar(
    records_lost_year_sector_wise2,
    x='Year',
    y='Records Lost',
    color='Sector',
    text='Sector',
    title='Records Lost By Sector Year-wise',
)

# Records Lost Per Year By Method

In [None]:
# Total records lost for every (year, method) pair.
records_lost_year_method_wise = breaches.groupby(["year", "method"])["records_lost"].sum()
records_lost_year_method_wise.head()

In [None]:
# Number of distinct values in the method column.
len(pd.unique(breaches['method']))

In [None]:
# Split the (year, method) index pairs into two parallel lists.
records_lost_year_method_wise_year = [
    pair[0] for pair in records_lost_year_method_wise.index
]
records_lost_year_method_wise_method = [
    pair[1] for pair in records_lost_year_method_wise.index
]

In [None]:
# Flat frame with one row per (year, method) total.
method_year_columns = {
    "Year": records_lost_year_method_wise_year,
    "Method": records_lost_year_method_wise_method,
    "records_lost": records_lost_year_method_wise.values,
}
records_lost_year_method_wise2 = pd.DataFrame(method_year_columns)

In [None]:
# Preview the first five rows.
records_lost_year_method_wise2.head(n=5)

In [None]:
# Relabel the three columns (by position) for display.
records_lost_year_method_wise2 = records_lost_year_method_wise2.set_axis(
    ['Year', 'Method', 'Records Lost'], axis='columns'
)

In [None]:
# Order by year, then by records lost, both descending.
records_lost_year_method_wise2 = records_lost_year_method_wise2.sort_values(
    by=["Year", "Records Lost"], ascending=False
)

In [None]:
# Stacked bar per year, one labelled segment per method.
px.bar(
    records_lost_year_method_wise2,
    x='Year',
    y='Records Lost',
    color='Method',
    text='Method',
    title='Records Lost By Method Year-wise',
)

# What Are The Most Common Words In The Data Breach Stories?

In [None]:
# Join every non-missing story into one string and render it as a word cloud.
stories = data_breaches['story'].dropna()
text = " ".join(stories.to_list())

wordcloud = WordCloud(
    max_font_size=50,
    max_words=100,
    background_color="white",
    regexp=r'\S+',
).generate(text)

plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()

# Turn Notebook Into A Voilà Dashboard

In [None]:
#!voila "data_breaches.ipynb"