In [1]:
import pandas as pd
import numpy as np
import re

In [2]:
# Read the raw CSV from the working directory and preview its first five rows.
raw_data = pd.read_csv(filepath_or_buffer='data.csv')
raw_data.head(n=5)

Unnamed: 0,Index,Entity,Year,Records,Organization type,Method,Sources
0,0,21st Century Oncology,2016,2200000,healthcare,hacked,[5][6]
1,1,500px,2020,14870304,social networking,hacked,[7]
2,2,Accendo Insurance Co.,2020,175350,healthcare,poor security,[8][9]
3,3,Adobe Systems Incorporated,2013,152000000,tech,hacked,[10]
4,4,Adobe Inc.,2019,7500000,tech,poor security,[11][12]


In [3]:
def clean_methods(x):
    """Normalise a raw ``Method`` value to a lowercase label.

    Parameters
    ----------
    x : object
        Raw cell value. Anything that is not a string (for example the NaN
        pandas uses for a missing cell) is treated as missing.

    Returns
    -------
    str
        ``'unknown'`` for non-string input, ``'public aws server'`` for the
        long AWS server description, otherwise the stripped, lowercased text.
    """
    # isinstance (rather than an exact type comparison) also accepts str
    # subclasses such as numpy.str_, which would otherwise become 'unknown'.
    if not isinstance(x, str):
        return "unknown"
    label = x.strip().lower()
    # Compare after normalising so casing/whitespace variants of the AWS
    # description collapse to the same short label.
    if label == 'publicly accessible amazon web services (aws) server':
        return 'public aws server'
    return label

# Normalise every Method label, then build the working frame without the
# Index and Sources columns. raw_data itself is left untouched.
methods_cleaned = raw_data['Method'].apply(clean_methods)
data = raw_data.assign(Method=methods_cleaned).drop(columns=["Index", "Sources"])
data.head()

Unnamed: 0,Entity,Year,Records,Organization type,Method
0,21st Century Oncology,2016,2200000,healthcare,hacked
1,500px,2020,14870304,social networking,hacked
2,Accendo Insurance Co.,2020,175350,healthcare,poor security
3,Adobe Systems Incorporated,2013,152000000,tech,hacked
4,Adobe Inc.,2019,7500000,tech,poor security


In [4]:
# Distinct Method labels present after cleaning.
set(data['Method'].unique())

{'accidentally exposed',
 'accidentally published',
 'accidentally uploaded',
 'data exposed by misconfiguration',
 'hacked',
 'hacked/misconfiguration',
 'improper setting, hacked',
 'inside job',
 'inside job, hacked',
 'intentionally lost',
 'lost / stolen computer',
 'lost / stolen media',
 'misconfiguration/poor security',
 'poor security',
 'poor security / hacked',
 'poor security/inside job',
 'public aws server',
 'ransomware hacked',
 'rogue contractor',
 'social engineering',
 'unknown',
 'unprotected api',
 'unsecured s3 bucket',
 'zero-day vulnerabilities'}

In [5]:
# Note: This is not part of the data cleaning step, just me trying to figure out how 
# to derive the super categories

# The actual implementation of this is the createSuperCat function in app.js

# Super category -> the cleaned Method labels grouped under it.
method_categories = {
    'hacked': [
        'hacked',
        'hacked/misconfiguration',
        'improper setting, hacked',
        'poor security / hacked',
        'ransomware hacked',
        'social engineering',
        'zero-day vulnerabilities',
    ],
    'poor security': [
        'poor security',
        'misconfiguration/poor security',
        'unprotected api',
        'unsecured s3 bucket',
    ],
    'accidentally exposed': [
        'accidentally exposed',
        'accidentally published',
        'accidentally uploaded',
        'data exposed by misconfiguration',
        'public aws server',
    ],
    'stolen/lost': [
        'intentionally lost',
        'lost / stolen computer',
        'lost / stolen media',
    ],
    'rogue contractor/employee': [
        'inside job',
        'inside job, hacked',
        'poor security/inside job',
        'rogue contractor',
    ],
}


def get_method_cat(x):
    """Return the super category whose label list contains ``x``.

    Categories are checked in the insertion order of ``method_categories``
    and the first match wins. A label that appears in no list maps to
    ``'unknown'``.
    """
    matching = (
        category
        for category, labels in method_categories.items()
        if x in labels
    )
    return next(matching, 'unknown')

In [6]:
# Sanity check: the frame still holds only cleaned columns; no derived
# feature columns have been added.
data.head(n=5)

Unnamed: 0,Entity,Year,Records,Organization type,Method
0,21st Century Oncology,2016,2200000,healthcare,hacked
1,500px,2020,14870304,social networking,hacked
2,Accendo Insurance Co.,2020,175350,healthcare,poor security
3,Adobe Systems Incorporated,2013,152000000,tech,hacked
4,Adobe Inc.,2019,7500000,tech,poor security


In [7]:
def cleanYr(x):
    """Collapse a raw ``Year`` value into a single numeric year.

    Every run of four digits in ``x`` is read as a year and the mean of
    those years is returned, so ``'2014 and 2015'`` becomes ``2014.5``.

    Parameters
    ----------
    x : object
        Raw cell value: text containing one or more years, or an already
        numeric year.

    Returns
    -------
    float
        The mean year; NaN when ``x`` is missing or holds no four-digit year.
    """
    # Numeric input (a value pandas parsed as a number, or this function
    # being re-applied to already-cleaned data) passes through unchanged
    # instead of raising TypeError inside re.findall.
    if isinstance(x, (int, float, np.number)):
        return np.float64(x)
    if not isinstance(x, str):
        return np.nan
    years = [float(match) for match in re.findall(r'\d{4}', x)]
    # np.mean([]) emits a RuntimeWarning and yields NaN; return NaN directly.
    if not years:
        return np.nan
    return np.mean(years)

# Replace the raw Year column with the value cleanYr computes for each row,
# then list the distinct results.
cleaned_years = data['Year'].apply(cleanYr)
data['Year'] = cleaned_years
set(data['Year'])

{2004.0,
 2005.0,
 2006.0,
 2007.0,
 2008.0,
 2009.0,
 2010.0,
 2011.0,
 2012.0,
 2013.0,
 2014.0,
 2014.5,
 2015.0,
 2016.0,
 2017.0,
 2018.0,
 2018.5,
 2019.0,
 2019.5,
 2020.0,
 2021.0,
 2022.0}

In [8]:
# Write the cleaned frame to disk without the row index.
data.to_csv(path_or_buf='clean_data.csv', index=False)