In [1]:
import pandas as pd 
from sklearn.model_selection import train_test_split


* Additional dataset from [Data.World](https://data.world/health/health-data-breaches/workspace/file?filename=breach_report.csv)

In [2]:
df = pd.read_csv('breach_report2.csv', encoding='latin-1')

In [3]:
df.isnull().sum()

Name of Covered Entity                0
State                                11
Covered Entity Type                  42
Individuals Affected                 23
Breach Submission Date                0
Type of Breach                       14
Location of Breached Information     11
Business Associate Present            0
Web Description                     398
dtype: int64

In [4]:
df.columns = df.columns.str.lower()

df.columns = df.columns.str.replace(" ", "_")
# dropped Web Descs column
df = df.drop(columns=["web_description"])

In [5]:
df.isnull().sum()

name_of_covered_entity               0
state                               11
covered_entity_type                 42
individuals_affected                23
breach_submission_date               0
type_of_breach                      14
location_of_breached_information    11
business_associate_present           0
dtype: int64

In [6]:
 # fix nans for state column
df.state = df.state.fillna('PR')

In [7]:
df = df.dropna(subset=['covered_entity_type', 'breach_submission_date', 'type_of_breach', 'individuals_affected'])
    # fix nans for state column
df.state = df.state.fillna('PR')

In [8]:
df.isnull().sum()

name_of_covered_entity              0
state                               0
covered_entity_type                 0
individuals_affected                0
breach_submission_date              0
type_of_breach                      0
location_of_breached_information    0
business_associate_present          0
dtype: int64

In [9]:
# dates are obj will change to date 
df.breach_submission_date = df.breach_submission_date.astype('datetime64')

# add month columns
df["month"] = df.breach_submission_date.dt.strftime("%m")

# Check if commas exist in the 'Column1' and create a new column 'Has_Comma'
df['multi_breached_location'] = df['location_of_breached_information'].str.contains(',')

# Convert boolean values to 1 or 0
df['multi_breached_location'] = df['multi_breached_location'].astype(int)

In [10]:
def get_season(month):
    if month in [12, 1, 2]:
        return 'Winter'
    elif month in [3, 4, 5]:
        return 'Spring'
    elif month in [6, 7, 8]:
        return 'Summer'
    elif month in [9, 10, 11]:
        return 'Autumn'
    else:
        return 'Unknown'
    
# change month to int
df[["month"]] = df[["month"]].astype("int")

# add season column
df['season'] = df['month'].apply(get_season)

# rename column
df["breach"] = df.type_of_breach
df["breach_type"] = df.type_of_breach
df = df.drop(columns=["type_of_breach", "month"])

# create dummies
dummy_df = pd.get_dummies(df[["season", "business_associate_present", "breach"]])
                        # drop_first=True)
df = pd.concat([df, dummy_df], axis=1)


# rename columns
df = df.rename(columns={"name_of_covered_entity": "entity_name", "covered_entity_type": "entity_type", "individuals_affected": "number_affected", "breach_submission_date": "date", "location_of_breached_information": "location", "season_Spring": "spring", "season_Summer": "summer", "season_Winter": "winter", "breach_Hacking/IT Incident": "hacking_or_it_incident", "breach_Improper Disposal": "improper_disposal", "breach_Theft": "theft", "breach_Loss": "loss", "breach_Unauthorized Access/Disclosure": "unauthorized_access_or_disclosure", "business_associate_present_Yes": "business_associate"})
#     df.columns = df.columns.str.lower()
# drop entity name
df = df.drop(columns=['entity_name', 'breach'])

# fix nans for state column
df.state = df.state.fillna('PR')

# df for modeling

# Display the modified DataFrame
df = pd.DataFrame(df)


In [11]:
df = df.rename(columns={"name_of_covered_entity": "entity_name", "covered_entity_type": "entity_type", "individuals_affected": "number_affected", "breach_submission_date": "date", "location_of_breached_information": "location", "season_Spring": "spring", "season_Autumn": "autumn", "season_Summer": "summer", "season_Winter": "winter", "breach_Hacking/IT Incident": "hacking_or_it_incident", "breach_Improper Disposal": "improper_disposal", "breach_Theft": "theft", "breach_Loss": "loss", "breach_Unauthorized Access/Disclosure": "unauthorized_access_or_disclosure", "business_associate_present_Yes": "business_associate"})

In [12]:
df

Unnamed: 0,state,entity_type,number_affected,date,location,business_associate_present,multi_breached_location,season,breach_type,autumn,...,"breach_Loss, Unknown",breach_Other,"breach_Other, Theft","breach_Other, Theft, Unauthorized Access/Disclosure","breach_Other, Unauthorized Access/Disclosure","breach_Other, Unknown",theft,"breach_Theft, Unauthorized Access/Disclosure",unauthorized_access_or_disclosure,breach_Unknown
0,TX,Healthcare Provider,1000.0,2009-10-21,Paper/Films,No,0,Autumn,Theft,1,...,0,0,0,0,0,0,1,0,0,0
1,MO,Healthcare Provider,1000.0,2009-10-28,Network Server,No,0,Autumn,Theft,1,...,0,0,0,0,0,0,1,0,0,0
2,AK,Healthcare Provider,501.0,2009-10-30,"Other, Other Portable Electronic Device",No,1,Autumn,Theft,1,...,0,0,0,0,0,0,1,0,0,0
3,DC,Health Plan,3800.0,2009-11-17,Laptop,No,0,Autumn,Loss,1,...,0,0,0,0,0,0,0,0,0,0
4,CA,Healthcare Provider,5166.0,2009-11-20,Desktop Computer,No,0,Autumn,Theft,1,...,0,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1695,IL,Healthcare Provider,540.0,2016-10-07,Other Portable Electronic Device,No,0,Autumn,Unauthorized Access/Disclosure,1,...,0,0,0,0,0,0,0,0,1,0
1696,OK,Healthcare Provider,2938.0,2016-10-07,Network Server,No,0,Autumn,Hacking/IT Incident,1,...,0,0,0,0,0,0,0,0,0,0
1697,IL,Healthcare Provider,992.0,2016-10-10,Email,No,0,Autumn,Unauthorized Access/Disclosure,1,...,0,0,0,0,0,0,0,0,1,0
1698,IN,Healthcare Provider,1466.0,2016-10-14,"Email, Network Server",No,1,Autumn,Hacking/IT Incident,1,...,0,0,0,0,0,0,0,0,0,0


In [13]:
def split_data(df, target_variable="breach_type"):
    '''
    Takes in two arguments the dataframe name and the ("target_variable" - must be in string format) to stratify  and 
    return train, validate, test subset dataframes will output train, validate, and test in that order.
    '''
    train, test = train_test_split(df, #first split
                                   test_size=.2, 
                                   random_state=123, 
                                   stratify= df[target_variable])
    train, validate = train_test_split(train, #second split
                                    test_size=.25, 
                                    random_state=123, 
                                    stratify=train[target_variable])
    return train, validate, test

In [16]:
df.breach_type.value_counts()

Theft                                                         706
Unauthorized Access/Disclosure                                383
Hacking/IT Incident                                           215
Loss                                                          119
Other                                                          77
Improper Disposal                                              50
Theft, Unauthorized Access/Disclosure                          24
Loss, Theft                                                    15
Unknown                                                         9
Hacking/IT Incident, Unauthorized Access/Disclosure             8
Other, Unauthorized Access/Disclosure                           7
Improper Disposal, Loss, Theft                                  3
Other, Theft                                                    3
Improper Disposal, Loss                                         3
Loss, Unauthorized Access/Disclosure                            3
Loss, Othe

In [21]:
# Define the list of desired breach types
desired_types = ['Theft', 'Unauthorized Access/Disclosure', 'Hacking/IT Incident', 'Loss', 'Improper Disposal']

# Drop the rows that do not have desired breach types
df = df[df['breach_type'].isin(desired_types)]


In [22]:
df.breach_type.value_counts()

Theft                             706
Unauthorized Access/Disclosure    383
Hacking/IT Incident               215
Loss                              119
Improper Disposal                  50
Name: breach_type, dtype: int64

In [23]:
# baseline accuracy
baseline_accuracy = (train.breach_type == "Theft").mean()
round(baseline_accuracy, 2)

NameError: name 'train' is not defined

In [24]:
df.shape

(1473, 43)

In [None]:
train.location

In [None]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

# sklearn imports
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.preprocessing import MultiLabelBinarizer

# tree classifier
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, plot_tree

# rainforest classifier
from sklearn.ensemble import RandomForestClassifier

# linear regession classifier
from sklearn.linear_model import LogisticRegression

# KNN classifier
from sklearn.neighbors import KNeighborsClassifier
import wrangle as w

def model_df(df):
       
    # Keep columns
    df = df[["state", "breach_type", "location", "multi_breached_location", "summer"]]
    
    # create dummies
    dummy_df = pd.get_dummies(df[["state", "location", "entity_type"]],
                            drop_first=True)
    df = pd.concat([df, dummy_df], axis=1)
    df = df.drop(columns=["state","location"])

    return df

def create_x_y(train, validate, test, target):
    """
    This function creates x and y variables for either a decision tree or a random forest, 
    by using the unsplit df, target variable columns name and column to drop, for multiple columns that need to be 
    dropped create a list of the columns0
    The arguments taken in are train, validate, test, target, drop_col=[])
    The function returns x_train, y_train, x_validate, y_validate, x_test, y_test
    """
    # separates train target variable
    x_train = train.drop(columns=[target])
    y_train = train[target]
    # validate 
    x_validate = validate.drop(columns=[target])
    y_validate = validate[target]

    # test
    x_test = test.drop(columns=[target])
    y_test = test[target]
    
    return x_train, y_train, x_validate, y_validate, x_test, y_test

# final test model

def best_model(x_train, y_train, x_validate, y_validate, x_test, y_test):
    '''
    ! WARNING!: Only use this for your final model 
    This function provides a quick print output of the baseling accuracy train, validation, test scores based on your classifier, for easy viewing.
    The function takes the following arguments: object name (clf), x_train, y_train, x_validate, y_validate, x_test, y_test
    '''
    rf = RandomForestClassifier(random_state=3, min_samples_leaf=1, max_depth=10)
    rf = rf.fit(x_train, y_train)
    # model.fit(x, y)
    print(f'''
    Accuracy of {rf} on training set: {round(rf.score(x_train, y_train), 2)}
    Accuracy of {rf} on validation set: {round(rf.score(x_validate, y_validate), 2)}
    Accuracy of {rf} on test set: {round(rf.score(x_test, y_test), 2)}
    ''')


In [None]:
df = model_df(df)

In [None]:
train, validate, test = split_data(df)

In [None]:
target = "breach_type"
x_train, y_train, x_validate, y_validate, x_test, y_test = create_x_y(train, validate, test, target)

In [None]:
x_train

In [None]:
best_model(x_train, y_train, x_validate, y_validate, x_test, y_test)