In [1]:
import pandas as pd 
from sklearn.model_selection import train_test_split
import wrangle as w

* Outline your goals and define deliverables.

- the goal is to create a classification model that will be able to predict the breach type based on the feature provided/ created

- jupyter notebook 

* Document initial thoughts and hypotheses.

- My initials thoughts are to create additional columns from the time and location to create more direct features. My hypothesis is that the location and time will be able to predict the type of breach.

In [17]:
# retrieve previously downloaded csv
df = pd.read_csv('breach_report.csv') 

In [18]:
# look for nulls
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 867 entries, 0 to 866
Data columns (total 9 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   Name of Covered Entity            867 non-null    object 
 1   State                             864 non-null    object 
 2   Covered Entity Type               867 non-null    object 
 3   Individuals Affected              867 non-null    int64  
 4   Breach Submission Date            867 non-null    object 
 5   Type of Breach                    867 non-null    object 
 6   Location of Breached Information  867 non-null    object 
 7   Business Associate Present        867 non-null    object 
 8   Web Description                   0 non-null      float64
dtypes: float64(1), int64(1), object(7)
memory usage: 61.1+ KB


In [19]:
df.isnull().sum()

Name of Covered Entity                0
State                                 3
Covered Entity Type                   0
Individuals Affected                  0
Breach Submission Date                0
Type of Breach                        0
Location of Breached Information      0
Business Associate Present            0
Web Description                     867
dtype: int64

* three nulls are found in states 
* Web description will be dropped 

In [25]:
df.State

0      CA
1      IA
2      PA
3      MA
4      IA
       ..
862    WI
863    MI
864    OH
865    TX
866    MA
Name: State, Length: 867, dtype: object

In [26]:
df

Unnamed: 0,Name of Covered Entity,State,Covered Entity Type,Individuals Affected,Breach Submission Date,Type of Breach,Location of Breached Information,Business Associate Present,Web Description
0,"Brightline, Inc.",CA,Business Associate,8432,5/26/23,Hacking/IT Incident,Network Server,Yes,
1,Iowa Department of Health and Human Services,IA,Health Plan,833,5/26/23,Unauthorized Access/Disclosure,Paper/Films,Yes,
2,Lancaster Orthopedic Group,PA,Healthcare Provider,500,5/26/23,Hacking/IT Incident,Network Server,No,
3,Harvard Pilgrim Health Care,MA,Health Plan,2550922,5/24/23,Hacking/IT Incident,Network Server,No,
4,"UI Community Home Care, a subsidiary of Univer...",IA,Healthcare Provider,67897,5/24/23,Hacking/IT Incident,"Electronic Medical Record, Network Server",No,
...,...,...,...,...,...,...,...,...,...
862,Wisconsin Department of Health Services,WI,Health Plan,2868,6/4/21,Hacking/IT Incident,Email,No,
863,East Jordan Family Health Center,MI,Healthcare Provider,1151,6/4/21,Hacking/IT Incident,Network Server,Yes,
864,"AtriCure, Inc. Group Health Plan",OH,Health Plan,2487,6/4/21,Hacking/IT Incident,Email,No,
865,"Gastroenterology Consultants, PA",TX,Healthcare Provider,161698,3/19/21,Hacking/IT Incident,Network Server,No,


In [27]:
df.State = df.State.fillna('PR')

* I believe these nulls are for Puerto rico, which would explain why they do not have a state listed --  will leave in dataset 

In [28]:
df.State.nunique()

52

* all states are listed and additional PR
* Breach dates 

In [29]:
# adjust column names - lowercase and underscored
df.columns = df.columns.str.lower()
df.columns = df.columns.str.replace(" ", "_")

In [30]:
df.head()

Unnamed: 0,name_of_covered_entity,state,covered_entity_type,individuals_affected,breach_submission_date,type_of_breach,location_of_breached_information,business_associate_present,web_description
0,"Brightline, Inc.",CA,Business Associate,8432,5/26/23,Hacking/IT Incident,Network Server,Yes,
1,Iowa Department of Health and Human Services,IA,Health Plan,833,5/26/23,Unauthorized Access/Disclosure,Paper/Films,Yes,
2,Lancaster Orthopedic Group,PA,Healthcare Provider,500,5/26/23,Hacking/IT Incident,Network Server,No,
3,Harvard Pilgrim Health Care,MA,Health Plan,2550922,5/24/23,Hacking/IT Incident,Network Server,No,
4,"UI Community Home Care, a subsidiary of Univer...",IA,Healthcare Provider,67897,5/24/23,Hacking/IT Incident,"Electronic Medical Record, Network Server",No,


In [31]:
df["month"] = df.breach_submission_date.dt.strftime("%m")

AttributeError: Can only use .dt accessor with datetimelike values

In [32]:
df["year"] = df.breach_submission_date.dt.strftime("%Y")

AttributeError: Can only use .dt accessor with datetimelike values

In [None]:
df.year.unique()

In [None]:
df.month.nunique()

In [None]:
df[df.year == '2021'].count()

In [None]:
df[df.year == '2022'].count()

In [None]:
df[df.year == '2023'].count()

In [None]:
df.location_of_breached_information.unique()

In [None]:
# create columns for multiple locations -- encoded 


# Check if commas exist in the 'Column1' and create a new column 'Has_Comma'
df['multi_breached_location'] = df['location_of_breached_information'].str.contains(',')

# Convert boolean values to 1 or 0
df['multi_breached_location'] = df['multi_breached_location'].astype(int)

# Display the modified DataFrame
pd.DataFrame(df)

In [None]:
# will classify what type of attack based on other features 
df.type_of_breach.unique()

In [None]:
df[["month", "year"]] = df[["month", "year"]].astype("int")

In [None]:
# create dummies
dummy_df = pd.get_dummies(df[["season"]],
                          drop_first=True)
df = pd.concat([df, dummy_df], axis=1)
# # rename columns
# df = df[["customer_id","partner_Yes", "dependents_Yes", "paperless_billing_Yes", "automatic_payment", "tenure","churn_Yes"]]

df = df.rename(columns={"season_Spring": "spring", "season_Summer": "summer", "season_Winter": "winter"})
# # df for modeling

In [None]:
df

In [None]:
# create seasons 
def get_season(month):
    if month in [12, 1, 2]:
        return 'Winter'
    elif month in [3, 4, 5]:
        return 'Spring'
    elif month in [6, 7, 8]:
        return 'Summer'
    elif month in [9, 10, 11]:
        return 'Autumn'
    else:
        return 'Unknown'

In [None]:
# Apply the function to create a new 'Season' column
df['season'] = df['month'].apply(get_season)

In [None]:
df[df.season == "Unknown"]

In [None]:
target = "type_of_breach"

In [None]:
def split_data(df, target_variable="type_of_breach"):
    '''
    Takes in two arguments the dataframe name and the ("target_variable" - must be in string format) to stratify  and 
    return train, validate, test subset dataframes will output train, validate, and test in that order.
    '''
    train, test = train_test_split(df, #first split
                                   test_size=.2, 
                                   random_state=123, 
                                   stratify= df[target_variable])
    train, validate = train_test_split(train, #second split
                                    test_size=.25, 
                                    random_state=123, 
                                    stratify=train[target_variable])
    return train, validate, test

In [None]:
train, validate, test = split_data(df, target)

In [None]:
train

In [None]:
# clean df
df = w.clean_df()
# split data
train, validate, test = w.split_data(df)