# JAWS

Commit 1

In [1]:
import pandas as pd
pd.set_option('display.max_columns', None)
import numpy as np
import re
import random as ran

# 1. The Dataset

In [2]:
jaws = pd.read_csv('../data/jaws.csv', encoding='latin-1')
jaws.head()

Unnamed: 0,Case Number,Date,Year,Type,Country,Area,Location,Activity,Name,Sex,Age,Injury,Fatal (Y/N),Time,Species,Investigator or Source,pdf,href formula,href,Case Number.1,Case Number.2,original order,Unnamed: 22,Unnamed: 23
0,2018.06.25,25-Jun-2018,2018.0,Boating,USA,California,"Oceanside, San Diego County",Paddling,Julie Wolfe,F,57.0,"No injury to occupant, outrigger canoe and pad...",N,18h00,White shark,"R. Collier, GSAF",2018.06.25-Wolfe.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.25,2018.06.25,6303.0,,
1,2018.06.18,18-Jun-2018,2018.0,Unprovoked,USA,Georgia,"St. Simon Island, Glynn County",Standing,Adyson McNeely,F,11.0,Minor injury to left thigh,N,14h00 -15h00,,"K.McMurray, TrackingSharks.com",2018.06.18-McNeely.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.18,2018.06.18,6302.0,,
2,2018.06.09,09-Jun-2018,2018.0,Invalid,USA,Hawaii,"Habush, Oahu",Surfing,John Denges,M,48.0,Injury to left lower leg from surfboard skeg,N,07h45,,"K.McMurray, TrackingSharks.com",2018.06.09-Denges.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.09,2018.06.09,6301.0,,
3,2018.06.08,08-Jun-2018,2018.0,Unprovoked,AUSTRALIA,New South Wales,Arrawarra Headland,Surfing,male,M,,Minor injury to lower leg,N,,2 m shark,"B. Myatt, GSAF",2018.06.08-Arrawarra.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.08,2018.06.08,6300.0,,
4,2018.06.04,04-Jun-2018,2018.0,Provoked,MEXICO,Colima,La Ticla,Free diving,Gustavo Ramos,M,,Lacerations to leg & hand shark PROVOKED INCIDENT,N,,"Tiger shark, 3m",A .Kipper,2018.06.04-Ramos.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.04,2018.06.04,6299.0,,


In [3]:
jaws.shape

(25723, 24)

In [4]:
nan_cols = jaws.isna().sum()

nan_cols[nan_cols>0]

Case Number               17021
Date                      19421
Year                      19423
Type                      19425
Country                   19471
Area                      19876
Location                  19961
Activity                  19965
Name                      19631
Sex                       19986
Age                       22252
Injury                    19449
Fatal (Y/N)               19960
Time                      22775
Species                   22259
Investigator or Source    19438
pdf                       19421
href formula              19422
href                      19421
Case Number.1             19421
Case Number.2             19421
original order            19414
Unnamed: 22               25722
Unnamed: 23               25721
dtype: int64

# 2. Cleaning The Data

# 2.1 Duplicates and Objective Row

## 2.1.1 Duplicates and Objective row

In [5]:
duplicate_rows = jaws[jaws.duplicated()]

In [6]:
jaws = jaws.drop_duplicates()

In [7]:
jaws.shape

(6312, 24)

## 2.1.2 Change column names

In [8]:
old_colnames = jaws.columns
new_colnames = [name.lower() for name in old_colnames]

new_colnames = [i.replace(' ', '') for i in new_colnames]

jaws.rename(columns=dict(zip(old_colnames, new_colnames)), inplace=True)
jaws.rename(columns={'fatal(y/n)' : 'fatality'}, inplace=True)

jaws.columns

Index(['casenumber', 'date', 'year', 'type', 'country', 'area', 'location',
       'activity', 'name', 'sex', 'age', 'injury', 'fatality', 'time',
       'species', 'investigatororsource', 'pdf', 'hrefformula', 'href',
       'casenumber.1', 'casenumber.2', 'originalorder', 'unnamed:22',
       'unnamed:23'],
      dtype='object')

## 2.1.3 Clean objective column *fatality*



In [9]:
len(jaws.fatality)-jaws.fatality.isna().sum()

5763

In [10]:
jaws.fatality.info()

<class 'pandas.core.series.Series'>
Int64Index: 6312 entries, 0 to 25722
Series name: fatality
Non-Null Count  Dtype 
--------------  ----- 
5763 non-null   object
dtypes: object(1)
memory usage: 98.6+ KB


In [11]:
jaws.fatality.unique()

array(['N', 'Y', nan, 'M', 'UNKNOWN', '2017', ' N', 'N ', 'y'],
      dtype=object)

In [12]:
jaws.fatality.value_counts()

N          4293
Y          1388
UNKNOWN      71
 N            7
M             1
2017          1
N             1
y             1
Name: fatality, dtype: int64

In [13]:
jaws.dropna(subset=['fatality'], inplace=True)



In [14]:
def clean_fatality(x):
    '''
    Function for a dataset to find N and Y in an object column
    and return true or false respectively.
    It returns nothing if else to fill with Nan the values that
    dont match.
    '''
    x=str(x)
    pattern1 = r"[nN]"
    pattern2 = r"[yY]"

    if x == 'UNKNOWN':
        
        return np.nan
    
    elif re.findall(pattern1,x):
        
        return False
    
    elif re.findall(pattern2,x):
        
        return True
        
    
    

In [15]:
jaws.fatality = jaws.fatality.apply(clean_fatality)

jaws.head()

Unnamed: 0,casenumber,date,year,type,country,area,location,activity,name,sex,age,injury,fatality,time,species,investigatororsource,pdf,hrefformula,href,casenumber.1,casenumber.2,originalorder,unnamed:22,unnamed:23
0,2018.06.25,25-Jun-2018,2018.0,Boating,USA,California,"Oceanside, San Diego County",Paddling,Julie Wolfe,F,57.0,"No injury to occupant, outrigger canoe and pad...",False,18h00,White shark,"R. Collier, GSAF",2018.06.25-Wolfe.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.25,2018.06.25,6303.0,,
1,2018.06.18,18-Jun-2018,2018.0,Unprovoked,USA,Georgia,"St. Simon Island, Glynn County",Standing,Adyson McNeely,F,11.0,Minor injury to left thigh,False,14h00 -15h00,,"K.McMurray, TrackingSharks.com",2018.06.18-McNeely.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.18,2018.06.18,6302.0,,
2,2018.06.09,09-Jun-2018,2018.0,Invalid,USA,Hawaii,"Habush, Oahu",Surfing,John Denges,M,48.0,Injury to left lower leg from surfboard skeg,False,07h45,,"K.McMurray, TrackingSharks.com",2018.06.09-Denges.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.09,2018.06.09,6301.0,,
3,2018.06.08,08-Jun-2018,2018.0,Unprovoked,AUSTRALIA,New South Wales,Arrawarra Headland,Surfing,male,M,,Minor injury to lower leg,False,,2 m shark,"B. Myatt, GSAF",2018.06.08-Arrawarra.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.08,2018.06.08,6300.0,,
4,2018.06.04,04-Jun-2018,2018.0,Provoked,MEXICO,Colima,La Ticla,Free diving,Gustavo Ramos,M,,Lacerations to leg & hand shark PROVOKED INCIDENT,False,,"Tiger shark, 3m",A .Kipper,2018.06.04-Ramos.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.04,2018.06.04,6299.0,,


In [16]:
jaws.fatality.value_counts()

False    4301
True     1389
Name: fatality, dtype: int64

In [17]:
jaws.fatality.isna().sum()

73

In [18]:
jaws.shape

(5763, 24)

Commit 2, and 3 (formated)

# 2.2 Date, Case Num, Year... Columns

In [19]:
jaws.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5763 entries, 0 to 6301
Data columns (total 24 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   casenumber            5762 non-null   object 
 1   date                  5763 non-null   object 
 2   year                  5762 non-null   float64
 3   type                  5760 non-null   object 
 4   country               5718 non-null   object 
 5   area                  5359 non-null   object 
 6   location              5280 non-null   object 
 7   activity              5344 non-null   object 
 8   name                  5610 non-null   object 
 9   sex                   5276 non-null   object 
 10  age                   3244 non-null   object 
 11  injury                5742 non-null   object 
 12  fatality              5690 non-null   object 
 13  time                  2802 non-null   object 
 14  species               2948 non-null   object 
 15  investigatororsource 

## 2.2.1 Reformat casenumber

define a function to filter the datetime and keep only the format xxxx.xx.xx

In [20]:
def filter_date(x):

    value = str(x)
    pattern = r"\d{4}\.\d{2}\.\d{2}"

    match = re.search(pattern, value)
    if match:
        clean_date = match.group()
        return clean_date
    else:
        return np.nan

In [21]:
jaws.casenumber = jaws.casenumber.apply(filter_date)

jaws.head(50)

Unnamed: 0,casenumber,date,year,type,country,area,location,activity,name,sex,age,injury,fatality,time,species,investigatororsource,pdf,hrefformula,href,casenumber.1,casenumber.2,originalorder,unnamed:22,unnamed:23
0,2018.06.25,25-Jun-2018,2018.0,Boating,USA,California,"Oceanside, San Diego County",Paddling,Julie Wolfe,F,57,"No injury to occupant, outrigger canoe and pad...",False,18h00,White shark,"R. Collier, GSAF",2018.06.25-Wolfe.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.25,2018.06.25,6303.0,,
1,2018.06.18,18-Jun-2018,2018.0,Unprovoked,USA,Georgia,"St. Simon Island, Glynn County",Standing,Adyson McNeely,F,11,Minor injury to left thigh,False,14h00 -15h00,,"K.McMurray, TrackingSharks.com",2018.06.18-McNeely.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.18,2018.06.18,6302.0,,
2,2018.06.09,09-Jun-2018,2018.0,Invalid,USA,Hawaii,"Habush, Oahu",Surfing,John Denges,M,48,Injury to left lower leg from surfboard skeg,False,07h45,,"K.McMurray, TrackingSharks.com",2018.06.09-Denges.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.09,2018.06.09,6301.0,,
3,2018.06.08,08-Jun-2018,2018.0,Unprovoked,AUSTRALIA,New South Wales,Arrawarra Headland,Surfing,male,M,,Minor injury to lower leg,False,,2 m shark,"B. Myatt, GSAF",2018.06.08-Arrawarra.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.08,2018.06.08,6300.0,,
4,2018.06.04,04-Jun-2018,2018.0,Provoked,MEXICO,Colima,La Ticla,Free diving,Gustavo Ramos,M,,Lacerations to leg & hand shark PROVOKED INCIDENT,False,,"Tiger shark, 3m",A .Kipper,2018.06.04-Ramos.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.04,2018.06.04,6299.0,,
5,2018.06.03,03-Jun-2018,2018.0,Unprovoked,AUSTRALIA,New South Wales,"Flat Rock, Ballina",Kite surfing,Chris,M,,"No injury, board bitten",False,,,"Daily Telegraph, 6/4/2018",2018.06.03.b-FlatRock.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.03.b,2018.06.03.b,6298.0,,
6,2018.06.03,03-Jun-2018,2018.0,Unprovoked,BRAZIL,Pernambuco,"Piedade Beach, Recife",Swimming,Jose Ernesto da Silva,M,18,FATAL,True,Late afternoon,Tiger shark,"Diario de Pernambuco, 6/4/2018",2018.06.03.a-daSilva.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.03.a,2018.06.03.a,6297.0,,
7,2018.05.27,27-May-2018,2018.0,Unprovoked,USA,Florida,"Lighhouse Point Park, Ponce Inlet, Volusia County",Fishing,male,M,52,Minor injury to foot. PROVOKED INCIDENT,False,,"Lemon shark, 3'","K. McMurray, TrackingSharks.com",2018.05.27-Ponce.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.05.27,2018.05.27,6296.0,,
8,2018.05.26,26-May-2018,2018.0,Unprovoked,USA,Florida,"Cocoa Beach, Brevard County",Walking,Cody High,M,15,Lower left leg bitten,False,17h00,"Bull shark, 6'","K.McMurray, TrackingSharks.com",2018.05.26.b-High.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.05.26.b,2018.05.26.b,6295.0,,
9,2018.05.26,26-May-2018,2018.0,Unprovoked,USA,Florida,"Daytona Beach, Volusia County",Standing,male,M,12,Minor injury to foot,False,14h00,,"K. McMurray, Tracking Sharks.com",2018.05.26.a-DaytonaBeach.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.05.26.a,2018.05.26.a,6294.0,,


In [22]:
len(jaws.casenumber)-jaws.casenumber.isna().sum()

5634

In [23]:

def filter_date2(fecha):
    fecha=str(fecha)
    try:
        year, month, day = fecha.split('.')
        if month == '00' or day == '00' or len(year) != 4:
            return np.nan
        #pd.to_datetime(fecha)
        return fecha
    except:
        return np.nan

In [24]:
jaws.casenumber = jaws.casenumber.apply(filter_date2)
jaws.casenumber.isna().sum()

808

In [25]:
jaws.dropna(subset=['casenumber'], inplace=True)

In [26]:
jaws.shape

(4955, 24)

In [27]:
def filter_date3(x):
    x=str(x)
    year, month, day = x.split('.')
    year=int(year)
    if year<1960:
        return np.nan
    else:
        return x

In [28]:
jaws.shape

(4955, 24)

In [29]:
#jaws.casenumber = jaws.casenumber.apply(filter_date3)

#jaws.dropna(subset=['casenumber'], inplace=True)

#jaws.shape

*commit 4*

## 2.2.2 Overwrite redundant columns with *casenumber* data

Justification in readme.md

In [30]:
jaws.date = jaws.casenumber

In [31]:
jaws.year = jaws.casenumber

In [32]:
def year_define(x):
    x= str(x)
    year, month, day = x.split('.')
    
    return int(year)

In [33]:
jaws.date = pd.to_datetime(jaws['date'], errors='coerce')

jaws.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4955 entries, 0 to 6170
Data columns (total 24 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   casenumber            4955 non-null   object        
 1   date                  4953 non-null   datetime64[ns]
 2   year                  4955 non-null   object        
 3   type                  4952 non-null   object        
 4   country               4927 non-null   object        
 5   area                  4674 non-null   object        
 6   location              4640 non-null   object        
 7   activity              4676 non-null   object        
 8   name                  4845 non-null   object        
 9   sex                   4587 non-null   object        
 10  age                   3092 non-null   object        
 11  injury                4944 non-null   object        
 12  fatality              4911 non-null   object        
 13  time              

In [34]:
jaws.year = jaws.year.apply(year_define)


## 2.2.3 Overwrite with index

In [35]:
jaws.reset_index(inplace=True)
jaws

Unnamed: 0,index,casenumber,date,year,type,country,area,location,activity,name,sex,age,injury,fatality,time,species,investigatororsource,pdf,hrefformula,href,casenumber.1,casenumber.2,originalorder,unnamed:22,unnamed:23
0,0,2018.06.25,2018-06-25,2018,Boating,USA,California,"Oceanside, San Diego County",Paddling,Julie Wolfe,F,57,"No injury to occupant, outrigger canoe and pad...",False,18h00,White shark,"R. Collier, GSAF",2018.06.25-Wolfe.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.25,2018.06.25,6303.0,,
1,1,2018.06.18,2018-06-18,2018,Unprovoked,USA,Georgia,"St. Simon Island, Glynn County",Standing,Adyson McNeely,F,11,Minor injury to left thigh,False,14h00 -15h00,,"K.McMurray, TrackingSharks.com",2018.06.18-McNeely.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.18,2018.06.18,6302.0,,
2,2,2018.06.09,2018-06-09,2018,Invalid,USA,Hawaii,"Habush, Oahu",Surfing,John Denges,M,48,Injury to left lower leg from surfboard skeg,False,07h45,,"K.McMurray, TrackingSharks.com",2018.06.09-Denges.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.09,2018.06.09,6301.0,,
3,3,2018.06.08,2018-06-08,2018,Unprovoked,AUSTRALIA,New South Wales,Arrawarra Headland,Surfing,male,M,,Minor injury to lower leg,False,,2 m shark,"B. Myatt, GSAF",2018.06.08-Arrawarra.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.08,2018.06.08,6300.0,,
4,4,2018.06.04,2018-06-04,2018,Provoked,MEXICO,Colima,La Ticla,Free diving,Gustavo Ramos,M,,Lacerations to leg & hand shark PROVOKED INCIDENT,False,,"Tiger shark, 3m",A .Kipper,2018.06.04-Ramos.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.04,2018.06.04,6299.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4950,6151,1751.07.27,1751-07-27,1751,Unprovoked,USA,Massachusetts,,Swimming,male,M,,FATAL,True,,,"Pennsylvania Gazette, 8/15/1751",1751.07.27-Massachusetts.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,1751.07.27,1751.07.27,152.0,,
4951,6155,1742.12.17,1742-12-17,1742,Unprovoked,,,Carlisle Bay,Swimming,2 impressed seamen,M,,FATAL,True,,,"C. Moore, GSAF",1742.12.17-AdviceSeamen.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,1742.12.17,1742.12.17,148.0,,
4952,6156,1738.04.06,1738-04-06,1738,Unprovoked,ITALY,Sicily,Strait of Messina,Swimming,male,M,,FATAL,True,,,"C. Moore, GSAF",1738.04.06.R-Messina.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,1738.04.06.R,1738.04.06.R,147.0,,
4953,6160,1703.03.26,1703-03-26,1703,Unprovoked,BARBADOS,Southwest coast,Carlisle Bay,Swimming,"Samuel Jennings, a deserter from the British f...",M,19,"Hand and foot severely bitten, surgically ampu...",False,Night,,"W.R.Cutter, Vol.1, p.252",1703.03.26-Jennings.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,1703.03.26,1703.03.26,143.0,,


In [36]:
jaws=jaws.drop('index', axis=1)

In [37]:
jaws.reset_index(inplace=True)

In [38]:
jaws.casenumber = jaws.index
jaws['casenumber.1'] = jaws.index
jaws['casenumber.2'] = jaws.index
jaws.originalorder = jaws.index

jaws = jaws.drop('index', axis=1)

In [39]:
jaws

Unnamed: 0,casenumber,date,year,type,country,area,location,activity,name,sex,age,injury,fatality,time,species,investigatororsource,pdf,hrefformula,href,casenumber.1,casenumber.2,originalorder,unnamed:22,unnamed:23
0,0,2018-06-25,2018,Boating,USA,California,"Oceanside, San Diego County",Paddling,Julie Wolfe,F,57,"No injury to occupant, outrigger canoe and pad...",False,18h00,White shark,"R. Collier, GSAF",2018.06.25-Wolfe.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,0,0,0,,
1,1,2018-06-18,2018,Unprovoked,USA,Georgia,"St. Simon Island, Glynn County",Standing,Adyson McNeely,F,11,Minor injury to left thigh,False,14h00 -15h00,,"K.McMurray, TrackingSharks.com",2018.06.18-McNeely.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,1,1,1,,
2,2,2018-06-09,2018,Invalid,USA,Hawaii,"Habush, Oahu",Surfing,John Denges,M,48,Injury to left lower leg from surfboard skeg,False,07h45,,"K.McMurray, TrackingSharks.com",2018.06.09-Denges.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2,2,2,,
3,3,2018-06-08,2018,Unprovoked,AUSTRALIA,New South Wales,Arrawarra Headland,Surfing,male,M,,Minor injury to lower leg,False,,2 m shark,"B. Myatt, GSAF",2018.06.08-Arrawarra.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,3,3,3,,
4,4,2018-06-04,2018,Provoked,MEXICO,Colima,La Ticla,Free diving,Gustavo Ramos,M,,Lacerations to leg & hand shark PROVOKED INCIDENT,False,,"Tiger shark, 3m",A .Kipper,2018.06.04-Ramos.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,4,4,4,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4950,4950,1751-07-27,1751,Unprovoked,USA,Massachusetts,,Swimming,male,M,,FATAL,True,,,"Pennsylvania Gazette, 8/15/1751",1751.07.27-Massachusetts.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,4950,4950,4950,,
4951,4951,1742-12-17,1742,Unprovoked,,,Carlisle Bay,Swimming,2 impressed seamen,M,,FATAL,True,,,"C. Moore, GSAF",1742.12.17-AdviceSeamen.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,4951,4951,4951,,
4952,4952,1738-04-06,1738,Unprovoked,ITALY,Sicily,Strait of Messina,Swimming,male,M,,FATAL,True,,,"C. Moore, GSAF",1738.04.06.R-Messina.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,4952,4952,4952,,
4953,4953,1703-03-26,1703,Unprovoked,BARBADOS,Southwest coast,Carlisle Bay,Swimming,"Samuel Jennings, a deserter from the British f...",M,19,"Hand and foot severely bitten, surgically ampu...",False,Night,,"W.R.Cutter, Vol.1, p.252",1703.03.26-Jennings.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,4953,4953,4953,,


*commit 5 and 6*

In [40]:
jaws.time = jaws.time.dropna()
jaws.date = jaws.date.dropna()
jaws.year = jaws.year.dropna()

# 2.3 Age and Sex

## 2.3.1 Filtering Age and Sex

    Filter Age

In [41]:
jaws.age.tail(50)

4905    NaN
4906    NaN
4907     12
4908    NaN
4909    NaN
4910    NaN
4911    NaN
4912    NaN
4913    NaN
4914    NaN
4915     52
4916    NaN
4917    NaN
4918    NaN
4919    NaN
4920    NaN
4921     17
4922    NaN
4923    NaN
4924    NaN
4925    NaN
4926    NaN
4927     22
4928    NaN
4929    NaN
4930    NaN
4931    NaN
4932    NaN
4933    NaN
4934    NaN
4935    NaN
4936    NaN
4937    NaN
4938    NaN
4939    NaN
4940    NaN
4941    NaN
4942    NaN
4943    NaN
4944    NaN
4945    NaN
4946    NaN
4947    NaN
4948    NaN
4949    NaN
4950    NaN
4951    NaN
4952    NaN
4953     19
4954    NaN
Name: age, dtype: object

In [42]:
jaws.age.isna().sum()

1863

In [43]:
def age_filt(x):
    x=str(x)
    age_pattern = r"\b(\d{2})\b"  # Extracts two-digit numbers
    match = re.search(age_pattern, x)
    
    if match:
        age = int(match.group(1))
        return int(age)
    else:
        return np.nan

In [44]:
jaws.age = jaws.age.apply(age_filt)

In [45]:
jaws.head(50)

Unnamed: 0,casenumber,date,year,type,country,area,location,activity,name,sex,age,injury,fatality,time,species,investigatororsource,pdf,hrefformula,href,casenumber.1,casenumber.2,originalorder,unnamed:22,unnamed:23
0,0,2018-06-25,2018,Boating,USA,California,"Oceanside, San Diego County",Paddling,Julie Wolfe,F,57.0,"No injury to occupant, outrigger canoe and pad...",False,18h00,White shark,"R. Collier, GSAF",2018.06.25-Wolfe.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,0,0,0,,
1,1,2018-06-18,2018,Unprovoked,USA,Georgia,"St. Simon Island, Glynn County",Standing,Adyson McNeely,F,11.0,Minor injury to left thigh,False,14h00 -15h00,,"K.McMurray, TrackingSharks.com",2018.06.18-McNeely.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,1,1,1,,
2,2,2018-06-09,2018,Invalid,USA,Hawaii,"Habush, Oahu",Surfing,John Denges,M,48.0,Injury to left lower leg from surfboard skeg,False,07h45,,"K.McMurray, TrackingSharks.com",2018.06.09-Denges.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2,2,2,,
3,3,2018-06-08,2018,Unprovoked,AUSTRALIA,New South Wales,Arrawarra Headland,Surfing,male,M,,Minor injury to lower leg,False,,2 m shark,"B. Myatt, GSAF",2018.06.08-Arrawarra.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,3,3,3,,
4,4,2018-06-04,2018,Provoked,MEXICO,Colima,La Ticla,Free diving,Gustavo Ramos,M,,Lacerations to leg & hand shark PROVOKED INCIDENT,False,,"Tiger shark, 3m",A .Kipper,2018.06.04-Ramos.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,4,4,4,,
5,5,2018-06-03,2018,Unprovoked,AUSTRALIA,New South Wales,"Flat Rock, Ballina",Kite surfing,Chris,M,,"No injury, board bitten",False,,,"Daily Telegraph, 6/4/2018",2018.06.03.b-FlatRock.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,5,5,5,,
6,6,2018-06-03,2018,Unprovoked,BRAZIL,Pernambuco,"Piedade Beach, Recife",Swimming,Jose Ernesto da Silva,M,18.0,FATAL,True,Late afternoon,Tiger shark,"Diario de Pernambuco, 6/4/2018",2018.06.03.a-daSilva.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,6,6,6,,
7,7,2018-05-27,2018,Unprovoked,USA,Florida,"Lighhouse Point Park, Ponce Inlet, Volusia County",Fishing,male,M,52.0,Minor injury to foot. PROVOKED INCIDENT,False,,"Lemon shark, 3'","K. McMurray, TrackingSharks.com",2018.05.27-Ponce.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,7,7,7,,
8,8,2018-05-26,2018,Unprovoked,USA,Florida,"Cocoa Beach, Brevard County",Walking,Cody High,M,15.0,Lower left leg bitten,False,17h00,"Bull shark, 6'","K.McMurray, TrackingSharks.com",2018.05.26.b-High.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,8,8,8,,
9,9,2018-05-26,2018,Unprovoked,USA,Florida,"Daytona Beach, Volusia County",Standing,male,M,12.0,Minor injury to foot,False,14h00,,"K. McMurray, Tracking Sharks.com",2018.05.26.a-DaytonaBeach.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,9,9,9,,


In [46]:
jaws.age.isna().sum()

2002

    Filter Sex

In [47]:
jaws.sex.isna().sum()

368

In [48]:
jaws.sex.value_counts()

M      4058
F       523
M         2
N         2
lli       1
.         1
Name: sex, dtype: int64

In [49]:
def filt_sex(x):
    x=str(x)
    pattern1 = r"[mM]"
    pattern2 = r"[fF]"

    if x == 'UNKNOWN':
        
        return np.nan
    
    elif re.findall(pattern1,x):
        
        return True
    
    elif re.findall(pattern2,x):
        
        return False

In [50]:
jaws.sex = jaws.sex.apply(filt_sex)

In [51]:
jaws.sex.isna().sum()

372

In [52]:
jaws.shape

(4955, 24)

## 2.3.2 Delete the rows with both age and sex null (FOR LATER REFERENCE)

jaws.age.isna().sum()

jaws.sex.isna().sum()

bad_index = jaws.casenumber[(jaws.age.isna()) & (jaws.sex.isna())]

bad_index = bad_index.tolist()

len(bad_index)

jaws=jaws.drop(bad_index)

jaws.shape

jaws.sex.isna().sum()

jaws.href[0]

## 2.4 Type

In [53]:
jaws[jaws['type'].isnull()]

Unnamed: 0,casenumber,date,year,type,country,area,location,activity,name,sex,age,injury,fatality,time,species,investigatororsource,pdf,hrefformula,href,casenumber.1,casenumber.2,originalorder,unnamed:22,unnamed:23
80,80,2017-09-15,2017,,SAMOA,Upolu Island,Nofoalii,Fishing,male,True,,Injuries to hands and legs,False,Night,,"Samoa Observer, 9/16/2017",2017.09.15.a-Samoa.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,80,80,80,,
4015,4015,1936-09-11,1936,,VIETNAM,,Saigon,Wreck of a sampam,8 crew,True,,FATAL,True,,,"Lansing State Journal, 9/11/1936",1936.09.11-Saigon.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,4015,4015,4015,,
4662,4662,1890-03-03,1890,,CEYLON,,,Diving,a pearl diver,True,,FATAL,True,,,"The Guardian, 3/3/1890",1890.03.03.R-Ceylon.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,4662,4662,4662,,


In [54]:
jaws.type.unique()

array(['Boating', 'Unprovoked', 'Invalid', 'Provoked', 'Questionable',
       'Sea Disaster', nan, 'Boat', 'Boatomg'], dtype=object)

In [55]:
jaws.at[85,'type'] = 'Boat'
jaws.at[382,'type'] = 'Questionable'
jaws.at[4867,'type']= 'Boat'
jaws.at[5705,'type'] = 'Questionable'
jaws.type = jaws.type.str.replace('Boating', 'Boat')
jaws.type = jaws.type.str.replace('Boatomg', 'Boat')
print(jaws.type.unique())

['Boat' 'Unprovoked' 'Invalid' 'Provoked' 'Questionable' 'Sea Disaster'
 nan]


In [56]:
jaws.type = jaws.type.dropna()

## 2.5 Species

In [57]:
def shark_filter(x):
    x = str(x)
    pattern = r'\d'
    
    match = re.search(pattern, x)
    if match:
        return 'unknown'
    
    else:
        return x

In [59]:
jaws.species = jaws.species.apply(shark_filter)
len(jaws.species.unique())

252

In [60]:
jaws.species.unique()

array(['White shark', 'nan', 'unknown', 'Tiger shark', 'Grey reef shark',
       'Invalid incident', 'Shark involvement not confirmed',
       'Questionable', 'Juvenile bull shark', 'Bull shark',
       'Wobbegong shark', 'Blacktip shark', 'Possibly a wobbegong',
       'Injury believed caused by an eel, not a shark',
       'Galapagos shark?', 'small shark', 'Wobbegong shark?',
       'Juvenile nurse shark', 'Tiger shark, female',
       'Some drowned but other may have been killed by blue sharks',
       'Cookiecutter shark', 'Nurse shark',
       'Possibly a juvenile blacktip shark', '"A small shark"',
       'Seven-gill shark', 'Lemon shark', 'Lemon shark pup',
       'Caribbean reef shark', 'a small shark', 'Blue shark',
       'Angel shark', 'dogfish shark', 'Tawny nurse shark', 'Mako shark',
       'Bronze whaler', 'Sandtiger shark', 'Galapagos shark',
       'Hammerhead sp.', 'juvenile shark', 'Hammerhead shark',
       'Hammerhead shark.', 'small nurse shark', 'Blacktip Reef s

In [61]:
def shark_filter2(x):
    x=str(x)
    x=x.lower()
    key_words = ['may','named','by','thought','nan','unidentified','\xa0','edwards','school','&','small','in','bite','large','pregnant','blue','recovered', 'unknown','believed', 'possibly', '?', 'kayak', 'questionable', ' or ', ' and ', 'thin', 'according','reported','remains', 'possiby', 'involvement','thought', 'believed','prior', 'possibly', '?', 'kayak', 'questionable', ' or ', ' and ', 'thin', 'according']
    counter=0
    for e in key_words:
        counter+=1

        if e in x and counter!=len(key_words):

            return 'unknown'
        elif counter == len(key_words):
            counter=0
            return str(x)

In [62]:
jaws.species=jaws.species.apply(shark_filter2)

In [63]:
len(jaws.species.unique())

75

In [64]:
jaws.species = jaws.species.fillna('unknown')

In [65]:
jaws.head(50)

Unnamed: 0,casenumber,date,year,type,country,area,location,activity,name,sex,age,injury,fatality,time,species,investigatororsource,pdf,hrefformula,href,casenumber.1,casenumber.2,originalorder,unnamed:22,unnamed:23
0,0.0,2018-06-25,2018.0,Boat,USA,California,"Oceanside, San Diego County",Paddling,Julie Wolfe,False,57.0,"No injury to occupant, outrigger canoe and pad...",False,18h00,white shark,"R. Collier, GSAF",2018.06.25-Wolfe.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,0.0,0.0,0.0,,
1,1.0,2018-06-18,2018.0,Unprovoked,USA,Georgia,"St. Simon Island, Glynn County",Standing,Adyson McNeely,False,11.0,Minor injury to left thigh,False,14h00 -15h00,unknown,"K.McMurray, TrackingSharks.com",2018.06.18-McNeely.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,1.0,1.0,1.0,,
2,2.0,2018-06-09,2018.0,Invalid,USA,Hawaii,"Habush, Oahu",Surfing,John Denges,True,48.0,Injury to left lower leg from surfboard skeg,False,07h45,unknown,"K.McMurray, TrackingSharks.com",2018.06.09-Denges.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2.0,2.0,2.0,,
3,3.0,2018-06-08,2018.0,Unprovoked,AUSTRALIA,New South Wales,Arrawarra Headland,Surfing,male,True,,Minor injury to lower leg,False,,unknown,"B. Myatt, GSAF",2018.06.08-Arrawarra.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,3.0,3.0,3.0,,
4,4.0,2018-06-04,2018.0,Provoked,MEXICO,Colima,La Ticla,Free diving,Gustavo Ramos,True,,Lacerations to leg & hand shark PROVOKED INCIDENT,False,,unknown,A .Kipper,2018.06.04-Ramos.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,4.0,4.0,4.0,,
5,5.0,2018-06-03,2018.0,Unprovoked,AUSTRALIA,New South Wales,"Flat Rock, Ballina",Kite surfing,Chris,True,,"No injury, board bitten",False,,unknown,"Daily Telegraph, 6/4/2018",2018.06.03.b-FlatRock.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,5.0,5.0,5.0,,
6,6.0,2018-06-03,2018.0,Unprovoked,BRAZIL,Pernambuco,"Piedade Beach, Recife",Swimming,Jose Ernesto da Silva,True,18.0,FATAL,True,Late afternoon,tiger shark,"Diario de Pernambuco, 6/4/2018",2018.06.03.a-daSilva.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,6.0,6.0,6.0,,
7,7.0,2018-05-27,2018.0,Unprovoked,USA,Florida,"Lighhouse Point Park, Ponce Inlet, Volusia County",Fishing,male,True,52.0,Minor injury to foot. PROVOKED INCIDENT,False,,unknown,"K. McMurray, TrackingSharks.com",2018.05.27-Ponce.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,7.0,7.0,7.0,,
8,8.0,2018-05-26,2018.0,Unprovoked,USA,Florida,"Cocoa Beach, Brevard County",Walking,Cody High,True,15.0,Lower left leg bitten,False,17h00,unknown,"K.McMurray, TrackingSharks.com",2018.05.26.b-High.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,8.0,8.0,8.0,,
9,9.0,2018-05-26,2018.0,Unprovoked,USA,Florida,"Daytona Beach, Volusia County",Standing,male,True,12.0,Minor injury to foot,False,14h00,unknown,"K. McMurray, Tracking Sharks.com",2018.05.26.a-DaytonaBeach.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,9.0,9.0,9.0,,


*commit 7*

## 2.6 Location, Country, Area

In [66]:
jaws.location.isna().sum()

316

In [67]:
bad_index = jaws.casenumber[(jaws.location.isna()) & (jaws.country.isna())]

In [68]:
jaws.casenumber.isna().sum()

1

In [69]:
def clean_country(x):
    x = str(x)
    pattern= re.findall(r'[a-zA-Z\(\)\-\?]+', x)
    if x == 'Unknown':
        return None
    elif pattern:
        return x.lower()
jaws.country = jaws.country.apply(clean_country)
jaws.country = jaws.country.str.replace(r'[(\)\-\?]+', '')
len(jaws.country.unique())

  jaws.country = jaws.country.str.replace(r'[(\)\-\?]+', '')


166

In [70]:
jaws.head()

Unnamed: 0,casenumber,date,year,type,country,area,location,activity,name,sex,age,injury,fatality,time,species,investigatororsource,pdf,hrefformula,href,casenumber.1,casenumber.2,originalorder,unnamed:22,unnamed:23
0,0.0,2018-06-25,2018.0,Boat,usa,California,"Oceanside, San Diego County",Paddling,Julie Wolfe,False,57.0,"No injury to occupant, outrigger canoe and pad...",False,18h00,white shark,"R. Collier, GSAF",2018.06.25-Wolfe.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,0.0,0.0,0.0,,
1,1.0,2018-06-18,2018.0,Unprovoked,usa,Georgia,"St. Simon Island, Glynn County",Standing,Adyson McNeely,False,11.0,Minor injury to left thigh,False,14h00 -15h00,unknown,"K.McMurray, TrackingSharks.com",2018.06.18-McNeely.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,1.0,1.0,1.0,,
2,2.0,2018-06-09,2018.0,Invalid,usa,Hawaii,"Habush, Oahu",Surfing,John Denges,True,48.0,Injury to left lower leg from surfboard skeg,False,07h45,unknown,"K.McMurray, TrackingSharks.com",2018.06.09-Denges.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2.0,2.0,2.0,,
3,3.0,2018-06-08,2018.0,Unprovoked,australia,New South Wales,Arrawarra Headland,Surfing,male,True,,Minor injury to lower leg,False,,unknown,"B. Myatt, GSAF",2018.06.08-Arrawarra.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,3.0,3.0,3.0,,
4,4.0,2018-06-04,2018.0,Provoked,mexico,Colima,La Ticla,Free diving,Gustavo Ramos,True,,Lacerations to leg & hand shark PROVOKED INCIDENT,False,,unknown,A .Kipper,2018.06.04-Ramos.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,4.0,4.0,4.0,,


In [71]:
jaws.country = jaws.country.fillna('unknown')

In [72]:
jaws.location = jaws.location.fillna('unknown')

In [73]:
jaws.area = jaws.area.fillna('unknown')

In [74]:
jaws

Unnamed: 0,casenumber,date,year,type,country,area,location,activity,name,sex,age,injury,fatality,time,species,investigatororsource,pdf,hrefformula,href,casenumber.1,casenumber.2,originalorder,unnamed:22,unnamed:23
0,0.0,2018-06-25,2018.0,Boat,usa,California,"Oceanside, San Diego County",Paddling,Julie Wolfe,False,57.0,"No injury to occupant, outrigger canoe and pad...",False,18h00,white shark,"R. Collier, GSAF",2018.06.25-Wolfe.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,0.0,0.0,0.0,,
1,1.0,2018-06-18,2018.0,Unprovoked,usa,Georgia,"St. Simon Island, Glynn County",Standing,Adyson McNeely,False,11.0,Minor injury to left thigh,False,14h00 -15h00,unknown,"K.McMurray, TrackingSharks.com",2018.06.18-McNeely.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,1.0,1.0,1.0,,
2,2.0,2018-06-09,2018.0,Invalid,usa,Hawaii,"Habush, Oahu",Surfing,John Denges,True,48.0,Injury to left lower leg from surfboard skeg,False,07h45,unknown,"K.McMurray, TrackingSharks.com",2018.06.09-Denges.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2.0,2.0,2.0,,
3,3.0,2018-06-08,2018.0,Unprovoked,australia,New South Wales,Arrawarra Headland,Surfing,male,True,,Minor injury to lower leg,False,,unknown,"B. Myatt, GSAF",2018.06.08-Arrawarra.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,3.0,3.0,3.0,,
4,4.0,2018-06-04,2018.0,Provoked,mexico,Colima,La Ticla,Free diving,Gustavo Ramos,True,,Lacerations to leg & hand shark PROVOKED INCIDENT,False,,unknown,A .Kipper,2018.06.04-Ramos.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,4.0,4.0,4.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4951,4951.0,1742-12-17,1742.0,Unprovoked,,unknown,Carlisle Bay,Swimming,2 impressed seamen,True,,FATAL,True,,unknown,"C. Moore, GSAF",1742.12.17-AdviceSeamen.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,4951.0,4951.0,4951.0,,
4952,4952.0,1738-04-06,1738.0,Unprovoked,italy,Sicily,Strait of Messina,Swimming,male,True,,FATAL,True,,unknown,"C. Moore, GSAF",1738.04.06.R-Messina.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,4952.0,4952.0,4952.0,,
4953,4953.0,1703-03-26,1703.0,Unprovoked,barbados,Southwest coast,Carlisle Bay,Swimming,"Samuel Jennings, a deserter from the British f...",True,19.0,"Hand and foot severely bitten, surgically ampu...",False,Night,unknown,"W.R.Cutter, Vol.1, p.252",1703.03.26-Jennings.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,4953.0,4953.0,4953.0,,
4954,4954.0,NaT,1580.0,Unprovoked,between portugal & india,unknown,unknown,Man fell overboard from ship. Those on board t...,male,True,,"FATAL. ""Shark tore him to pieces.",True,,unknown,"G.P. Whitley, p. 10",1580.01.10.R-Portugal-India.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,4954.0,4954.0,4954.0,,


In [75]:
len(jaws.location.unique())

3347

In [76]:
jaws.location = jaws.location.str.replace(r'[(\)\-\?]+', '')
jaws.area = jaws.area.str.replace(r'[(\)\-\?]+', '')

  jaws.location = jaws.location.str.replace(r'[(\)\-\?]+', '')
  jaws.area = jaws.area.str.replace(r'[(\)\-\?]+', '')


In [77]:
len(jaws.location.unique())

3345

In [78]:
len(jaws.area.unique())

624

In [79]:
jaws.location.isna().sum()

0

In [80]:
jaws.area.isna().sum()

0

In [81]:
jaws.shape

(4956, 24)

In [82]:
jaws.isna().sum()

casenumber                 1
date                       3
year                       1
type                       3
country                    0
area                       0
location                   0
activity                 280
name                     111
sex                      373
age                     2003
injury                    12
fatality                  45
time                    2229
species                    0
investigatororsource      10
pdf                        1
hrefformula                2
href                       1
casenumber.1               1
casenumber.2               1
originalorder              1
unnamed:22              4955
unnamed:23              4954
dtype: int64

*commit 8*

## 2.7 Activity, Injury, time

In [83]:
jaws.activity = jaws.activity.apply(clean_country)
jaws.injury = jaws.injury.apply(clean_country)



In [84]:
jaws.activity = jaws.activity.fillna('unknown')
jaws.injury = jaws.injury.fillna('unknown')

In [87]:
jaws.time = jaws.time.replace(r"[a-zA-Z0-9-]+", np.nan, regex=True)
jaws.time.fillna('unknown',inplace= True)