In [2]:
import numpy as np
import pandas as pd
import re
import warnings




In [3]:
#Importing the CSV into a dataframe
df_sharks = pd.read_csv("../data/Shark_attacks.csv",encoding="unicode_escape")


In [4]:
#first we look at the shape and print a sample
df_sharks.shape
df_sharks.sample(frac=0.4)


Unnamed: 0,Case Number,Date,Year,Type,Country,Area,Location,Activity,Name,Sex,...,Species,Investigator or Source,pdf,href formula,href,Case Number.1,Case Number.2,original order,Unnamed: 22,Unnamed: 23
19845,,,,,,,,,,,...,,,,,,,,,,
18147,,,,,,,,,,,...,,,,,,,,,,
16938,,,,,,,,,,,...,,,,,,,,,,
7729,0,,,,,,,,,,...,,,,,,,,,,
22490,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5978,1861.02.12.R,Reported 12-Feb-1861,1861.0,Unprovoked,EQUATORIAL GUINEA / CAMEROON,Fernando Po Island,,Swimming,William Looney,M,...,,"Daily Southern Cross, 2/12/1861",1861.02.12.R-Looney.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,1861.02.12.R,1861.02.12.R,325.0,,
8100,0,,,,,,,,,,...,,,,,,,,,,
18277,,,,,,,,,,,...,,,,,,,,,,
7760,0,,,,,,,,,,...,,,,,,,,,,


In [5]:

# check the empty columns and drop them, the 2 columns mentioned below seemed empty
df_sharks["Unnamed: 22"].isna().value_counts()
df_sharks["Unnamed: 23"].isna().value_counts()
df_sharks = df_sharks.drop(columns=["Unnamed: 22","Unnamed: 23"])


In [6]:
#check columns with irrelevent information and or not clear information

df_sharks = df_sharks.drop(columns=["href formula","pdf","href","original order"])

In [7]:
#check for duplicates
df_sharks[df_sharks.duplicated()]
df_sharks.duplicated().sum()


19418

In [8]:
#drop duplicates
df_sharks = df_sharks.drop_duplicates()

In [9]:
df_sharks.shape

(6305, 18)

In [10]:
#check and drop all the rows where EVERY value in that row is missing
df_sharks = df_sharks.dropna(axis = 0, how = 'all')
df_sharks.shape
# the dataset went from 25723 row to 6304 after deleting the empty ones

(6304, 18)

In [11]:
#it's important to check column´s name in order to avoid keyerror
print(df_sharks.columns)

Index(['Case Number', 'Date', 'Year', 'Type', 'Country', 'Area', 'Location',
       'Activity', 'Name', 'Sex ', 'Age', 'Injury', 'Fatal (Y/N)', 'Time',
       'Species ', 'Investigator or Source', 'Case Number.1', 'Case Number.2'],
      dtype='object')


In [12]:
#check if the 3 columns named "Case Number" are equal ==> they are not but it's because of mismatching format date
df_sharks["Case Number.1"].equals(df_sharks["Case Number.2"])
df_sharks["Case Number"].equals(df_sharks["Case Number.1"])
df_sharks["Case Number"].equals(df_sharks["Case Number.2"])

False

In [13]:
#it's not an information that I see useful, so I'll delete the columns, and also because I already have 2 other columns with dates
df_sharks = df_sharks.drop(columns=["Case Number.1","Case Number.2"])


In [14]:

df_sharks.head()

Unnamed: 0,Case Number,Date,Year,Type,Country,Area,Location,Activity,Name,Sex,Age,Injury,Fatal (Y/N),Time,Species,Investigator or Source
0,2018.06.25,25-Jun-2018,2018.0,Boating,USA,California,"Oceanside, San Diego County",Paddling,Julie Wolfe,F,57.0,"No injury to occupant, outrigger canoe and pad...",N,18h00,White shark,"R. Collier, GSAF"
1,2018.06.18,18-Jun-2018,2018.0,Unprovoked,USA,Georgia,"St. Simon Island, Glynn County",Standing,Adyson McNeely,F,11.0,Minor injury to left thigh,N,14h00 -15h00,,"K.McMurray, TrackingSharks.com"
2,2018.06.09,09-Jun-2018,2018.0,Invalid,USA,Hawaii,"Habush, Oahu",Surfing,John Denges,M,48.0,Injury to left lower leg from surfboard skeg,N,07h45,,"K.McMurray, TrackingSharks.com"
3,2018.06.08,08-Jun-2018,2018.0,Unprovoked,AUSTRALIA,New South Wales,Arrawarra Headland,Surfing,male,M,,Minor injury to lower leg,N,,2 m shark,"B. Myatt, GSAF"
4,2018.06.04,04-Jun-2018,2018.0,Provoked,MEXICO,Colima,La Ticla,Free diving,Gustavo Ramos,M,,Lacerations to leg & hand shark PROVOKED INCIDENT,N,,"Tiger shark, 3m",A .Kipper


In [15]:
#after deleting the the columns that are not relevant at this point, let's check the columns names again and clean them
print(df_sharks.columns)

Index(['Case Number', 'Date', 'Year', 'Type', 'Country', 'Area', 'Location',
       'Activity', 'Name', 'Sex ', 'Age', 'Injury', 'Fatal (Y/N)', 'Time',
       'Species ', 'Investigator or Source'],
      dtype='object')


In [16]:
#printing the columns' names helps detect the hidden caracters that can't be detected from looking directly at the table.
#It seems that the colums "Sex" and "Species" have spaces at the end. let's clean them

In [17]:
df_sharks.rename(columns={"Sex ":"Sex"}, inplace=True)
df_sharks.rename(columns={"Species ":"Species"}, inplace=True)

In [18]:

print(df_sharks.columns)

Index(['Case Number', 'Date', 'Year', 'Type', 'Country', 'Area', 'Location',
       'Activity', 'Name', 'Sex', 'Age', 'Injury', 'Fatal (Y/N)', 'Time',
       'Species', 'Investigator or Source'],
      dtype='object')


In [19]:

df_sharks.shape

(6304, 16)

In [20]:
#making sure that all rows missing the columns in the code are dropped
df_sharks.dropna(axis=0, inplace=True, how="all", subset=["Age", "Type","Activity","Country","Fatal (Y/N)","Species"])

In [21]:
#now let's look at the content of each column and see if there are missing or mismatching info
#the 3 columns "Case Number,Date and Year" seem a little bit similar, so to make it more efficient and less redundant, 
#we'll split the Date column into 2 (month,year)
#and then drop the column Case Number, the Year column will be automatically replaced
df_sharks[["Month"]] = df_sharks["Date"].str.lower().str.extract(r'-(\w{3})-')
df_sharks[["Year"]] = df_sharks["Date"].str.extract(r'(\d{4})')

In [22]:
#now i'll drop the "Case Number" column, since i have a year and a month columns already
df_sharks = df_sharks.drop(columns=["Case Number"])



In [23]:
df_sharks.shape
df_sharks.head()
# now the dataset went from 8703 row and 24 columns to  (6304, 16) after deleting the empty ones

Unnamed: 0,Date,Year,Type,Country,Area,Location,Activity,Name,Sex,Age,Injury,Fatal (Y/N),Time,Species,Investigator or Source,Month
0,25-Jun-2018,2018,Boating,USA,California,"Oceanside, San Diego County",Paddling,Julie Wolfe,F,57.0,"No injury to occupant, outrigger canoe and pad...",N,18h00,White shark,"R. Collier, GSAF",jun
1,18-Jun-2018,2018,Unprovoked,USA,Georgia,"St. Simon Island, Glynn County",Standing,Adyson McNeely,F,11.0,Minor injury to left thigh,N,14h00 -15h00,,"K.McMurray, TrackingSharks.com",jun
2,09-Jun-2018,2018,Invalid,USA,Hawaii,"Habush, Oahu",Surfing,John Denges,M,48.0,Injury to left lower leg from surfboard skeg,N,07h45,,"K.McMurray, TrackingSharks.com",jun
3,08-Jun-2018,2018,Unprovoked,AUSTRALIA,New South Wales,Arrawarra Headland,Surfing,male,M,,Minor injury to lower leg,N,,2 m shark,"B. Myatt, GSAF",jun
4,04-Jun-2018,2018,Provoked,MEXICO,Colima,La Ticla,Free diving,Gustavo Ramos,M,,Lacerations to leg & hand shark PROVOKED INCIDENT,N,,"Tiger shark, 3m",A .Kipper,jun


In [24]:
#now we start cleaning the content, and check for empty fields
pd.isna(df_sharks).sum()

Date                         0
Year                        19
Type                         4
Country                     50
Area                       455
Location                   540
Activity                   544
Name                       210
Sex                        565
Age                       2831
Injury                      28
Fatal (Y/N)                539
Time                      3354
Species                   2838
Investigator or Source      17
Month                      910
dtype: int64

In [25]:
#At this point, the "Type" column doesnt need much cleaning
df_sharks['Type'].isna().value_counts()

False    6298
True        4
Name: Type, dtype: int64

In [26]:
#check for unique values
df_sharks["Type"].unique()


array(['Boating', 'Unprovoked', 'Invalid', 'Provoked', 'Questionable',
       'Sea Disaster', nan, 'Boat', 'Boatomg'], dtype=object)

In [27]:
#Replacing cells that contain the word "boat" by "boating" to group them, also transforming Invalid into unknown

df_sharks["Type"] = df_sharks["Type"].str.replace("Boating","Boat")
df_sharks["Type"] = df_sharks["Type"].str.replace("Boatomg","Boat")
df_sharks["Type"] = df_sharks["Type"].str.replace("Invalid","Unknown")

df_sharks["Type"].unique()
df_sharks["Type"].value_counts()



Unprovoked      4595
Provoked         574
Unknown          547
Boat             341
Sea Disaster     239
Questionable       2
Name: Type, dtype: int64

In [28]:
#Creating a new column to summerize the type coloumn into a provoked accident or not, assuming that "boating" 
#and "sea disaster" are nor provoked, this column will contain that values (Yes/No)
provoked_= ["Unprovoked","Boat","Sea Disaster"]
df_sharks["Provoked"] = df_sharks["Type"].str.contains('|'.join(provoked_)).apply(lambda x: "N" if x == True else "Y")

In [29]:
#Replacing the empty fields in the Country column with "Unknown"
#Tranforming all the strings in this column into uppercase for unicity
df_sharks["Country"].isna().sum()
df_sharks["Country"]= df_sharks["Country"].str.upper()
df_sharks["Country"].fillna("Unknown", inplace=True)

#not much to change in the country column in my opinion, because more than 50% of incidents are happening in 7 countries 


In [30]:
#check country value count
check = df_sharks["Country"].value_counts()

print(check.to_string())

USA                                      2229
AUSTRALIA                                1338
SOUTH AFRICA                              579
PAPUA NEW GUINEA                          134
NEW ZEALAND                               128
BRAZIL                                    112
BAHAMAS                                   109
MEXICO                                     89
ITALY                                      71
FIJI                                       65
PHILIPPINES                                61
REUNION                                    60
NEW CALEDONIA                              53
Unknown                                    50
CUBA                                       46
MOZAMBIQUE                                 45
SPAIN                                      44
INDIA                                      40
EGYPT                                      38
CROATIA                                    34
JAPAN                                      34
PANAMA                            

In [50]:
df_sharks.shape

(6302, 17)

In [51]:
df_sharks["Country"].isna().sum()

0

In [52]:
#checking the column "Area"
#Replacing the empty fields in the "Area" column with "Unknown"

df_sharks["Area"].isna().sum()
df_sharks["Area"].fillna("Unknown", inplace=True)


In [53]:
df_sharks["Area"].isna().sum()

0

In [54]:
#checking the column "Location"
#Replacing the empty fields in the "Location" column with "Unknown"
df_sharks["Location"].isna().sum()
df_sharks["Location"].fillna("Unknown", inplace=True)

In [55]:


#not much change in the columns "Area,Location,Country", except from filling all the blanks with "unknown"

In [56]:
#checking again the number of columns and dropping some, since I created new ones
df_sharks.head()


Unnamed: 0,Date,Year,Type,Country,Area,Location,Activity,Name,Sex,Age,Injury,Fatal (Y/N),Time,Species,Investigator or Source,Month,Provoked
0,25-Jun-2018,2018,Boat,USA,California,"Oceanside, San Diego County",Paddling,Julie Wolfe,F,57,"No injury to occupant, outrigger canoe and pad...",N,18h00,White shark,"R. Collier, GSAF",jun,N
1,18-Jun-2018,2018,Unprovoked,USA,Georgia,"St. Simon Island, Glynn County",Standing,Adyson McNeely,F,11,Minor injury to left thigh,N,14h00 -15h00,,"K.McMurray, TrackingSharks.com",jun,N
2,09-Jun-2018,2018,Unknown,USA,Hawaii,"Habush, Oahu",Surfing,John Denges,M,48,Injury to left lower leg from surfboard skeg,N,07h45,,"K.McMurray, TrackingSharks.com",jun,Y
3,08-Jun-2018,2018,Unprovoked,AUSTRALIA,New South Wales,Arrawarra Headland,Surfing,male,M,0,Minor injury to lower leg,N,,2 m shark,"B. Myatt, GSAF",jun,N
4,04-Jun-2018,2018,Provoked,MEXICO,Colima,La Ticla,Free diving,Gustavo Ramos,M,0,Lacerations to leg & hand shark PROVOKED INCIDENT,N,,"Tiger shark, 3m",A .Kipper,jun,Y


In [57]:
#checking the column "Activity"
df_sharks["Activity"].value_counts()
df_sharks["Activity"].isna().sum()

#we have some cell that contains the words surf==> to be grouped with Surfing, same for cells that contain 
#swim, bathing...==> Swimming

0

In [58]:
for row in df_sharks["Activity"]:
    #Replacing all swimming related strings with "Swimming"
    if re.search(".*wimm.*", str(row)) or re.search(".*swim.*", str(row)):
        df_sharks["Activity"] = df_sharks["Activity"].str.replace(row, "Swimming")
    if row=="Bathing" or row=="Floating":
        df_sharks["Activity"] = df_sharks["Activity"].str.replace(row, "Swimming")
    if re.search(".*(T|t)reading.*", str(row)):
        df_sharks["Activity"] = df_sharks["Activity"].str.replace(row, "Swimming")
    #==> Replace surfing related strings with "Surfing"
    if re.search(".*surf.*", str(row)): 
        df_sharks["Activity"] = df_sharks["Activity"].str.replace(row, "Surfing")

  df_sharks["Activity"] = df_sharks["Activity"].str.replace(row, "Surfing")
  df_sharks["Activity"] = df_sharks["Activity"].str.replace(row, "Swimming")
  df_sharks["Activity"] = df_sharks["Activity"].str.replace(row, "Swimming")


In [59]:
df_sharks["Activity"].value_counts()
df_sharks["Activity"].isna().sum()


0

In [60]:
#replacing the nans values with "Unknown"
df_sharks["Activity"].fillna("Unknown", inplace=True)

In [61]:
#checking the column "Name"
#Replacing the empty fields in the "Name" column with "Unknown"
df_sharks["Name"].isna().sum()
df_sharks["Name"].fillna("Unknown", inplace=True)

In [62]:
#unify the column "Sex"
df_sharks["Sex"].unique()

array(['F', 'M', 'Unknown'], dtype=object)

In [63]:
#transforming values like "N" or "M " ==> "M"
#the rest of the column are replaced by the value "Unknown"

df_sharks["Sex"] = df_sharks["Sex"].str.replace("N", "M")
df_sharks["Sex"] = df_sharks["Sex"].str.replace("M ", "M")
df_sharks["Sex"] = df_sharks["Sex"].str.replace("lli", "Unknown")
df_sharks["Sex"] = df_sharks["Sex"].str.replace(".", "Unknown")
df_sharks["Sex"] = df_sharks["Sex"].str.replace("nan", "Unknown")
df_sharks['Sex'] = df_sharks['Sex'].fillna('Unknown')
df_sharks["Sex"].unique()

  df_sharks["Sex"] = df_sharks["Sex"].str.replace(".", "Unknown")


array(['F', 'M', 'Unknown'], dtype=object)

In [64]:

df_sharks.sample(frac=0.5)



Unnamed: 0,Date,Year,Type,Country,Area,Location,Activity,Name,Sex,Age,Injury,Fatal (Y/N),Time,Species,Investigator or Source,Month,Provoked
1873,25-Mar-2002,2002,Unprovoked,USA,Hawaii,"Brenecke Beach, Po'ipu, Kaua'i",Body-boarding,Hoku Aki,M,17,Left leg severed below knee,N,12h00,Tiger shark,"G. Kubota, Honolulu Star Bulletin",mar,N
5449,11-Sep-1907,1907,Unprovoked,CROATIA,"Split-Dalmatia Count,","Sucurja, Hvar Island,",Swimming,female,F,0,FATAL,Y,,,"C. Moore, GSAF",sep,N
1523,25-Nov-2005,2005,Unprovoked,AUSTRALIA,Victoria,Flinders,Surfing,Tom Burke,M,18,"2 lacerations on leg, each 4"" to 5"" long",N,18h00,1.8 m shark,"Sydney Morning Herald, 11/25/2005",nov,N
5626,1896,1896,Unprovoked,HAITI,Off Cape Haitien,Unknown,Wading,Syrian,M,15,Leg severed below knee,N,,,C. R. Baker & C.M. Rose; V.M. Coppleson (1958...,,N
410,19-Jun-2015,2015,Unprovoked,PUERTO RICO,Unknown,Off Cabo Rojo,Spearfishing,Benjamin Rios,M,36,Injury to hand,N,Morning,,"Yahoo News, 6/19/2015",jun,N
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
208,10-Oct-2016,2016,Unprovoked,USA,Oregon,"Indian Beach, Ecola State Park, Clatsop County",Surfing,Joseph Tanner,M,29,Wounds to upper thigh and lower leg,N,16h00,,"UP Beacon, 10/12/2016",oct,N
2243,17-Jul-1997,1997,Unprovoked,BRAZIL,Pernambuco,"Boa Viagem, Recife",Surfing,José Roberto Paraizo de Albuquerque,M,0,Survived,N,,,JCOnline,jul,N
5135,Reported 08-Jan-1927,1927,Unprovoked,USA,California,"Catalina Channel, Los Angeles County",Swimming,Price Taylor,M,0,Lost hand,N,,,"The Afro-American, 1/8/1927",jan,N
4290,25-Jul-1955,1955,Unprovoked,JAPAN,Okayama Prefecture,Usimado-no-Seto,Unknown,Hideo Ishida,M,22,FATAL,Y,13h00,Blue shark,M. Hosina,jul,N


In [65]:
#cheking the column "Age"
df_sharks["Age"].isna().sum()
df_sharks["Age"].unique()

array([        57,         11,         48,          0,         18,
               52,         15,         12,         32,         10,
               21,         34,         30,         60,         33,
               29,         54,         41,         37,         56,
               19,         25,         69,         38,         55,
               35,         46,         45,         14,         40,
               28,         20,         24,         26,         49,
               22,          7,         31,         17,         13,
               42,          3,          8,         50,         16,
               82,         73,         68,         51,         39,
               58,         47,         61,         65,         36,
               66,         43,          9,         72,         59,
                6,         27,         64,         23,         71,
               44,         62,         63,         70,         53,
               77,         74,       2826,          5,        

In [66]:
#the culumn age is not an integer, we would need to transform it
print(df_sharks.dtypes)

Date                      object
Year                      object
Type                      object
Country                   object
Area                      object
Location                  object
Activity                  object
Name                      object
Sex                       object
Age                        int64
Injury                    object
Fatal (Y/N)               object
Time                      object
Species                   object
Investigator or Source    object
Month                     object
Provoked                  object
dtype: object


In [71]:
#replacing the values using regex to detect digits and drop the rest (aplhabetical or specia characters)
df_sharks["Age"] = df_sharks["Age"].str.replace(r'[^\d.]+', '')
#making sure to remove . as well
df_sharks["Age"] = df_sharks["Age"].str.replace('.', '')
#replacing "nan" value with a zero
df_sharks["Age"] = df_sharks["Age"].replace(np.nan, 0)
#replacing empty cells with a zero (empty strings)
df_sharks["Age"] = df_sharks["Age"].replace('', "0")


AttributeError: Can only use .str accessor with string values!

In [69]:
#like mentioned before, the column age is not an integer, transform into int64 (int64 to avoid overflow errors)
df_sharks["Age"] = df_sharks["Age"].astype("int64")

In [72]:
#now we have cells that contain more than 2 digits, which doesn't make sense for Age values
#any value above 99 is going to be replaced by a zero
df_sharks["Age"] = df_sharks["Age"].apply(lambda x: x if x < 99 else 0)

In [73]:
#now we can calculate the mean age for all values above 0
meanAge = df_sharks[df_sharks["Age"] > 0].mean()
meanAge = int(meanAge)


  meanAge = df_sharks[df_sharks["Age"] > 0].mean()


array([57, 11, 48,  0, 18, 52, 15, 12, 32, 10, 21, 34, 30, 60, 33, 29, 54,
       41, 37, 56, 19, 25, 69, 38, 55, 35, 46, 45, 14, 40, 28, 20, 24, 26,
       49, 22,  7, 31, 17, 13, 42,  3,  8, 50, 16, 82, 73, 68, 51, 39, 58,
       47, 61, 65, 36, 66, 43,  9, 72, 59,  6, 27, 64, 23, 71, 44, 62, 63,
       70, 53, 77, 74,  5, 86, 84, 75, 87, 67,  1, 81, 78,  2],
      dtype=int64)

In [74]:
#replacing all the columns with zero as value with the Mean Age
df_sharks["Age"] = df_sharks['Age'].apply(lambda x: x if x > 0 else meanAge)


In [75]:
#the mean value did not change
df_sharks["Age"].mean()
df_sharks["Age"].unique()


array([57, 11, 48, 27, 18, 52, 15, 12, 32, 10, 21, 34, 30, 60, 33, 29, 54,
       41, 37, 56, 19, 25, 69, 38, 55, 35, 46, 45, 14, 40, 28, 20, 24, 26,
       49, 22,  7, 31, 17, 13, 42,  3,  8, 50, 16, 82, 73, 68, 51, 39, 58,
       47, 61, 65, 36, 66, 43,  9, 72, 59,  6, 64, 23, 71, 44, 62, 63, 70,
       53, 77, 74,  5, 86, 84, 75, 87, 67,  1, 81, 78,  2], dtype=int64)

In [None]:
#now to the "Fatal (Y/N)" unify the elements in the cells and filling the nans with unknown

In [77]:
df_sharks["Fatal (Y/N)"].unique()

array(['N', 'Y', nan, 'M', 'UNKNOWN', '2017', ' N', 'N ', 'y'],
      dtype=object)

In [78]:
#replacing all the values in the Fatal column in order to get cells only with values==> Y / N / Unknown

df_sharks["Fatal (Y/N)"] = df_sharks["Fatal (Y/N)"].str.replace(" N", "N")
df_sharks["Fatal (Y/N)"] = df_sharks["Fatal (Y/N)"].str.replace("N ", "N")
df_sharks["Fatal (Y/N)"] = df_sharks["Fatal (Y/N)"].str.replace("y", "Y")
df_sharks["Fatal (Y/N)"] = df_sharks["Fatal (Y/N)"].str.replace("2017", "UNKNOWN")
df_sharks["Fatal (Y/N)"] = df_sharks["Fatal (Y/N)"].str.replace("Unknown", "UNKNOWN")
df_sharks["Fatal (Y/N)"] = df_sharks["Fatal (Y/N)"].str.replace("M", "UNKNOWN")

df_sharks["Fatal (Y/N)"] = df_sharks["Fatal (Y/N)"].fillna("UNKNOWN")





In [79]:
df_sharks["Fatal (Y/N)"].value_counts()


N          4301
Y          1389
UNKNOWN     612
Name: Fatal (Y/N), dtype: int64

In [80]:
df_sharks.sample()

Unnamed: 0,Date,Year,Type,Country,Area,Location,Activity,Name,Sex,Age,Injury,Fatal (Y/N),Time,Species,Investigator or Source,Month,Provoked
4093,30-May-1959,1959,Provoked,SOUTH AFRICA,Eastern Cape Province,"Bird Island, Algoa Bay",Spearfishing,Tony Dicks,M,23,"No injury, diver shot shark & it bit his spear...",N,,"White shark, 2.7 m [9'], 280-lb","C. Middleton; M. Levine, GSAF",may,Y


In [81]:
print(df_sharks.columns)

Index(['Date', 'Year', 'Type', 'Country', 'Area', 'Location', 'Activity',
       'Name', 'Sex', 'Age', 'Injury', 'Fatal (Y/N)', 'Time', 'Species',
       'Investigator or Source', 'Month', 'Provoked'],
      dtype='object')


In [82]:
#it seems like some accident weren't related to sharks, so i created a array of values and droped them from the table since
#our main focus is the Shark attacks

In [83]:
no_shark = ["Death may have been due to drowning","Doubtful / Unconfirmed attack / Unable to verify in local records","No shark invovlement","No shark involvement","No shark invovlement - it ws a publicity stunt",
"Not a shark attack; it was a hoax,Not authenticated","Shark involvement  not confirmed","Shark involvement unconfirmed","Shark involvement suspected but not confirmed","Shark involvement questionable","Shark involvement prior to deaths was not confirmed",
"Shark involvement prior to death was not confirmed","Shark involvement prior to deaths unconfirmed","Shark involvement prior to death unconfired","Shark involvement prior to death unconfirmed","Shark involvement prior to death still to be determined","Shark involvement prior to death suspected but not confirmed"
"Shark involvement prior to death remains unconfirmed","Shark involvement prior to death not confirmed","Shark involvement prior to death remains unconfirmed","Shark involvement prior to death not confirmed",
"Shark involvement not confirmed","thought to be a barracuda bite","Shark involvement not confirmed","injury may be due to a stingray","Shark involvement not confirmed","officials considered barracua","Shark involvement not confirmed",
"Shark involvement not confirmed & highly unlikely","Shark involvement highly doubtful,Shark involvement not cofirmed","No shark involvement","Shark involvement not confirmed"]

In [84]:
#dropping the rows not related to shark attacks
df_sharks = df_sharks[df_sharks['Species'].str.contains('|'.join(no_shark)) == False]

In [85]:
list(df_sharks["Species"])

['White shark',
 '2 m shark',
 'Tiger shark, 3m',
 'Tiger shark',
 "Lemon shark, 3'",
 "Bull shark, 6'",
 'Grey reef shark',
 'Invalid incident',
 'Tawny nurse shark, 2m',
 'Tiger shark',
 'Questionable',
 '3 m shark',
 'White shark, 3.5 m',
 'Tiger shark',
 'White shark, 2.5 m',
 "6' shark",
 'Juvenile bull shark',
 'Tiger shark',
 '2 m shark',
 'Bull shark',
 'White shark',
 "Tiger shark, 12'",
 'Wobbegong shark',
 '3.5 m shark',
 '1.8 m shark',
 'Blacktip shark',
 'Juvenile white shark,  2.7 to 3.2 m',
 'Bull shark, 2 m',
 'Possibly a wobbegong',
 'Injury believed caused by an eel, not a shark',
 'Galapagos shark?',
 '2m shark',
 'Bull shark',
 'Bull shark, 3 m ',
 'Grey reef shark. 2 m',
 'White shark, 3.5 m',
 'small shark',
 'Wobbegong shark?',
 'Tiger shark',
 'Juvenile nurse shark',
 "Nurse shark. 5'",
 'Tiger shark, female',
 'Tiger shark, female',
 'Some drowned but other may have been killed by blue sharks',
 'White shark, 4.6 m',
 '2 m shark',
 'Tiger shark',
 'Cookiecutter

In [86]:
df_sharks.shape

(3145, 17)

In [87]:
#there were some injuries not related to shark attacks, they were dropped with the function above.

df_sharks["Injury"].value_counts()
check_i = df_sharks["Injury"].value_counts()
print(check_i.to_string())


FATAL                                                                                                                                                                                                     196
No injury                                                                                                                                                                                                  65
Foot bitten                                                                                                                                                                                                41
Leg bitten                                                                                                                                                                                                 39
Survived                                                                                                                                                                        

In [88]:
df_sharks.head()

Unnamed: 0,Date,Year,Type,Country,Area,Location,Activity,Name,Sex,Age,Injury,Fatal (Y/N),Time,Species,Investigator or Source,Month,Provoked
0,25-Jun-2018,2018,Boat,USA,California,"Oceanside, San Diego County",Paddling,Julie Wolfe,F,57,"No injury to occupant, outrigger canoe and pad...",N,18h00,White shark,"R. Collier, GSAF",jun,N
3,08-Jun-2018,2018,Unprovoked,AUSTRALIA,New South Wales,Arrawarra Headland,Surfing,male,M,27,Minor injury to lower leg,N,,2 m shark,"B. Myatt, GSAF",jun,N
4,04-Jun-2018,2018,Provoked,MEXICO,Colima,La Ticla,Free diving,Gustavo Ramos,M,27,Lacerations to leg & hand shark PROVOKED INCIDENT,N,,"Tiger shark, 3m",A .Kipper,jun,Y
6,03-Jun-2018,2018,Unprovoked,BRAZIL,Pernambuco,"Piedade Beach, Recife",Swimming,Jose Ernesto da Silva,M,18,FATAL,Y,Late afternoon,Tiger shark,"Diario de Pernambuco, 6/4/2018",jun,N
7,27-May-2018,2018,Unprovoked,USA,Florida,"Lighhouse Point Park, Ponce Inlet, Volusia County",Fishing,male,M,52,Minor injury to foot. PROVOKED INCIDENT,N,,"Lemon shark, 3'","K. McMurray, TrackingSharks.com",may,N


In [89]:
df_sharks["Investigator or Source"].isna().sum()

6

In [90]:
#filling NaN investigation shource with Unknown
df_sharks["Investigator or Source"].fillna("Unknown", inplace=True)
df_sharks["Investigator or Source"].isna().sum()

0

In [91]:
df_sharks.to_csv("./data/final_data_shark.csv")