In [1]:
# Libraries:

import pandas as pd
import re 

In [2]:
df_attacks = pd.read_csv ('./data/attacks.csv', encoding='unicode_escape')
df_attacks.sample(5)

# print(df_attacks.isna().sum())
# df_attacks.shape


Unnamed: 0,Case Number,Date,Year,Type,Country,Area,Location,Activity,Name,Sex,...,Species,Investigator or Source,pdf,href formula,href,Case Number.1,Case Number.2,original order,Unnamed: 22,Unnamed: 23
15715,,,,,,,,,,,...,,,,,,,,,,
5941,1864.08.12,12-Aug-1864,1864.0,Unprovoked,USA,New York,Mahattan,Swimming,Henry Brice,M,...,,"NY Times, 8/13/1864",1864.08.12-Brice.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,1864.08.12,1864.08.12,362.0,,
17408,,,,,,,,,,,...,,,,,,,,,,
2765,1987.10.11,11-Oct-1987,1987.0,Unprovoked,SOUTH AFRICA,Western Cape Province,"Seal Island, False Bay",Spearfishing,Dawid Smit,M,...,White shark,"G. Smit, P. Landsberg, M.D., M. Levine, GSAF",1987.10.11-DawidSmit.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,1987.10.11,1987.10.11,3538.0,,
16308,,,,,,,,,,,...,,,,,,,,,,


In [3]:
df_attacks.dropna(axis=0, inplace=False, how="all")

#print(df_attacks.isna().sum())

df_attacks["Investigator or Source"].sample(15)

# Checking the columns I realized everything below Species doesn't seem to be relevant.
# I'll create a copy with only the data I intend to analyse

df_attacks_2 = df_attacks.drop(['Investigator or Source', 'pdf', 'href formula', 'href', 'Case Number.1', 'Case Number.2', 'original order', 'Unnamed: 22', 'Unnamed: 23'], axis=1)


In [4]:
# I realized many columns had all NaN values but the number "0" in the Case Number column
# I used the dropna combined with a threshold of 2 to delete these columns since they had no information. 
# The way the threshold works is that at least 2 values in the row have to be not null in order for the row not to be deleted.

df_attacks_2.dropna(axis=0, inplace=True, thresh=2)

df_attacks_2.head(3)


Unnamed: 0,Case Number,Date,Year,Type,Country,Area,Location,Activity,Name,Sex,Age,Injury,Fatal (Y/N),Time,Species
0,2018.06.25,25-Jun-2018,2018.0,Boating,USA,California,"Oceanside, San Diego County",Paddling,Julie Wolfe,F,57,"No injury to occupant, outrigger canoe and pad...",N,18h00,White shark
1,2018.06.18,18-Jun-2018,2018.0,Unprovoked,USA,Georgia,"St. Simon Island, Glynn County",Standing,Adyson McNeely,F,11,Minor injury to left thigh,N,14h00 -15h00,
2,2018.06.09,09-Jun-2018,2018.0,Invalid,USA,Hawaii,"Habush, Oahu",Surfing,John Denges,M,48,Injury to left lower leg from surfboard skeg,N,07h45,


In [5]:
# Now to extract the Year and the Month the attacks happened so I can look for a pattern depending on the country/region

df_attacks_2["Date"].unique()

array(['25-Jun-2018', '18-Jun-2018', '09-Jun-2018', ..., '1900-1905',
       '1883-1889', '1845-1853'], dtype=object)

In [104]:
# Cleaning the Date so I have Months and Years in different columns
# The original Year column had over 800 NaNs even when the year was explicitly shown in the Date column
# To solve this I used regex to gather the months from the Date column and went to 19 NaNs instead

df_attacks_2[["Month"]] = df_attacks_2["Date"].str.lower().str.extract(r'-(\w{3})-')
df_attacks_2[["Year"]] = df_attacks_2["Date"].str.extract(r'(\d{4})')


df_attacks_2['Month'].isna().sum()
df_attacks_2['Year'].isna().sum()

#df_attacks_2.drop(['Case Number'], axis=1, inplace=True)

print(df_attacks_2['Year'])
print(df_attacks_2['Date'])

0       2018
1       2018
2       2018
3       2018
4       2018
        ... 
6297    1903
6298    1903
6299    1900
6300    1883
6301    1845
Name: Year, Length: 6302, dtype: object
0       25-Jun-2018
1       18-Jun-2018
2       09-Jun-2018
3       08-Jun-2018
4       04-Jun-2018
           ...     
6297    Before 1903
6298    Before 1903
6299      1900-1905
6300      1883-1889
6301      1845-1853
Name: Date, Length: 6302, dtype: object


In [44]:
print(df_attacks_2.isna().sum())

df_attacks_2.shape

# Here I can have some insight on which datapoints might be worth exploring in order to formulate a hypothesis
# Datapoints with a large percentage of NaNs will be difficult to draw conclusions from


Date              0
Year             19
Type              4
Country          50
Area            455
Location        540
Activity        544
Name            210
Sex             565
Age            2831
Injury           28
Fatal (Y/N)     539
Time           3354
Species        2838
Month           910
dtype: int64


(6302, 15)

In [45]:
# Checking the size of the dataframe and the unique entries for the categories Activity, Type and Area

print("\n\nNumber of rows and columns", df_attacks_2.shape)

print("\n\nUnique descriptions for Type", df_attacks_2['Type'].value_counts())
print("\n\n\nUnique descriptions for Activity", df_attacks_2['Activity'].value_counts().sum())
print("\n\n\nUnique descriptions for Area", df_attacks_2['Area'].value_counts())
print("\n\n\nUnique descriptions for Country", df_attacks_2['Country'].value_counts())
print("\n\n\nUnique descriptions for Time", df_attacks_2['Time'].value_counts())



Number of rows and columns (6302, 15)


Unique descriptions for Type Unprovoked      4595
Provoked         574
Invalid          547
Sea Disaster     239
Boating          203
Boat             137
Questionable       2
Boatomg            1
Name: Type, dtype: int64



Unique descriptions for Activity 5758



Unique descriptions for Area Florida                                 1037
New South Wales                          486
Queensland                               311
Hawaii                                   298
California                               290
                                        ... 
Ysabel Island                              1
 Lau Province                              1
South Coast, East New Britain              1
Between Southampton & Canary Islands       1
Moala Island                               1
Name: Area, Length: 825, dtype: int64



Unique descriptions for Country USA                       2229
AUSTRALIA                 1338
SOUTH AFRICA               579
PA

In [121]:
# Analysing the time patterns to see if they fit the 9-5 category or the freelancing one

def time_habits (time):  
    
    try: 
        if 8 <= time <= 18: 
            return "T"
        elif 19 < time < 7:
            return 'F'
    
    except: 
        
        if time == 'Afternoon' or time == 'Morning':
            return "T"
    
        elif time == 'Night':
            return "F"
        
        else:
            return time
    
# df_attacks_2[['Time2']] = df_attacks_2[['Time']]
# df_attacks_2['Time2'].head()

# df_attacks_2[["Time2"]] = df_attacks_2["Time"].str.lower().str.extract(r'(\d{2})h\d{2}') # Standardizing the time to only have the "hour" digits
# df_attacks_2[["Time3"]] = df_attacks_2["Time"].str.lower().str.extract(r'([a-zA-Z\s]{5,})')
# print(df_attacks_2['Time2'])
# print(df_attacks_2['Time3'])



df_attacks_2 = df_attacks_2.astype({"Time2": 'str'}, {"Time3": 'str'}, errors='raise') 
df_attacks_2['Time2'].dtypes

# df_attacks_2["Time_habits"] = df_attacks_2[["Time2", "Time3"]].apply("".join, axis=1)
# print(df_attacks_2['Time_habits'].value_counts())

# df_attacks_2["Traditional or Freelance?"] = df_attacks_2['Time2'].apply(lambda x: (time_habits(x)))

# print("\n\n\nUnique descriptions for Time", df_attacks_2['Traditional or Freelance?'].value_counts())

dtype('O')