In [1]:
# Libraries:

import pandas as pd
import re 
import numpy as np

In [2]:
df_attacks = pd.read_csv ('./data/attacks.csv', encoding='unicode_escape')
df_attacks.sample(5)

# print(df_attacks.isna().sum())
# df_attacks.shape


Unnamed: 0,Case Number,Date,Year,Type,Country,Area,Location,Activity,Name,Sex,...,Species,Investigator or Source,pdf,href formula,href,Case Number.1,Case Number.2,original order,Unnamed: 22,Unnamed: 23
18351,,,,,,,,,,,...,,,,,,,,,,
5035,1930.09.26.R,Reported 26-Sep-1930,1930.0,Unprovoked,HONDURAS,Black River,,Swimming,Indian guide,M,...,,"NY Times, 9/26/1930",1930.09.26.R-IndianGuide.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,1930.09.26.R,1930.09.26.R,1268.0,,
14410,,,,,,,,,,,...,,,,,,,,,,
11029,,,,,,,,,,,...,,,,,,,,,,
10361,,,,,,,,,,,...,,,,,,,,,,


In [3]:
df_attacks.dropna(axis=0, inplace=False, how="all")

#print(df_attacks.isna().sum())

df_attacks["Investigator or Source"].sample(15)

# Checking the columns I realized everything below Species doesn't seem to be relevant.
# I'll create a copy with only the data I intend to analyse

df_attacks_2 = df_attacks.drop(['Investigator or Source', 'pdf', 'href formula', 'href', 'Case Number.1', 'Case Number.2', 'original order', 'Unnamed: 22', 'Unnamed: 23'], axis=1)


In [4]:
# I realized many columns had all NaN values but the number "0" in the Case Number column
# I used the dropna combined with a threshold of 2 to delete these columns since they had no information. 
# The way the threshold works is that at least 2 values in the row have to be not null in order for the row not to be deleted.

df_attacks_2.dropna(axis=0, inplace=True, thresh=2)

df_attacks_2.head(3)


Unnamed: 0,Case Number,Date,Year,Type,Country,Area,Location,Activity,Name,Sex,Age,Injury,Fatal (Y/N),Time,Species
0,2018.06.25,25-Jun-2018,2018.0,Boating,USA,California,"Oceanside, San Diego County",Paddling,Julie Wolfe,F,57,"No injury to occupant, outrigger canoe and pad...",N,18h00,White shark
1,2018.06.18,18-Jun-2018,2018.0,Unprovoked,USA,Georgia,"St. Simon Island, Glynn County",Standing,Adyson McNeely,F,11,Minor injury to left thigh,N,14h00 -15h00,
2,2018.06.09,09-Jun-2018,2018.0,Invalid,USA,Hawaii,"Habush, Oahu",Surfing,John Denges,M,48,Injury to left lower leg from surfboard skeg,N,07h45,


In [5]:
# Now to extract the Year and the Month the attacks happened so I can look for a pattern depending on the country/region

df_attacks_2["Date"].unique()

array(['25-Jun-2018', '18-Jun-2018', '09-Jun-2018', ..., '1900-1905',
       '1883-1889', '1845-1853'], dtype=object)

In [6]:
# Cleaning the Date so I have Months and Years in different columns
# The original Year column had over 800 NaNs even when the year was explicitly shown in the Date column
# To solve this I used regex to gather the months from the Date column and went to 19 NaNs instead

df_attacks_2[["Month"]] = df_attacks_2["Date"].str.lower().str.extract(r'-(\w{3})-')
df_attacks_2[["Year"]] = df_attacks_2["Date"].str.extract(r'(\d{4})')


df_attacks_2['Month'].isna().sum()
df_attacks_2['Year'].isna().sum()

df_attacks_2.drop(['Case Number'], axis=1, inplace=True)

# print(df_attacks_2['Year'])
# print(df_attacks_2['Date'])

In [7]:
print(df_attacks_2.isna().sum())

df_attacks_2.shape

# Here I can have some insight on which datapoints might be worth exploring in order to formulate a hypothesis
# Datapoints with a large percentage of NaNs will be difficult to draw conclusions from


Date              0
Year             19
Type              4
Country          50
Area            455
Location        540
Activity        544
Name            210
Sex             565
Age            2831
Injury           28
Fatal (Y/N)     539
Time           3354
Species        2838
Month           910
dtype: int64


(6302, 15)

In [8]:
# Checking the size of the dataframe and the unique entries for the categories Activity, Type and Area

# print("\n\nNumber of rows and columns", df_attacks_2.shape)

# print("\n\nUnique descriptions for Type", df_attacks_2['Type'].value_counts())
# print("\n\n\nUnique descriptions for Activity", df_attacks_2['Activity'].value_counts().sum())
# print("\n\n\nUnique descriptions for Area", df_attacks_2['Area'].value_counts())
# print("\n\n\nUnique descriptions for Country", df_attacks_2['Country'].value_counts())
# print("\n\n\nUnique descriptions for Time", df_attacks_2['Time'].value_counts())

In [9]:



# Analysing the time patterns to see if they fit the "traditional" category or the freelancing one

def time_habits (time):  
    
    try: 
        if 8 <= int(time) <= 18: 
            return "Trad."
        elif 19 <= int(time) <= 7:
            return "Freela."
    
    except Exception: 
        
        if ('noon' in str(time).lower()) or ('morning' in str(time).lower()) or ('evening' in str(time).lower()) or ('dusk' in str(time).lower()): 
            return "Trad."
    
        elif ('midday' in str(time).lower()) or ('sunset' in str(time).lower()) or ('a.m' in str(time).lower()): #broken for readability
            return "Trad."
        
        elif ('night' in str(time).lower()) or ('p.m' in str(time).lower()):
            return "Freela."
        
        else:
            return "Unknown"
    
    

df_attacks_2['Time'] = df_attacks_2['Time'].str.replace(r"(h\d\w*)", r"", regex=True) # Standardizing the time to only have the "hour" digits


df_attacks_2["Traditional or Freelance?"] = df_attacks_2['Time'].apply(lambda x: (time_habits(x)))




In [10]:
print("\n\n\nUnique descriptions for Traditional or Freelance\n\n", df_attacks_2['Traditional or Freelance?'].value_counts())

# print("\n\n", df_attacks_2['Time'].unique(), "\n\n")




Unique descriptions for Traditional or Freelance

 0          3438
Trad.      2526
Freela.      84
Name: Traditional or Freelance?, dtype: int64


In [25]:
Prefered_Countries = []


Prefered_Countries = df_attacks_2['Country'].value_counts()[:5].index.tolist()

df_attacks_2['Attacks_country'] = df_attacks_2.groupby('Country')['Country'].transform('count')

Total_prefered_countries = int(df_attacks_2['Attacks_country'].value_counts().head(5).sum())    

print("Top 5 countries with most attacks:", Prefered_Countries, "\n\nTotal number of attacks in these countries:", Total_prefered_countries)
print("\nTotal number of attacks:", df_attacks_2['Country'].value_counts().sum())


Top 5 countries with most attacks: ['USA', 'AUSTRALIA', 'SOUTH AFRICA', 'PAPUA NEW GUINEA', 'NEW ZEALAND'] 

Total number of attacks in these countries: 4408

Total number of attacks: 6252
