In [1]:
import pandas as pd
df = pd.read_excel('https://raw.githubusercontent.com/Joseba-Bermejo/shark_attacks_quest2/main/data_for_analysis.xlsx')

## Business Case

A travel insurance provider seeks to refine its risk models by identifying patterns in fatal shark attacks based on activity type, region, and victim profile.

### Hypothesis: 
“In the last 25 years shark attacks are more likely to result in fatalities when they are unprovoked" [Date, Fatal Y/N, Type]

"Australia is the area with more fatal shark attacks in victims doing underwater activities" [Country, Activity, Fatal Y/N]

"Among male shark attack victims, those aged 25 or younger are more likely to suffer fatal outcomes." [age, gender, Fatal Y/N]

In [4]:
df

Unnamed: 0,year,type,country,activity,sex,age,fatal
0,2025.0,Unprovoked,Australia & Oceania,Swimming,M,30.0,N
1,2025.0,Unprovoked,Australia & Oceania,Surfing,M,37.0,Y
2,2025.0,Unprovoked,Australia & Oceania,Swimming,F,56.0,N
3,2025.0,Unprovoked,Australia & Oceania,Swimming,M,40.0,N
4,2025.0,Questionable,Other,Diving,M,29.0,Y
...,...,...,...,...,...,...,...
6990,0.0,Unprovoked,Australia & Oceania,Diving,M,,Y
6991,0.0,Unprovoked,Australia & Oceania,Diving,M,,Y
6992,0.0,Unprovoked,North America,Swimming,M,,Y
6993,0.0,Unprovoked,Other,Unknown,M,,Y


In [36]:
# 3rd Hypothesis check

# Filtering for males who have a know age and fatalities that are not marked as unknown

df3_clean = df[(df["fatal"] == "Y") | (df["fatal"] == "N")]
df3_male = df3_clean[(df3_clean["sex"] == "M") & (df3_clean["age"].notnull())]
df3_male

Unnamed: 0,year,type,country,activity,sex,age,fatal
0,2025.0,Unprovoked,Australia & Oceania,Swimming,M,30.0,N
1,2025.0,Unprovoked,Australia & Oceania,Surfing,M,37.0,Y
3,2025.0,Unprovoked,Australia & Oceania,Swimming,M,40.0,N
4,2025.0,Questionable,Other,Diving,M,29.0,Y
6,2025.0,Questionable,Australia & Oceania,Fishing,M,58.0,Y
...,...,...,...,...,...,...,...
6936,0.0,Unprovoked,Middle East,Swimming,M,16.0,N
6947,0.0,Provoked,Caribbean,Fishing,M,50.0,Y
6958,0.0,Unprovoked,Middle East,Swimming,M,13.0,Y
6969,0.0,Unprovoked,Caribbean,Standing,M,16.0,Y


In [38]:
# Create two groups for males who are 25 years or younger and more than 25 years old

df3_male["age_group"] = df3_male["age"].apply(lambda x: "≤25" if x <= 25 else ">25")
df3_male["age_group"]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df3_male["age_group"] = df3_male["age"].apply(lambda x: "≤25" if x <= 25 else ">25")


0       >25
1       >25
3       >25
4       >25
6       >25
       ... 
6936    ≤25
6947    >25
6958    ≤25
6969    ≤25
6994    ≤25
Name: age_group, Length: 3158, dtype: object

In [28]:
# to check fatalities per age group

df3_male.groupby("age_group")["fatal"].value_counts()

age_group  fatal
>25        N        1199
           Y         274
≤25        N        1352
           Y         333
Name: count, dtype: int64

In [34]:
# Make a pivot table out of it

df3_pivot = df3_male.pivot_table(index="age_group", columns="fatal", aggfunc="size", fill_value=0)
df3_pivot

fatal,N,Y
age_group,Unnamed: 1_level_1,Unnamed: 2_level_1
>25,1199,274
≤25,1352,333


In [51]:
df3_pivot.describe()

fatal,N,Y
count,2.0,2.0
mean,1275.5,303.5
std,108.187338,41.7193
min,1199.0,274.0
25%,1237.25,288.75
50%,1275.5,303.5
75%,1313.75,318.25
max,1352.0,333.0


### Conclusion

The data does not support the hypothesis.

Younger males (≤25) experience more shark attacks overall compared to older males (>25), both fatal and non-fatal. However, when it comes to fatality rate (the proportion of fatal outcomes compared to the total attacks) is very similar in both groups (around 23-24%).

In [58]:
# 2nd hypothesis check: Australia is the area with more fatal shark attacks in victims who are surfing.

# Filter for fatal attacks
df_fatal = df[df["fatal"] == "Y"]

In [60]:
# Filter for surfing within "activity"
df_surfing_fatal = df_fatal[df_fatal["activity"] == "Surfing"]

In [62]:
# Count fatal surfing attacks by country
fatal_surf_by_country = df_surfing_fatal["country"].value_counts()
print(fatal_surf_by_country)

country
Australia & Oceania    35
North America          21
Europe                 12
Africa                 12
South America           4
Other                   3
Asia                    2
Name: count, dtype: int64


In [74]:
# Redifinition of hypothesis: 
# Surfing-related shark attacks in Australia are more likely to result in fatal outcomes compared to other regions.

In [72]:
# 1. Filter surfing-related rows
df_surfing = df[df["activity"] == "Surfing"]

# 2. Filter only fatal outcomes (Y/N), exclude unknowns
df_surfing_clean = df_surfing[df_surfing["fatal"].isin(["Y", "N"])]

# 3. Create total surfing attacks per country
total_surf_by_country = df_surfing_clean["country"].value_counts()

# 4. Create fatal surfing attacks per country
fatal_surf_by_country = df_surfing_clean[df_surfing_clean["fatal"] == "Y"]["country"].value_counts()

# 5. Combine into a DataFrame
surf_stats = pd.DataFrame({
    "Total": total_surf_by_country,
    "Fatal": fatal_surf_by_country
})

# 6. Fill missing fatal values with 0, and calculate ratio
surf_stats["Fatal"] = surf_stats["Fatal"].fillna(0)
surf_stats["Fatality Rate (%)"] = (surf_stats["Fatal"] / surf_stats["Total"]) * 100

# 7. Sort by Fatality Rate
surf_stats = surf_stats.sort_values(by="Fatality Rate (%)", ascending=False)

# Display
surf_stats

Unnamed: 0_level_0,Total,Fatal,Fatality Rate (%)
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Other,9,3.0,33.333333
Europe,42,12.0,28.571429
Asia,12,2.0,16.666667
Australia & Oceania,321,35.0,10.903427
South America,47,4.0,8.510638
Africa,142,12.0,8.450704
North America,786,21.0,2.671756
Caribbean,4,0.0,0.0
Middle East,1,0.0,0.0
Unknown,1,0.0,0.0


### Conclusion

The data does not supports the hypothesis.

Although Australia & Oceania has the highest number of surfing-related shark attacks, it ranks fourth in fatality rate, behind Europe, Asia, and "Other". This suggests that fatal outcomes are proportionally more common in those regions despite fewer total incidents.