# SHARK QUEST
## New Kids On The Block



In [None]:
%load_ext autoreload
%autoreload 2 

In [None]:
# Import dependencies
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from cleaning import *

In [None]:
# Settings: raise pandas' display limits so large frames can be inspected inline
display_settings = {
    'display.max_rows': 8000,
    'display.max_columns': 30,
    'display.max_colwidth': 100,
}
for option_name, option_value in display_settings.items():
    pd.set_option(option_name, option_value)

In [None]:
# Load the raw inputs first, then derive working copies so the frames read
# from disk stay untouched for later reference.
shark_attacks_df = pd.read_excel('GSAF5.xls')
countries_df = pd.read_csv('country_coord.csv')

shark_attacks, countries = shark_attacks_df.copy(), countries_df.copy()

# Preview the first rows of the working copy
shark_attacks.head()

In [None]:
# Format all column names: trim, spaces -> underscores, remove dots, lower-case
def _tidy_column_name(label):
    """Return *label* stripped, with spaces as underscores, dots removed, lower-cased."""
    trimmed = label.strip()
    return trimmed.replace(" ", "_").replace(".", "").lower()


shark_attacks.columns = [_tidy_column_name(label) for label in shark_attacks.columns]

In [None]:
# Contract
# The schema drives the rest of the cleaning: its keys select the columns to
# keep, each entry's 'dtype' is used for casting, and categorical entries
# supply a 'categories' list.
# NOTE(review): dictionnary_from_json comes from the star-import of `cleaning`;
# presumably it loads a JSON resource named 'schema' into a dict — confirm there.
data_schema = dictionnary_from_json('schema')

In [None]:
# Creating new columns
# 'severity' starts out as the raw 'Injury' text. It is read from the untouched
# source frame (original column names) and assigned by index alignment onto the
# working copy, whose own column names have already been reformatted.
shark_attacks['severity'] = shark_attacks_df['Injury']

In [None]:
# Selecting columns: keep exactly the columns the schema names, in the order
# the schema yields them
shark_attacks = shark_attacks[list(data_schema)]

In [None]:
# Strip strings
def _strip_if_str(value):
    """Return *value* with surrounding whitespace removed when it is a str, else unchanged."""
    return value.strip() if isinstance(value, str) else value


# Object columns may mix strings with numbers or datetimes. The vectorised
# `.str.strip()` yields NaN for every non-string element, silently erasing
# those values, so strip element-wise and leave non-strings untouched.
shark_attacks = shark_attacks.apply(
    lambda column: column.map(_strip_if_str) if column.dtype == 'object' else column
)

In [None]:
# Handling missing values: turn textual placeholders into real NaN
missing_markers = ['N/A', 'null', '--']
shark_attacks = shark_attacks.replace(missing_markers, np.nan)

In [None]:
# Remove duplicates
# Fully empty rows (if any) used to disappear as mutual duplicates; drop them
# explicitly so that keeping the first duplicate does not let one survive.
shark_attacks.dropna(how='all', inplace=True)
# keep='first' retains one representative of each group of identical rows.
# The previous keep=False discarded every copy, deleting the record itself
# rather than only its repetitions.
shark_attacks.drop_duplicates(keep='first', inplace=True)

In [None]:
# Reset Index
# drop=True discards the old row labels; without it they are inserted as an
# extra 'index' column that is not part of the schema and leaks into the
# cleaned frame.
shark_attacks.reset_index(drop=True, inplace=True)

In [None]:
# Reformat values
# `replacements` maps a column name to the rules handed to
# replace_string_patterns for every cell of that column.
# NOTE(review): both helpers come from `cleaning`; the rule format is defined there.
replacements = dictionnary_from_json('replacements')

for col in replacements:
    shark_attacks[col] = shark_attacks[col].apply(
        lambda cell, rules=replacements[col]: replace_string_patterns(cell, replacements=rules))

# Show the severity entries that are neither of the two expected labels
is_known_severity = shark_attacks['severity'].isin(['INJURY', 'FATALITY'])
uncaught = shark_attacks.loc[~is_known_severity, 'severity']
uncaught

In [None]:
# Reformat country
def _lowercase_if_str(value):
    """Lower-case *value* when it is a str; return anything else unchanged."""
    if isinstance(value, str):
        return value.lower()
    return value


shark_attacks['country'] = shark_attacks['country'].apply(_lowercase_if_str)

# (aliases, canonical name) pairs, applied in this order.
# NOTE(review): replace_to_nan comes from `cleaning` and its return value is
# ignored here; presumably it rewrites the aliases to the canonical name in
# place on the passed column — confirm against its definition.
country_aliases = [
    (["england", "scotland"], "united kingdom"),
    (["usa", "hawaii"], "united states"),
    (["reunion"], "france"),
    (["columbia"], "colombia"),
    (["new guinea"], "papua new guinea"),
]
for aliases, canonical in country_aliases:
    replace_to_nan(shark_attacks["country"], aliases, canonical)


# Unify country: any value that is not a known (lower-cased) country name from
# the reference table becomes NaN.
country_list = [name.lower() for name in countries['Country']]
is_known_country = shark_attacks["country"].isin(country_list)
shark_attacks["country"] = shark_attacks["country"].mask(~is_known_country, np.nan)

In [None]:
# Clean dates
# 1) parse; anything that cannot be parsed becomes NaT
parsed_dates = pd.to_datetime(shark_attacks['date'], errors='coerce')
# 2) render as 'dd-mm-YYYY' text
formatted_dates = parsed_dates.dt.strftime('%d-%m-%Y')
# 3) fill every gap with the previous row's date
# NOTE(review): forward-filling only approximates the real date if rows are in
# chronological order — confirm the sheet's ordering.
shark_attacks['date'] = formatted_dates.ffill()

In [None]:
# Cast correct types: build the column -> dtype mapping from the schema and
# apply it in a single astype call
columns_types = {}
for column_name, column_spec in data_schema.items():
    columns_types[column_name] = column_spec['dtype']

shark_attacks = shark_attacks.astype(columns_types)

In [None]:
# Clean categories
# For every categorical column: restrict the values to the categories declared
# in the schema and map everything else (including missing values) to the
# schema's last category, which acts as the fallback label.
for column in shark_attacks.select_dtypes(include=['category']).columns:
    schema_categories = data_schema[column]['categories']
    # A set has no defined order, so combining it with ordered=True produced an
    # arbitrary category ranking that could change between runs.
    # dict.fromkeys removes duplicates like set() did but keeps the schema order.
    ordered_categories = list(dict.fromkeys(schema_categories))
    fallback = schema_categories[-1]

    shark_attacks[column] = shark_attacks[column].astype('string')
    # Values outside the declared categories become missing here...
    shark_attacks[column] = pd.Categorical(shark_attacks[column], categories=ordered_categories, ordered=True)
    # ...and are then replaced by the fallback category.
    shark_attacks[column] = shark_attacks[column].where(shark_attacks[column].isin(schema_categories), other=fallback)
    shark_attacks[column] = shark_attacks[column].astype('category')


In [None]:
# Add helper columns
def _severity_to_score(label):
    """Map a severity label to a score: 'FATALITY' -> 3, 'INJURY' -> 2, anything else -> 1."""
    if label == 'FATALITY':
        return 3
    if label == 'INJURY':
        return 2
    return 1


shark_attacks['severity_score'] = shark_attacks['severity'].apply(_severity_to_score)

In [None]:
# Clean copy for analysis
# Snapshot the cleaned frame under its own name so that analysis cells working
# on `shark_attacks_clean` do not modify `shark_attacks`.
shark_attacks_clean = shark_attacks.copy()
# Last expression of the cell: renders the cleaned frame in the notebook.
shark_attacks_clean

# ANALYSIS

In [None]:
# Hypothesis: shark attacks are more concentrated in the PM
import matplotlib.pyplot as plt
import seaborn as sns

# Removing the 'UNKNOWN' category turns those entries into missing values,
# which value_counts() leaves out by default.
time_stats = shark_attacks['time'].cat.remove_categories('UNKNOWN')
time_counts = time_stats.value_counts()

# Two-column frame (category, count) for plotting
df_time = pd.DataFrame({
    'Time Category': time_counts.index,
    'Number of Attacks': time_counts.values,
})

sns.barplot(data=df_time, x='Time Category', y='Number of Attacks', palette='Blues')
plt.title('Shark Attacks Concentration by Time of Day')
plt.xlabel('Time Category')
plt.ylabel('Number of Attacks')
plt.xticks(rotation=45)
plt.show()

# Conclusion (from the authors' run): in this dataset, shark attacks are most
# concentrated in the NOON category.

# HYPOTHESIS: SOME SHARK SPECIES ARE MORE DANGEROUS THAN OTHERS

In [None]:
# Group by species and severity score, counting the attacks in each pair;
# rows are species, columns are severity scores.
species_severity_count = (
    shark_attacks_clean
    .groupby(['species', 'severity_score'])
    .size()
    .unstack(fill_value=0)
)

# Drop the MYSTERY SHARK row. errors="ignore" keeps the cell runnable when the
# label is not present in the table.
species_severity_count = species_severity_count.drop("MYSTERY SHARK", errors="ignore")

# Put the score columns in a fixed order. reindex (instead of plain column
# selection) adds an all-zero column for a score that never occurs, where
# selection would raise KeyError.
severity_order = [1, 2, 3]
species_severity_count = species_severity_count.reindex(columns=severity_order, fill_value=0)


# Share of each severity score per species, in percent
total_counts = species_severity_count.sum(axis=1)

percentage_severity = species_severity_count.div(total_counts, axis=0) * 100


# Filter out low-incidence species: keep only those with more than 15 attacks
species_with_high_counts = total_counts[total_counts > 15].index

filtered_species = percentage_severity.loc[species_with_high_counts]

# Convert the index to plain strings so seaborn only sees the species that
# survived the filter
filtered_species.index = filtered_species.index.astype(str)


In [None]:
# Rank species by their percentage of score-3 (FATALITY) attacks, highest first
sorted_by_category_3 = filtered_species.sort_values(3, ascending=False)

# Keep the six highest-ranked species and display them
top_severity_3 = sorted_by_category_3.iloc[:6]
top_severity_3

In [None]:
# MOST DANGEROUS SHARK SPECIES VISUALISATION

import seaborn as sns

# Move the species labels from the index into a regular 'species' column
top_severity_3 = top_severity_3.reset_index()

# Horizontal bar plot: one bar per species, length = % of score-3 attacks
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=top_severity_3[3], y=top_severity_3['species'], palette='viridis', ax=ax)

ax.set_title('Most Dangerous Shark Species!')
ax.set_xlabel('Severity Score 3 (%)')
ax.set_ylabel('Species')

plt.show()

In [None]:
# Rank species by their percentage of score-1 attacks, highest first
top_severity_1 = filtered_species.sort_values(1, ascending=False)

# Keep only species whose score-3 (fatality) share is below 5 %, then take the top six
has_low_fatality = top_severity_1[3] < 5
top_severity_1_notdeadly = top_severity_1[has_low_fatality].head(6)
display(top_severity_1_notdeadly)

In [None]:
# FRIENDLIEST SHARK SPECIES VISUALISATION

# Move the species labels from the index into a regular 'species' column
top_severity_1_notdeadly = top_severity_1_notdeadly.reset_index()

# Horizontal bar plot: one bar per species, length = % of score-1 attacks
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=top_severity_1_notdeadly[1], y=top_severity_1_notdeadly['species'], palette='viridis', ax=ax)

ax.set_title('Friendliest Shark Species :)')
ax.set_xlabel('Severity Score 1 (%)')
ax.set_ylabel('Species')

plt.show()

# PLAN

In [None]:
# Source, PDF, Case Number as possible duplicate finder or year / date fill

# Henning : Date, Type
# Ricardo : Country, State
# Linh : Location, Activity
# Jp : Injury, Time

# Type : category : Merge some columns based on categories. Trim labels. Nan into invalid. Final => Provoked, Unprovoked, Invalid
# Date : datetime : Clean "Reported" - Harmonize Format - Cast weird into NaT. Final => Dates (as datetime), NaT
# Country : string : Strip spaces - Formatting - Replace weird characters - cast weird values as NaN => Strings, Nan
# State : string : Strip spaces - Formatting - Replace weird characters - cast weird values as NaN => Strings, Nan
# Location : string : Strip spaces - Formatting - Replace weird characters - cast weird values as NaN => Strings, Nan
# Activity : category : Merge some columns based on categories. Trim labels. Nan into invalid. Final => Few categories to be determined
# Injury : category : Merge columns based on keywords. Nan into other. Final => Fatality, Injury, Other
# Time : category : Cast into categories Final => morning (6-10) noon (10-14) afternoon( 14-18) dusk (18-22) night (22 - 2) dawn (2-6) maybe as integers (0-5)

# Todo
# Remove obvious duplicate (entire line) - Ricardo
# Remove fuzzy duplicates (case number? dates?) - Ricardo

# Functions :
# Merge categories : (*categories to be merged, target) - Henning
# Strip function : strips spaces - Linh
# Replace as Nan, Nat, ... function - Ricardo
# DONE - Replace by keyword function - Jp
# DONE - Cast to dateTime function
# Matching function (find similarities, keyword based?)
# DONE - Reformat dates, strings
# Filter function

## Selecting
- Select relevant columns
- Analyse relevant columns

## Cleaning :
- Cast to appropriate data types
    - General cleaning
        - identify duplicates
            - fuzzy
        - removing duplicates
            - remove
            - merge
        - handling null values
            - remove
            - replace
        - manipulating strings
        - formatting the data.

- Wrong inputs
- Outliers

### GENERAL CLEANING
1. Rename columns
2. Drop columns
3. Remove duplicates
    - Remove full dupes
    - Remove fuzzy search
4. Strip values
5. Reset Index

### SPECIFIC CLEANING
1. Search / Replace / Reformat strings
2. Merge categories
3. Cast to Null
4. Cast correct type
5. Create new columns


## Hypothesis

### TIME
- Shark attacks are seasonal (Summer)
- Shark attacks are increasing 
- Shark attacks are more concentrated in the PM 

### LOCATION
- Some countries are more attack prone (Australia)
- Some countries are more likely to be fatal (Australia)


### DEMOGRAPHICS
- Males are more likely to get attacked
- Males are more likely to provoke a shark
- Provoked attacks are more fatal
- Young persons are more likely to get attacked
- Old persons are more likely to get killed

- Names more likely to get attacked (John)

### OTHER
- Some species are more aggressive (Tiger Shark)
- Some activities are more likely (Surfing)
- Some activities are more fatal

- Full moon? 😂


Retained :
- Shark Species have a gender preference (score) => JP => Henning
- Some countries are more provocative against sharks* => Ricardo
- Shark attacks are more concentrated in the PM => Linh

- Names more likely to get attacked (John) => JP?

## Business Ideas

- App that gives a likelihood of attack based on location and time
- Vacations far away from sharks for phobics
- Witness attacks for masochists

- Surf school at the safest places / seasons
- Fishing supplies => shark repellant by activities
- Safety training to avoid provocations / live in harmony

- Shark repellant => best spots
- Insurance for surfers, premiums for high risk areas