In [143]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import matplotlib.font_manager
import seaborn as sns
%matplotlib inline
from matplotlib.pyplot import figure
import plotly.express as px
from scipy import stats
import plotly.graph_objects as go

In [None]:
# Mount Google Drive inside the Colab runtime so files under /content/drive become readable
from google.colab import drive

drive.mount('/content/drive')

In [None]:
# Load the google.colab.data_table IPython extension (Colab's interactive table display for DataFrames)
%load_ext google.colab.data_table

In [None]:
# Read the raw attack records from the mounted Drive; the file is decoded as latin-1
shark_attack = pd.read_csv(
    "/content/drive/MyDrive/RawData/attacks.csv",
    encoding='latin-1',
)

# Preview every row, restricted to the columns from 'Date' through 'Species '
# (the header still carries its trailing space at this point)
shark_attack.loc[0:, 'Date':'Species ']

In [None]:
# List the column labels of the loaded dataset
shark_attack.columns

In [148]:
# Drop the trailing space from the 'Species ' header so the column can be addressed as 'Species'
shark_attack = shark_attack.rename(columns={'Species ': 'Species'})

In [None]:
# Show every distinct entry of the 'Species' column, one per line
species = shark_attack['Species'].unique()
for species_name in species:
    print(species_name)

In [150]:
# Known shark species names; rows are later kept only when their 'Species'
# value is exactly one of these strings.
list_of_species = [
    "White shark",
    "Tiger shark",
    "Lemon shark",
    "Bull shark",
    "Grey reef shark",
    "Tawny nurse shark",
    "Wobbegong shark",
    "Blacktip shark",
    "Galapagos shark",
    "Oceanic whitetip shark",
    "Cookiecutter shark",
    "Spinner shark",
    "Angel shark",
    "Dogfish shark",
    "Bronze whaler shark",
    "Reef shark",
    "Hammerhead shark",
    "Thresher shark",
    "Spurdog",
    "Lesser spotted dogfish",
    "Longfin mako shark",
    "Shortfin mako shark",
    "Goblin shark",
    "Soupfin shark",
    "Leopard shark",
    "Porbeagle",
    "Seven-gill shark",
    "Raggedtooth shark",
    "Sandtiger shark",
    "Cow shark",
    "Whitetip reef shark",
    "Salmon shark",
]

In [151]:
# Keep only the rows whose 'Species' value exactly matches an entry of list_of_species
shark_attack_species = shark_attack[
    shark_attack["Species"].isin(list_of_species)
]

In [None]:
# Question 1: What are the most dangerous types of sharks to humans?
# Working assumption: the species with the most attacks (split into fatal and non-fatal) are the most dangerous to people.

# The 20 species with the most recorded attacks, most frequent first
species_attack = (
    shark_attack_species
    .groupby('Species')['Species']
    .count()
    .sort_values(ascending=False)
    .iloc[:20]
)

# Per-species tally of every 'Fatal (Y/N)' value, put in the same order as
# species_attack, with the 'Y'/'N' columns relabelled for the chart legend
fatal_counts = (
    shark_attack_species
    .groupby('Species')['Fatal (Y/N)']
    .value_counts()
    .unstack(fill_value=0)
    .loc[species_attack.index]
    .rename(columns={'Y': 'Fatal', 'N': 'Not Fatal'})
)

# Bar chart of non-fatal and fatal attack counts per species
fig = px.bar(
    fatal_counts,
    x=fatal_counts.index,
    y=['Not Fatal', 'Fatal'],
    title='Shark Attacks by Species and Fatality',
    labels={'index': 'Species', 'value': 'Number of Attacks'},
    height=400,
)

fig.show()

In [None]:
# Question 2: Are children more likely to be attacked by sharks?

# Inspect the distinct raw values stored in the 'Age' column
shark_attack['Age'].unique()

In [154]:
# Create a function to extract numeric values from a string
def extract_numeric_age(age_str):
    """Convert a raw 'Age' entry to a float, or NaN when it is not a plain number.

    Parameters
    ----------
    age_str : object
        A value taken from the 'Age' column. Anything ``float()`` accepts
        (e.g. ``"18"``, ``" 7 "``, ``30``) is converted.

    Returns
    -------
    float
        The parsed age, or ``np.nan`` when the value cannot be converted.
    """
    try:
        return float(age_str)
    except (TypeError, ValueError):
        # ValueError: text that float() cannot parse (e.g. "teen", "20s").
        # TypeError: values float() does not accept at all, such as None.
        # Both mean "no usable numeric age", so they map to NaN rather
        # than aborting the whole .apply() call.
        return np.nan

In [155]:
# Parse 'Age' into a new 'NumericAge' column with extract_numeric_age, then
# keep only the rows for which a numeric age was obtained
shark_attack = (
    shark_attack
    .assign(NumericAge=shark_attack['Age'].apply(extract_numeric_age))
    .dropna(subset=['NumericAge'])
)

In [None]:
# Distinct 'Age' values, inspected again now that rows without a numeric age have been dropped
shark_attack['Age'].unique()

In [None]:
# To answer the question, a column is added to distinguish between shark attacks on children (here: under 18 years old) and adults (18 years and older).
# Adding column Child

# Compare the parsed numeric age, not the raw 'Age' text: string comparison is
# lexicographic, so '5' < '18' would be False and '100' < '18' would be True.
shark_attack['Child'] = shark_attack['NumericAge'] < 18

# For visualization purposes, move 'Child' to column position 11
shark_attack.insert(11, 'Child', shark_attack.pop('Child'))
shark_attack.loc[0:,'Date':'Child']

In [None]:
# Ultimately, the graph shows that the number of attacks on children is much lower than the number of attacks on people aged 18 and over.

# Count the rows per 'Child' value. value_counts() orders by frequency, so the
# result is reindexed to a fixed False, True order; this keeps the hard-coded
# tick labels below attached to the correct bars whichever group is larger.
attack_on_children = (
    shark_attack['Child']
    .value_counts()
    .reindex([False, True], fill_value=0)
)
attack_on_children.plot(kind='bar')

# Labels graph
plt.title('Number of attacks on children')
plt.xlabel('Attacks on Child')
plt.ylabel('Number of attacks')
plt.xticks([0, 1], ['False', 'True'])

plt.show()

In [None]:
# Question 3: Are shark attacks where sharks were provoked more or less dangerous?
# I want to see the total number of attacks in my line graph and add the numbers of provoked and unprovoked attacks to see a clear difference.
# In the graph you can see a big difference between unprovoked attacks and provoked attacks.
# Given the large difference between provoked and unprovoked attacks, I would assume that even if you don't provoke an attack, you still have a high chance of being attacked by a shark.
# Not to mention the fact that I wouldn't provoke a shark to attack anyway.

# To make the graph easier to read, only data from 1800 onwards is shown. Few attacks took place before this.
shark_attack = shark_attack[shark_attack['Year'] >= 1800]

# Calculate the number of 'Provoked' attacks per year
provoked_attacks = shark_attack[shark_attack['Type'] == 'Provoked']['Year'].value_counts().sort_index()

# Calculate the number of 'Unprovoked' attacks per year
unprovoked_attacks = shark_attack[shark_attack['Type'] == 'Unprovoked']['Year'].value_counts().sort_index()

# Put both series on the same set of years, using 0 for a year in which one
# type did not occur. Adding the unaligned series directly would produce NaN
# for every year missing from either of them, leaving holes in the total.
all_years = provoked_attacks.index.union(unprovoked_attacks.index)
provoked_attacks = provoked_attacks.reindex(all_years, fill_value=0)
unprovoked_attacks = unprovoked_attacks.reindex(all_years, fill_value=0)

# Calculate the total number of attacks (Provoked + Unprovoked) per year
total_attacks = provoked_attacks + unprovoked_attacks

# Make a line graph
plt.figure(figsize=(20, 6))

plt.plot(total_attacks.index, total_attacks.values, 'r-', label='Total Attacks')
plt.plot(provoked_attacks.index, provoked_attacks.values, 'y-', label='Provoked attacks')
plt.plot(unprovoked_attacks.index, unprovoked_attacks.values, 'g-', label='Unprovoked attacks')

plt.xlabel('Year')
plt.ylabel('Total Attacks')
plt.title('Attacks since 1800')

plt.legend()

plt.show()

In [None]:
# Inspect the distinct values stored in the 'Activity' column
shark_attack['Activity'].unique()

In [None]:
# Question 4: Are certain activities more likely to result in a shark attack?
# Approach: pick a set of common activity keywords, count how many 'Activity'
# entries mention each one, and chart the counts from highest to lowest.

# Activity keywords to look for
activities = [
    'Swimming', 'Surfing', 'Fishing', 'Diving', 'Wading', 'Snorkeling',
    'Body boarding', 'Boogie boarding', 'Paddle-boarding', 'Kayaking',
    'Paddle-skiing', 'Kite surfing', 'Paddle boarding', 'Lobstering',
    'Kiteboarding', 'Spearfishing', 'Rowing',
]

# Keyword -> number of rows whose 'Activity' text contains it
# (case-insensitive; rows with a missing activity count as no match)
activity_counts = {
    activity: shark_attack['Activity'].str.contains(activity, case=False, na=False).sum()
    for activity in activities
}

# Series form of the counts, for sorting and plotting
activity_counts_series = pd.Series(activity_counts)

# Bar chart, most frequent activity first
plt.figure(figsize=(12, 6))
ranked_counts = activity_counts_series.sort_values(ascending=False)
ranked_counts.plot(kind='bar', color='lightblue')
plt.title('Number of shark attacks per activity')
plt.xlabel('Activity')
plt.ylabel('Number of attacks')
plt.xticks(rotation=45)
plt.show()
