In [229]:
# Initial Setup:
import pandas as pd
import numpy as np
import re

url = 'https://www.sharkattackfile.net/spreadsheets/GSAF5.xls'
df = pd.read_excel(url)

In [230]:
# Preview of raw DataFrame:
print("head:")
print(df.head())

print("\ninfo:")
df.info()

print("\ndescribe:")
print(df.describe())

print("\ncolumns:")
print(df.columns.tolist())

head:
          Date    Year        Type    Country              State  \
0  15 Mar 2024  2024.0  Unprovoked  AUSTRALIA         Queensland   
1  04 Mar 2024  2024.0  Unprovoked        USA             Hawaii   
2  02 Mar-2024  2024.0  Unprovoked        USA             Hawaii   
3  25 Feb-2024  2024.0  Unprovoked  AUSTRALIA  Western Australia   
4  14 Feb-2024  2024.0  Unprovoked      INDIA        Maharashtra   

                           Location  Activity                 Name Sex  Age  \
0                     Bargara Beach  Swimming       Brooklyn Sauer   F   13   
1                Old Man's, Waikiki   Surfing        Matthew White   M  NaN   
2                    Rainbows, Oahu  Swimming                  NaN   F   11   
3        Sandlnd Island, Jurian Bay       NaN               female   F   46   
4  Vaitarna River, Palghar District   Fishing  Vicky Suresh Govari   M   32   

   ...        Species                      Source  pdf href formula href  \
0  ...     Tiger shark      Yahoo 

In [231]:
# DataFrame cleaning preparation:

# 1. Dropping unneeded columns and duplicates:
columns_to_drop = ["Source", "Location", "Injury", "Name", "pdf", "href formula",
                   "href", "Case Number", "Case Number.1", "original order",
                   "Unnamed: 21", "Unnamed: 22", "Species "]

df = df.drop(columns=columns_to_drop).drop_duplicates().reset_index(drop=True)

# 2. Filter rows with Year > 1800 and remove unneeded 'Types':
df = df[df["Year"] > 1800]

# 3. Types of shark attacks to exclude:
undesired_types = ["Questionable", "Boat", "Provoked", "Provoked ", "?",
                   "Unverified", "Under investigation", "Unconfirmed"]

df = df[~df["Type"].isin(undesired_types)]

# 4. Renaming and reformatting columns:
df.columns = [col.strip().replace(" ", "_").replace(".", "").lower() for col in df.columns]
df.rename(columns={'unnamed:_11': 'fatal'}, inplace=True)

# 5. Creating a copy of the original DataFrame for further manipulation
df_copy = df.copy()

# Check:
print(df.columns.tolist())

['date', 'year', 'type', 'country', 'state', 'activity', 'sex', 'age', 'fatal', 'time']


In [232]:
# Cleaning 'fatal' column:
value_map = {'n': 'n', 'y': 'y'}

df_copy['fatal'] = df_copy['fatal'].str.strip().str.lower().map(value_map)

# Calculate mode:
fatal_mode = df_copy['fatal'].mode()[0]

# Replace NaNs with 'n':
df_copy['fatal'] = df_copy['fatal'].fillna(fatal_mode)

# Check:
df_copy['fatal'].value_counts(dropna=False)

fatal
n    4740
y    1344
Name: count, dtype: int64

In [233]:
# Cleaning 'sex' column:
df_copy['sex'] = df_copy['sex'].str.strip().str.lower()

# Replace wrong values with 'unknown':
invalid_entries = ['lli', 'm x 2', 'n', '.']
for entry in invalid_entries:
    df_copy['sex'] = df_copy['sex'].replace(entry, 'unknown')

# Replace missing values with 'unknown':
df_copy['sex'].fillna('unknown')

# Check:
df_copy['sex'].value_counts(dropna=False)

sex
m          4856
f           729
NaN         495
unknown       4
Name: count, dtype: int64

In [234]:
#Cleaning 'type' column:

# Remove 'invalid' values:
df_copy = df_copy[df_copy['type'] != 'invalid']

# Replace 'NaN' with 'unknown':
df_copy['type'].fillna('unknown')

# Check:
df_copy['type'].value_counts(dropna=False)

type
Unprovoked      4937
Invalid          546
Watercraft       349
Sea Disaster     234
NaN               16
 Provoked          2
Name: count, dtype: int64

In [235]:


# 1. Convert 'time' column to strings (to avoid issues with NaN when splitting)
df_copy['time'] = df_copy['time'].astype(str)

# 2. Function to validate if time is in the correct "hhmm" format (e.g., "16h30")
def validate_time_format(time_str):
    # Regex pattern for valid time formats like "16h00", "01h50", etc.
    pattern = r'^\d{2}h\d{2}$'
    if re.match(pattern, time_str):
        return time_str
    else:
        return None  # Invalid format, will replace with mean later

# 3. Apply the validation function and keep only valid times
df_copy['time_numeric'] = df_copy['time'].apply(lambda x: x.replace('h', '') if validate_time_format(x) else None)

# 4. Convert the 'time_numeric' to numeric, replacing invalid parsing with None
df_copy['time_numeric'] = pd.to_numeric(df_copy['time_numeric'], errors='coerce')

# 5. Calculate the mean time, ignoring NaNs
mean_time = df_copy['time_numeric'].mean()

# 6. Fill NaN values (both invalid format and actual NaNs) with the calculated mean time
df_copy['time_numeric'].fillna(round(mean_time), inplace=True)

# 7. Convert all values in 'time_numeric' to integers
df_copy['time_numeric'] = df_copy['time_numeric'].astype(int)

# Output the updated dataframe
print(df)


                      date    year          type    country  \
0              15 Mar 2024  2024.0    Unprovoked  AUSTRALIA   
1              04 Mar 2024  2024.0    Unprovoked        USA   
2              02 Mar-2024  2024.0    Unprovoked        USA   
3              25 Feb-2024  2024.0    Unprovoked  AUSTRALIA   
4              14 Feb-2024  2024.0    Unprovoked      INDIA   
...                    ...     ...           ...        ...   
6737              Sep-1805  1805.0       Invalid        USA   
6738  Reported 26-Feb-1804  1804.0    Watercraft  AUSTRALIA   
6739           May-17-1803  1803.0  Sea Disaster        USA   
6740              Mar-1803  1803.0    Unprovoked  AUSTRALIA   
6741  Reported Apr-13-1802  1802.0    Unprovoked      INDIA   

                  state  activity  sex  age fatal   time  
0            Queensland  Swimming    F   13   NaN  16h00  
1                Hawaii   Surfing    M  NaN     N    NaN  
2                Hawaii  Swimming    F   11     N  13h30  
3     W

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_copy['time_numeric'].fillna(round(mean_time), inplace=True)


In [236]:
# Cleaning 'age' column:

# Converting 'age' to numeric, setting errors to NaN:
df_copy['age'] = pd.to_numeric(df_copy['age'], errors='coerce')

# Calculating the mean age, excluding NaNs:
age_mean = df_copy['age'].mean()
print(f"Mean age: {age_mean}")

# Filling missing 'age' values with the calculated mean:
df_copy['age'].fillna(age_mean)

# Rounding 'age' values to nearest whole number:
df_copy['age'] = df_copy['age'].round(0)

# Check:
df_copy['age'].value_counts(dropna=False)

Mean age: 27.718184429761564


age
NaN     2603
17.0     167
18.0     146
15.0     142
16.0     140
        ... 
72.0       1
86.0       1
84.0       1
87.0       1
81.0       1
Name: count, Length: 82, dtype: int64

In [237]:
# Subsequent overview of v0.5:
# Cell to be (re)moved or updated as we refine our analysis and finalize other KPIs.

print("head:")
print(df_copy.head())

print("\ninfo:")
df_copy.info()

print("\ndescribe:")
print(df_copy.describe())

print("\ncolumns:")
print(df_copy.columns.tolist())

print("\nmissing values:")
print(df_copy.isnull().sum()[df_copy.isnull().sum() > 0])

# Pending Cleaning Columns: 'date', 'year', 'activity', 'country', 'state'.
# Note on Geo Data: Considering merging 'country' and 'state' into 'location' to preserve data and enhance precision.

# Upcoming in v0.6:
# - Cleaning for 'date', 'year', 'type' and their integration.
# - Decision pending on creating a 'location' column and its integration.

# Uniform Commentary: Please keep documentation clear and accessible.

head:
          date    year        type    country              state  activity  \
0  15 Mar 2024  2024.0  Unprovoked  AUSTRALIA         Queensland  Swimming   
1  04 Mar 2024  2024.0  Unprovoked        USA             Hawaii   Surfing   
2  02 Mar-2024  2024.0  Unprovoked        USA             Hawaii  Swimming   
3  25 Feb-2024  2024.0  Unprovoked  AUSTRALIA  Western Australia       NaN   
4  14 Feb-2024  2024.0  Unprovoked      INDIA        Maharashtra   Fishing   

  sex   age fatal   time  time_numeric  
0   f  13.0     n  16h00          1600  
1   m   NaN     n    nan          1320  
2   f  11.0     n  13h30          1330  
3   f  46.0     n  11h30          1130  
4   m  32.0     n    nan          1320  

info:
<class 'pandas.core.frame.DataFrame'>
Index: 6084 entries, 0 to 6741
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   date          6084 non-null   object 
 1   year          6084 non-null   float6

In [238]:
# Date formatting column

from datetime import datetime

# Function to reformat the date column
def reformat_date(date_str):
    # If the value is not a string, return "Invalid date"
    if not isinstance(date_str, str):
        return "Invalid date"
    
    # Patterns to match different formats
    full_date_pattern = r'(\d{1,2})-(\w{3})-(\d{4})'  # 16-Jan-2024
    month_year_pattern = r'(\w{3})-(\d{4})'  # Jan-2024
    year_pattern = r'(\d{4})'  # 2024
    
    # Removing any leading words like "Reported"
    date_str = re.sub(r'Reported\s*', '', date_str)
    
    try:
        # Match full date (day-month-year)
        if re.match(full_date_pattern, date_str):
            parsed_date = datetime.strptime(date_str, "%d-%b-%Y")
            return parsed_date.strftime("%m-%Y")  # Return correct month-year
        
        # Match month-year
        elif re.match(month_year_pattern, date_str):
            parsed_date = datetime.strptime(date_str, "%b-%Y")
            return parsed_date.strftime("%m-%Y")  # Return correct month-year
        
        # Match only year
        elif re.match(year_pattern, date_str) and len(date_str) == 4:
            return f"00-{date_str}"  # If only year, set month as '00'
        
        # If nothing matches, return Invalid date
        else:
            return "Invalid date"
    
    except Exception:
        return "Invalid date"

df_copy['date'] = df_copy['date'].astype(str)
df_copy['date'] = df_copy['date'].apply(reformat_date)

In [239]:
# Seasonality function and new seasonality column

def get_seasonality(formatted_date):
    try:
        month = int(formatted_date.split('-')[0])
        if month in [12, 1, 2]:
            return "Winter"
        elif month in [3, 4, 5]:
            return "Spring"
        elif month in [6, 7, 8]:
            return "Summer"
        elif month in [9, 10, 11]:
            return "Autumn"
        else:
            return "Unknown"
    except:
        return "Unknown"

df_copy['seasonality'] = df_copy['date'].apply(get_seasonality)

In [240]:
# Seasonality stats
valid_seasons_df = df_copy[df_copy['seasonality'] != "Unknown"]
season_counts = valid_seasons_df['seasonality'].value_counts()
total_count = len(valid_seasons_df)
season_percentage = (season_counts / total_count) * 100
print(season_percentage)

seasonality
Summer    30.579869
Autumn    24.507659
Winter    23.340627
Spring    21.571845
Name: count, dtype: float64


In [241]:
# Replace 'NaN' with 'unknown':
df_copy['activity'].fillna('unknown', inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_copy['activity'].fillna('unknown', inplace=True)


In [242]:
df_copy.head()

Unnamed: 0,date,year,type,country,state,activity,sex,age,fatal,time,time_numeric,seasonality
0,Invalid date,2024.0,Unprovoked,AUSTRALIA,Queensland,Swimming,f,13.0,n,16h00,1600,Unknown
1,Invalid date,2024.0,Unprovoked,USA,Hawaii,Surfing,m,,n,,1320,Unknown
2,Invalid date,2024.0,Unprovoked,USA,Hawaii,Swimming,f,11.0,n,13h30,1330,Unknown
3,Invalid date,2024.0,Unprovoked,AUSTRALIA,Western Australia,unknown,f,46.0,n,11h30,1130,Unknown
4,Invalid date,2024.0,Unprovoked,INDIA,Maharashtra,Fishing,m,32.0,n,,1320,Unknown


In [243]:
# Merging 'country' and 'state' into 'location' to preserve data and enhance precision.
"""df_location = df_copy[["country", "state"]].fillna("")
df_location["location"] = df_location["country"] + ", " + df_location["state"]
df_location["location"] = df_location["location"].str.strip(", ")
df_copy = df_copy.merge(df_location[["country", "state", "location"]], on = ["country", "state"])
df_copy"""

'df_location = df_copy[["country", "state"]].fillna("")\ndf_location["location"] = df_location["country"] + ", " + df_location["state"]\ndf_location["location"] = df_location["location"].str.strip(", ")\ndf_copy = df_copy.merge(df_location[["country", "state", "location"]], on = ["country", "state"])\ndf_copy'