In [2]:
# Initial Setup:
import pandas as pd
import numpy as np
import re

url = 'https://www.sharkattackfile.net/spreadsheets/GSAF5.xls'
df = pd.read_excel(url)

In [3]:
# Preview of raw DataFrame:
print("head:")
print(df.head())

print("\ninfo:")
df.info()

print("\ndescribe:")
print(df.describe())

print("\ncolumns:")
print(df.columns.tolist())

head:
          Date    Year        Type    Country              State  \
0  15 Mar 2024  2024.0  Unprovoked  AUSTRALIA         Queensland   
1  04 Mar 2024  2024.0  Unprovoked        USA             Hawaii   
2  02 Mar-2024  2024.0  Unprovoked        USA             Hawaii   
3  25 Feb-2024  2024.0  Unprovoked  AUSTRALIA  Western Australia   
4  14 Feb-2024  2024.0  Unprovoked      INDIA        Maharashtra   

                           Location  Activity                 Name Sex  Age  \
0                     Bargara Beach  Swimming       Brooklyn Sauer   F   13   
1                Old Man's, Waikiki   Surfing        Matthew White   M  NaN   
2                    Rainbows, Oahu  Swimming                  NaN   F   11   
3        Sandlnd Island, Jurian Bay       NaN               female   F   46   
4  Vaitarna River, Palghar District   Fishing  Vicky Suresh Govari   M   32   

   ...        Species                      Source  pdf href formula href  \
0  ...     Tiger shark      Yahoo 

In [4]:
# DataFrame cleaning preparation:

# 1. Dropping unneeded columns and duplicates:
columns_to_drop = ["Source", "Location", "Injury", "Name", "pdf", "href formula",
                   "href", "Case Number", "Case Number.1", "original order",
                   "Unnamed: 21", "Unnamed: 22", "Species "]

df = df.drop(columns=columns_to_drop).drop_duplicates().reset_index(drop=True)

# 2. Filter rows with Year > 1800 and remove unneeded 'Types':
df = df[df["Year"] > 1800]

# 3. Types of shark attacks to exclude:
undesired_types = ["Questionable", "Boat", "Provoked", "Provoked ", "?",
                   "Unverified", "Under investigation", "Unconfirmed"]

df = df[~df["Type"].isin(undesired_types)]

# 4. Renaming and reformatting columns:
df.columns = [col.strip().replace(" ", "_").replace(".", "").lower() for col in df.columns]
df.rename(columns={'unnamed:_11': 'fatal'}, inplace=True)

# 5. Creating a copy of the original DataFrame for further manipulation
df_copy = df.copy()

# Check:
print(df.columns.tolist())

['date', 'year', 'type', 'country', 'state', 'activity', 'sex', 'age', 'fatal', 'time']


In [5]:
# Cleaning 'fatal' column:
value_map = {'n': 'n', 'y': 'y'}

df_copy['fatal'] = df_copy['fatal'].str.strip().str.lower().map(value_map)

# Calculate mode:
fatal_mode = df_copy['fatal'].mode()[0]

# Replace NaNs with 'n':
df_copy['fatal'] = df_copy['fatal'].fillna(fatal_mode)

# Check:
df_copy['fatal'].value_counts(dropna=False)

fatal
n    4740
y    1344
Name: count, dtype: int64

In [6]:
# Cleaning 'sex' column:
df_copy['sex'] = df_copy['sex'].str.strip().str.lower()

# Replace wrong values with 'unknown':
invalid_entries = ['lli', 'm x 2', 'n', '.']
for entry in invalid_entries:
    df_copy['sex'] = df_copy['sex'].replace(entry, 'unknown')

# Replace missing values with 'unknown':
df_copy['sex'].fillna('unknown', inplace=True)

# Check:
df_copy['sex'].value_counts(dropna=False)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_copy['sex'].fillna('unknown', inplace=True)


sex
m          4856
f           729
unknown     499
Name: count, dtype: int64

In [7]:
#Cleaning 'type' column:

# Remove 'invalid' values:
df_copy = df_copy[df_copy['type'] != 'invalid']

# Replace 'NaN' with 'unknown':
df_copy['type'].fillna('unknown', inplace=True)

# Check:
df_copy['type'].value_counts(dropna=False)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_copy['type'].fillna('unknown', inplace=True)


type
Unprovoked      4937
Invalid          546
Watercraft       349
Sea Disaster     234
unknown           16
 Provoked          2
Name: count, dtype: int64

In [8]:
# Cleaning 'time' column:
df_copy['time'] = df_copy['time'].astype(str)

# Function to validate if time is in the correct "hhmm" format (e.g., "1630"):
def validate_time_format(time_str):
    # Regex pattern for valid time formats like "1600", "0150", etc.
    pattern = r'^\d{2}h\d{2}$'
    if re.match(pattern, time_str):
        return time_str
    else:
        return None  # Invalid format, will replace with mean later.

# Apply the function and remove 'h', replace invalid/missing times with 'None':
df_copy['time'] = df_copy['time'].apply(lambda x: x.replace('h', '') if validate_time_format(x) else None)

# Convert 'time' to numeric, keeping NaN as it is to calculate mean time later:
df_copy['time'] = pd.to_numeric(df_copy['time'], errors='coerce')

# Calculate the mean time, ignoring NaN values in the calculation:
mean_time = int(df_copy['time'].mean(skipna=True))

# Fill in missing or invalid times with the mean time calculated
df_copy['time'].fillna(mean_time, inplace=True)

# Ensure 'time' column is integer:
df_copy['time'] = df_copy['time'].astype(int)

# Check:
df_copy['time'].value_counts(dropna=False)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_copy['time'].fillna(mean_time, inplace=True)


time
1319    3543
1100     130
1500     119
1200     116
1600     115
        ... 
1623       1
1955       1
130        1
858        1
150        1
Name: count, Length: 266, dtype: int64

In [9]:
# Cleaning 'age' column:

# Converting 'age' to numeric, setting errors to NaN:
df_copy['age'] = pd.to_numeric(df_copy['age'], errors='coerce')

# Calculating the mean age, excluding NaNs:
age_mean = df_copy['age'].mean()
print(f"Mean age: {age_mean}")

# Filling missing 'age' values with the calculated mean:
df_copy['age'].fillna(age_mean, inplace=True)

# Rounding 'age' values to nearest whole number:
df_copy['age'] = df_copy['age'].round(0)

# Check:
df_copy['age'].value_counts(dropna=False)

Mean age: 27.718184429761564


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_copy['age'].fillna(age_mean, inplace=True)


age
28.0    2685
17.0     167
18.0     146
15.0     142
16.0     140
        ... 
72.0       1
84.0       1
86.0       1
87.0       1
81.0       1
Name: count, Length: 81, dtype: int64

In [10]:
# Subsequent overview of v0.5:
# Cell to be (re)moved or updated as we refine our analysis and finalize other KPIs.

print("head:")
print(df_copy.head())

print("\ninfo:")
df_copy.info()

print("\ndescribe:")
print(df_copy.describe())

print("\ncolumns:")
print(df_copy.columns.tolist())

print("\nmissing values:")
print(df_copy.isnull().sum()[df_copy.isnull().sum() > 0])

# Pending Cleaning Columns: 'date', 'year', 'activity', 'country', 'state'.
# Note on Geo Data: Considering merging 'country' and 'state' into 'location' to preserve data and enhance precision.

# Upcoming in v0.6:
# - Cleaning for 'date', 'year', 'type' and their integration.
# - Decision pending on creating a 'location' column and its integration.

# Uniform Commentary: Please keep documentation clear and accessible.

head:
          date    year        type    country              state  activity  \
0  15 Mar 2024  2024.0  Unprovoked  AUSTRALIA         Queensland  Swimming   
1  04 Mar 2024  2024.0  Unprovoked        USA             Hawaii   Surfing   
2  02 Mar-2024  2024.0  Unprovoked        USA             Hawaii  Swimming   
3  25 Feb-2024  2024.0  Unprovoked  AUSTRALIA  Western Australia       NaN   
4  14 Feb-2024  2024.0  Unprovoked      INDIA        Maharashtra   Fishing   

  sex   age fatal  time  
0   f  13.0     n  1600  
1   m  28.0     n  1319  
2   f  11.0     n  1330  
3   f  46.0     n  1130  
4   m  32.0     n  1319  

info:
<class 'pandas.core.frame.DataFrame'>
Index: 6084 entries, 0 to 6741
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   date      6084 non-null   object 
 1   year      6084 non-null   float64
 2   type      6084 non-null   object 
 3   country   6047 non-null   object 
 4   state     5686 non-nu