In [1]:
# Library Imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import pickle
import joblib
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, roc_auc_score
warnings.filterwarnings('ignore')

In [2]:
# Viewing Data
df = pd.read_csv("global_shark_attacks.csv")
df.tail()

Unnamed: 0,date,year,type,country,area,location,activity,name,sex,age,fatal_y_n,time,species
6885,,,Sea Disaster,GREECE,Off Thessaly,,Shipwrecked Persian Fleet,males,M,,Y,,
6886,,,Unprovoked,INDONESIA,Riau Province,"Natuna Islands, between Sumatra & Kalimantan i...",Swimming near anchored ship,a ship's engineer,M,,Y,,
6887,,,Unprovoked,IRAQ,Basrah,Shatt-el Arab River near a small boat stand,Swimming,male,M,13 or 14,Y,Afternoon,Bull shark
6888,,,Unprovoked,SOUTH AFRICA,KwaZulu-Natal,Durban,Crew swimming alongside their anchored ship,male,M,,Y,,
6889,,,Unprovoked,AUSTRALIA,Western Australia,,Pearl diving,Ahmun,M,,Y,,


In [3]:
# date, year, area, location, name, sex, age are not needed as they dont affect directly
df.drop(['date', 'year', 'area', 'location', 'name', 'sex', 'age'], axis=1, inplace=True)
df.dropna(inplace=True)
print(len(df))
df.head()

2082


Unnamed: 0,type,country,activity,fatal_y_n,time,species
0,Unprovoked,AUSTRALIA,Surfing,Y,10h10,White shark
1,Unprovoked,AUSTRALIA,Swimming,N,11h20,1m shark
2,Unprovoked,AUSTRALIA,Spearfishing,N,11h30,Bull shark
6,Unprovoked,USA,Surfing,N,16h00,5.5 ft shark
10,Unprovoked,USA,Swimming,N,16h00,"Blacktip shark, 6' to 7'"


In [4]:
# Working with time column
df.time.unique()

array(['10h10', '11h20', '11h30', '16h00', '14h00', 'Evening',
       'Early afternoon', '18h00', '15h30', '15h00', 'Morning', '12h00',
       '10h30', '11h00', '10h25', '09h00', '17h30', '13h00', '08h45',
       '12h50', '18h30', '14h20', '07h30', '08h30', '08h00', '10h00',
       '16h10', '14h35', '18h05', '12h35', '16h30', '14h30', '17h00',
       '12h54', '12h45', '17h50', '08h15', '09h30', '15h35', 'Night',
       '13h30', '09h00 - 09h30', '10h45', 'Late afternoon', '09h15',
       '17h15', 'Afternoon', 'A.M.', '07h58', '20h30', '13h40', '14h11',
       '12h30', '08h20', '15h19', '07h15', '06h00', '20h00', '12h15',
       '13h45', '08h50', '12h05', '>06h45', '16h45', '05h40', '14h50',
       '09h30 / 15h30', '15h50', '09h40', '19h00', '07h20', '15h40',
       '11h05', '12h10', '06h45', '15h05', '09h35', '13h26', '19h30',
       '10h20', '04h00', '15h00 or 15h45', '19h00 / 20h00', '13h50',
       '07h00', '17h10', 'Before 10h30', '21h50', 'Shortly before 12h00',
       '03h30', '00

In [None]:
# Working with time column
for i in df.time:
  try:
    x = int(i[:1])
    if 00 <= x < 6:
      df.time.replace(i, 'Morning', inplace=True)
    elif 6 <= x < 12:
      df.time.replace(i, 'Afternoon', inplace=True)
    elif 12 <= x < 18:
      df.time.replace(i, 'Evening', inplace=True)
    elif 18 <= x < 24:
      df.time.replace(i, 'Night', inplace=True)
  except:
      try:
        if i in ["Early afternoon", "Late afternoon", "Lunchtime", "Shortly before 12h00","Midday", "P.M.","--","Just before sundown","X",">14h30","After noon","Noon","Daytime","Late afternon",">17h00","After lunch","Just after 12h00","Shortly before 13h00","Late Afternoon"]:
          df.time.replace(i, 'Afternoon', inplace=True)
        elif i in [">06h45", "Before 10h30", "Early morning", "Early Morning", "Late morning", "Between 05h00 and 08h00", "Between 06h00 & 07h20", "Before 07h00", "Between 11h00 & 12h00", "Mid-morning", "Just before 11h00", ">08h00", "Dawn", "Before 10h00"]:
          df.time.replace(i, 'Morning', inplace=True)
        elif i in ["Dusk", "dusk", "Sunset", "After dusk"]:
          df.time.replace(i, 'Evening', inplace=True)
        else:
          df.time.replace(i, "Night", inplace=True)
      except:
        pass
df.time.unique()

array(['Night', 'Afternoon', 'Evening', 'Morning'], dtype=object)

In [6]:
# Working with type column
print(df.type.unique())
df.type.value_counts()
for i in df.type:
  if i in ["Boat", "Sea Disaster"]:
    df.type.replace(i, "Unprovoked", inplace=True)
  elif i in ["Watercraft"]:
    df.type.replace(i, "Provoked", inplace=True)
  elif i in ["Questionable"]:
    df.type.replace(i, "Invalid", inplace=True)
df.type.value_counts()

['Unprovoked' 'Invalid' 'Watercraft' 'Provoked' 'Sea Disaster'
 'Questionable' 'Boat']


type
Unprovoked    1678
Provoked       259
Invalid        145
Name: count, dtype: int64

In [7]:
# Working with country column
df.country = df.country.str.title()
replacements = {
    "England": "United Kingdom",
    "Turks And Caicos": "United Kingdom",
    "Turks & Caicos": "United Kingdom",
    "Scotland": "United Kingdom",
    "Grand Cayman": "United Kingdom",
    "North Pacific Ocean": "Pacific Ocean",
    "Usa": "United States Of America",
    "Guam": "United States Of America",
    "American Samoa": "United States Of America",
    "Johnston Island": "United States Of America",
    "Columbia": "United States Of America",
    "Federated States Of Micronesia": "Micronesia",
    "New Caledonia": "France",
    "French Polynesia": "France",
    "Reunion": "France",
    "British West Indies": "Jamaica",
    "New Britain": "Papua New Guinea",
    "Mid Atlantic Ocean": "Atlantic Ocean",
    "St. Maartin": "St Martin",
    "Antigua": "Antigua And Barbuda",
    "Okinawa": "Japan",
    "Andaman / Nicobar Islandas": "India",
    "Egypt / Israel": "Israel",
    "Hong Kong" : "China"
}
df['country'] = df['country'].replace(replacements)
print(df.country.unique())
for i in df.country.value_counts():
  print(i)
df.country.value_counts()
df = df[df.country.map(df.country.value_counts()) >= 10]

['Australia' 'United States Of America' 'New Zealand' 'South Africa'
 'France' 'Bahamas' 'Jamaica' 'Mexico' 'Fiji' 'Taiwan' 'Bermuda'
 'Pacific Ocean' 'Papua New Guinea' 'Spain' 'Mozambique' 'Italy' 'Chile'
 'Greece' 'United Kingdom' 'Tonga' 'Vanuatu' 'Senegal' 'India' 'Maldives'
 'Madagascar' 'Brazil' 'Egypt' 'Atlantic Ocean' 'Japan' 'Seychelles'
 'Indonesia' 'Marshall Islands' 'Micronesia' 'Iraq' 'South Korea' 'Israel'
 'Venezuela' 'Mauritius' 'Palau' 'Ireland' 'Croatia' 'Argentina'
 'Thailand' 'St Martin' 'Antigua And Barbuda' 'Panama' 'China' 'Cuba'
 'Ecuador' 'Vietnam' 'Sudan' 'Solomon Islands' 'Belize' 'Nicaragua'
 'Portugal' 'Persian Gulf' 'Philippines' 'Haiti' 'Costa Rica'
 'El Salvador' 'Colombia' 'Russia']
969
469
272
60
43
43
21
18
16
12
11
10
10
8
8
6
5
5
5
4
4
4
4
4
4
4
4
4
3
3
3
3
3
2
2
2
2
2
2
2
2
2
2
2
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1


In [8]:
# Working with activity column
activity_mapping = {
    'Fishing/Hunting': ['fish', 'scallop', 'snork', 'shark', 'shrimp', 'spear', 'net', 'crab', 'gig'],
    'Swimming/Wading': ['swim', 'water', 'wade', 'bath', 'float'],
    'Surfing/Drowning': ['surf', 'splash', 'fell'],
    'Diving/Lifesaving': ['dive', 'life'],
    'Boating/Boarding': ['board', 'ship', 'boat', 'sail', 'row', 'anchor', 'kayak', 'paddle', 'jet', 'racing'],
    'Playing/Walking': ['play', 'stand', 'walking'],
    'Watching/Sitting': ['watch', 'sit', 'seeing'],
    'Feeding': ['feed'],
    'Washing': ['wash']
}

# Map activities based on keywords
for new_activity, keywords in activity_mapping.items():
    for i in df.activity:
        if any(keyword in i.lower() for keyword in keywords):
            df.activity.replace(i, new_activity, inplace=True)

# Remove rows with activities not matching any category
df = df[df.activity.isin(activity_mapping.keys())]

df.activity.unique()

array(['Boating/Boarding', 'Swimming/Wading', 'Fishing/Hunting',
       'Playing/Walking', 'Diving/Lifesaving', 'Watching/Sitting',
       'Washing'], dtype=object)

In [9]:
df = df[df.activity.map(df.activity.value_counts()) > 5]
df.activity.unique()
df.activity.value_counts()
len(df)

1742

In [10]:
# Working with fatal_y_n column
df.fatal_y_n.replace("F", "Y", inplace=True)
df.fatal_y_n.replace("N", 0, inplace=True)
df.fatal_y_n.replace("Y", 1, inplace=True)
df = df[(df.fatal_y_n != "UNKNOWN") & (df.fatal_y_n != '2017.0')]
df.fatal_y_n.unique()

array([1, 0], dtype=object)

In [11]:
# Working with species column
df.species = df.species.str.title()
print(df.species.unique())
species_mapping = {
    'white': 'White Shark',
    'tiger': 'Tiger Shark', 
    'bull': 'Bull Shark',
    'mako': 'Mako Shark',
    'blacktip': 'Blacktip Shark',
    'small': 'Small Shark',
    'wobbegong': 'Wobbegong Shark',
    'caribbean': 'Caribbean Shark',
    'juvenile': 'Juvenile Shark',
    'blue': 'Blue Shark',
    'lemon': 'Lemon Shark',
    'spinner': 'Spinner Shark',
    'dusky': 'Dusky Shark',
    'hammer': 'Hammerhead Shark', 
    'sand': 'Sand Shark',
    'leucas': 'Leucas Shark',
    'galapagos': 'Galapagos Shark',
    'nurse': 'Nurse Shark',
    'bronze': 'Bronze Shark',
    'grey': 'Grey Shark',
    'zambezi': 'Zambezi Shark',
    'zambesi': 'Zambezi Shark',
    'shovel': 'Shovelnose Shark',
    'limbatus': 'Limbatus Shark',
    'gummy': 'Gummy Shark',
    'banjo': 'Banjo Shark',
    'barracuda': 'Barracuda',
    'dolphin': 'Dolphin',
    'dog': 'Dogfish Shark',
    'epau': 'Epaulette Shark',
    'copper': 'Copper Shark', 
    'angel': 'Angel Shark',
    'thresher': 'Thresher Shark',
    'whaler': 'Whaler Shark',
    'salmon': 'Salmon Shark',
    'cow': 'Cow Shark',
    'stringray': 'Stringray',
    'reef': 'Reef Shark'
}

def map_cutter_tooth(species):
    if 'cutter' in species.lower() or 'tooth' in species.lower():
        return 'Cutter/Tooth Shark'
    return species
def map_seven_gill(species):
    if 'seven' in species.lower() or '7' in species.lower():
        return 'Seven Gill Shark'
    return species
for i in df.species:
    # First check special cases
    mapped = map_cutter_tooth(i)
    mapped = map_seven_gill(mapped)
    found_match = False
    for key, value in species_mapping.items():
        if key in mapped.lower():
            df.species.replace(i, value, inplace=True)
            found_match = True
            break
    if not found_match:
        df.species.replace(i, 'Unknown', inplace=True)

df.species.unique()

['White Shark' '1M Shark' 'Bull Shark' '5.5 Ft Shark'
 "Blacktip Shark, 6' To 7'" "Nurse Shark, 5'"
 'Reported As Shark Bite But Injury Caused By Stingray' "3' To 4' Shark"
 'White Shark, 2.5 M' 'A Small Shark' 'Wobbegong Shark' '3 M Shark'
 'White Shark, 3.5 M' "Tiger Shark, 13'" 'Bull Shark, 2.5 M'
 "8' To 10' Shark" "Caribbean Reef Shark, 7' To 8'" "White Shark, 10 '"
 "White Shark, 8' To 10'" "Tiger Shark, 14'" "2' To 3' Shark"
 "3.5' To 4' Shark" 'Nurse Shark' 'Tawny Nurse Shark, 40Cm'
 'Spinner Shark' "6' Shark" 'Bull Shark, 3M' "5' Shark"
 "8' White Shark Or 7-Gill Shark" '2 M Shark' "3' Shark" "6' To 8' Shark"
 "Tiger Shark, 5'" "6 M [20'] White Shark"
 "White Shark, 2.4 M To 3 M [8' To 10']" "Tiger Shark, 3 M [10']"
 "A 2' Shark Was Seen In The Area By Witnesses" 'Juvenile Shark'
 'Tiger Shark?' "1.8 M [6'] Shark, Possibly A Blacktip"
 "4 M To 5 M [13' To 16.5']  White Shark" "2.4 M [8'] Shark"
 "5 M To 6 M [16.5' To 20'] White Shark" "Tiger Shark, 3.7 M [12']"
 "Lemon Shark, 

array(['White Shark', 'Unknown', 'Bull Shark', 'Nurse Shark',
       'Small Shark', 'Wobbegong Shark', 'Tiger Shark', 'Spinner Shark',
       'Juvenile Shark', 'Blacktip Shark', 'Leucas Shark', 'Blue Shark',
       'Mako Shark', 'Dusky Shark', 'Hammerhead Shark', 'Bronze Shark',
       'Grey Shark', 'Shovelnose Shark', 'Galapagos Shark',
       'Caribbean Shark', 'Limbatus Shark', 'Lemon Shark', 'Sand Shark',
       'Zambezi Shark', 'Gummy Shark', 'Reef Shark', 'Banjo Shark',
       'Epaulette Shark', 'Angel Shark', 'Barracuda', 'Dogfish Shark',
       'Thresher Shark', 'Copper Shark', 'Whaler Shark', 'Salmon Shark',
       'Cow Shark'], dtype=object)

In [12]:
# Working with time_phase column
df.rename(columns={"time":"time_phase"}, inplace=True)
df.time_phase.replace("Morning", "1", inplace=True)
df.time_phase.replace("Afternoon", "2", inplace=True)
df.time_phase.replace("Evening", "3", inplace=True)
df.time_phase.replace("Night", "4", inplace=True)
df.tail(60)

Unnamed: 0,type,country,activity,fatal_y_n,time_phase,species
6563,Unprovoked,Mexico,Boating/Boarding,0,4,Unknown
6576,Unprovoked,South Africa,Boating/Boarding,1,4,Unknown
6590,Provoked,United States Of America,Fishing/Hunting,0,4,Mako Shark
6606,Unprovoked,Australia,Swimming/Wading,1,4,Unknown
6616,Unprovoked,South Africa,Swimming/Wading,1,4,Unknown
6618,Unprovoked,Australia,Swimming/Wading,0,4,Unknown
6628,Unprovoked,United States Of America,Fishing/Hunting,0,4,Unknown
6632,Provoked,United States Of America,Boating/Boarding,0,4,Unknown
6649,Unprovoked,France,Boating/Boarding,0,4,Grey Shark
6653,Unprovoked,United States Of America,Swimming/Wading,0,4,Bull Shark


In [13]:
# Got many errors here just to realize why we should use labelencoder instead of putting numbers replacing the values
y = df['fatal_y_n']
print(y.dtype)
print(y.unique())
df['fatal_y_n'] = pd.to_numeric(df['fatal_y_n'], errors='coerce')
y = df['fatal_y_n']
print(y.dtype)
print(y.unique())

object
[1 0]
int64
[1 0]


In [14]:
df = df.dropna(subset=['fatal_y_n'])
X = df.drop('fatal_y_n', axis=1)
y = df['fatal_y_n']
print(X.shape)
print(y.shape)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

categorical_features = ['country', 'activity', 'type', 'species']
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ],
    remainder='passthrough'
)

models = [
    {
        'name': 'Logistic Regression',
        'estimator': LogisticRegression(),
        'params': {
            'classifier__penalty': ['l2'],
            'classifier__C': [0.1, 1, 10],
            'classifier__solver': ['lbfgs', 'saga'],
            'classifier__class_weight': ['balanced', None]
        }
    },
    {
        'name': 'Random Forest',
        'estimator': RandomForestClassifier(),
        'params': {
            'classifier__n_estimators': [100, 200],
            'classifier__max_depth': [None, 10],
            'classifier__min_samples_split': [2, 5],
            'classifier__class_weight': ['balanced', 'balanced_subsample']
        }
    },
    {
        'name': 'XGBoost',
        'estimator': XGBClassifier(use_label_encoder=False, eval_metric='logloss'),
        'params': {
            'classifier__learning_rate': [0.01, 0.1],
            'classifier__max_depth': [3, 6],
            'classifier__n_estimators': [100, 200],
            'classifier__scale_pos_weight': [1, sum(y_train == 0)/sum(y_train == 1)]
        }
    },
    {
        'name': 'Gradient Boosting',
        'estimator': GradientBoostingClassifier(),
        'params': {
            'classifier__n_estimators': [100, 200],
            'classifier__learning_rate': [0.05, 0.1],
            'classifier__max_depth': [3, 5]
        }
    }
]

results = []

for model in models:
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('classifier', model['estimator'])
    ])

    grid = GridSearchCV(
        estimator=pipeline,
        param_grid=model['params'],
        scoring='roc_auc',
        cv=5,
        n_jobs=-1,
        verbose=0
    )

    grid.fit(X_train, y_train)

    best_params = grid.best_params_
    best_score = grid.best_score_
    test_score = grid.score(X_test, y_test)
    y_pred = grid.predict(X_test)

    results.append({
        'model': model['name'],
        'best_params': best_params,
        'train_auc': best_score,
        'test_auc': test_score,
        'classification_report': classification_report(y_test, y_pred)
    })

results_df = pd.DataFrame(results)
print("Model Comparison:\n")
print(results_df[['model', 'train_auc', 'test_auc']].to_string(index=False))
best_model = results_df.loc[results_df['test_auc'].idxmax()]
print("\n\nBest Model:", best_model['model'])
print("Test AUC:", best_model['test_auc'])
print("Classification Report:\n", best_model['classification_report'])
print("Best Parameters:\n", best_model['best_params'])

(1638, 5)
(1638,)
Model Comparison:

              model  train_auc  test_auc
Logistic Regression   0.827789  0.803062
      Random Forest   0.820803  0.810993
            XGBoost   0.822025  0.802277
  Gradient Boosting   0.817500  0.811896


Best Model: Gradient Boosting
Test AUC: 0.8118963486454652
Classification Report:
               precision    recall  f1-score   support

           0       0.88      0.96      0.92       283
           1       0.43      0.20      0.27        45

    accuracy                           0.85       328
   macro avg       0.66      0.58      0.60       328
weighted avg       0.82      0.85      0.83       328

Best Parameters:
 {'classifier__learning_rate': 0.05, 'classifier__max_depth': 5, 'classifier__n_estimators': 100}


In [15]:
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(
        penalty='l2',
        C=1,
        solver='saga',
        class_weight='balanced'
    ))
])
pipeline.fit(X_train, y_train)

In [16]:
joblib.dump(pipeline, 'Global_Shark_Attack.joblib')
with open('Global_Shark_Attack.pkl', 'wb') as f:
    pickle.dump(pipeline, f)