Cell 1: Import Libraries


In [None]:
import os
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

Cell 2: Load Data

In [None]:
def load_data(url):
    """Read the Excel workbook at ``url`` into a DataFrame.

    Args:
        url: Path or URL accepted by ``pd.read_excel``.

    Returns:
        A copy of the frame produced by ``pd.read_excel``.
    """
    return pd.read_excel(url).copy()

url = "https://www.sharkattackfile.net/spreadsheets/GSAF5.xls"
df = load_data(url)

Cell 3: Get First Impression of Dataset

In [None]:
def get_first_impression(df, n=3):
    """Return the leading ``n`` rows of ``df`` for a quick visual check.

    Args:
        df: Frame to preview.
        n: Number of rows to show (defaults to 3).
    """
    preview = df.head(n)
    return preview

print(get_first_impression(df))

Cell 4: Dataset Shape

def get_shape(df):
    """Return the ``shape`` attribute of ``df`` (rows, columns for a DataFrame)."""
    dimensions = df.shape
    return dimensions

print(get_shape(df))

Cell 5: Clean Column Names

In [1]:
def clean_column_names(df):
    """Return ``df`` with stray whitespace removed from its column labels.

    The raw sheet ships a ``"Species "`` header with a trailing space.
    Stripping every string label fixes that header and any other header
    with the same problem, instead of special-casing a single name.
    Non-string labels are left untouched, and the input frame is not
    modified.

    Args:
        df: Frame whose column labels should be cleaned.

    Returns:
        A new DataFrame with cleaned column labels.
    """
    def strip_label(label):
        return label.strip() if isinstance(label, str) else label

    return df.rename(columns=strip_label)

df = clean_column_names(df)
print(df.columns)

NameError: name 'df' is not defined

Cell 6: Select Relevant Columns

In [2]:
def select_columns(df, columns):
    """Return the part of ``df`` selected by ``df[columns]``.

    Args:
        df: Source frame.
        columns: Column selector passed straight to ``df[...]``.
    """
    subset = df[columns]
    return subset

columns = ["Date", "Year", "Country", "Location", "Injury", "Species", "Activity"]
df = select_columns(df, columns)

NameError: name 'df' is not defined

Cell 7: Drop Rows with All NaN Values


In [None]:
def drop_all_nan_rows(df):
    """Return ``df`` without the rows in which every value is missing."""
    return df.dropna(axis=0, how='all')

df = drop_all_nan_rows(df)
print(get_shape(df))

Cell 8: Drop Rows with NaN Year


In [None]:
def drop_nan_year(df):
    """Return ``df`` without the rows whose ``Year`` value is missing."""
    required_columns = ['Year']
    return df.dropna(subset=required_columns)

df = drop_nan_year(df)
print(get_first_impression(df))

Cell 9: Correct Year Format


In [None]:
def correct_year_format(df):
    """Return a copy of ``df`` whose ``Year`` column is cast to ``int``.

    The cast is applied through ``DataFrame.assign`` so the caller's frame
    is left untouched. Assigning into the argument would silently change
    the caller's data and can trigger ``SettingWithCopyWarning`` when
    ``df`` is itself a slice of another frame.

    Args:
        df: Frame with a ``Year`` column that contains no missing values.

    Returns:
        A new DataFrame with an integer ``Year`` column.

    Raises:
        ValueError: Propagated from ``astype(int)`` when ``Year`` holds
            values that cannot be converted (e.g. NaN).
    """
    return df.assign(Year=df['Year'].astype(int))

df = correct_year_format(df)
print(get_first_impression(df))

Cell 10: Filter Data for Last 10 Years


In [None]:
def filter_last_10_years(df, start_year=2015):
    """Keep only the rows whose ``Year`` is ``start_year`` or later.

    The cutoff used to be hard-coded to 2015. It is now a parameter (with
    the same default) so the window can be moved without editing the
    function.

    Args:
        df: Frame with a numeric ``Year`` column.
        start_year: First year (inclusive) to keep. Defaults to 2015.

    Returns:
        The rows of ``df`` with ``Year >= start_year``.
    """
    recent_mask = df['Year'] >= start_year
    return df[recent_mask]

df = filter_last_10_years(df)

Cell 11: Incident Counts by Year


In [None]:
def incident_counts_by_year(df):
    """Tally the rows of ``df`` per ``Year`` value, largest count first."""
    yearly_tally = df['Year'].value_counts()
    return yearly_tally

print(incident_counts_by_year(df))

Cell 12: Incident Counts by Country


In [None]:
def incident_counts_by_country(df, top_n=5):
    """Return the ``top_n`` most frequent ``Country`` values with their counts.

    Args:
        df: Frame with a ``Country`` column.
        top_n: Number of leading entries to keep (defaults to 5).
    """
    country_ranking = df['Country'].value_counts()
    return country_ranking.head(top_n)

print(incident_counts_by_country(df))

Cell 13: Incident Counts by Location


In [None]:
def incident_counts_by_location(df, top_n=5):
    """Return the ``top_n`` most frequent ``Location`` values with their counts.

    Args:
        df: Frame with a ``Location`` column.
        top_n: Number of leading entries to keep (defaults to 5).
    """
    location_ranking = df['Location'].value_counts()
    return location_ranking.head(top_n)

print(incident_counts_by_location(df))

Cell 14: Explore Injury Types


In [None]:
def explore_injury_types(df, top_n=10):
    """Return the ``top_n`` most frequent ``Injury`` descriptions with their counts.

    Args:
        df: Frame with an ``Injury`` column.
        top_n: Number of leading entries to keep (defaults to 10).
    """
    injury_ranking = df['Injury'].value_counts()
    return injury_ranking.head(top_n)

print(explore_injury_types(df))

Cell 15: Unique Injury Types


In [None]:
def unique_injury_types(df):
    """Count the distinct non-missing values in the ``Injury`` column."""
    distinct_injuries = df['Injury'].dropna().unique()
    return len(distinct_injuries)

print(unique_injury_types(df))

Cell 16: Dataset Information


In [None]:
def dataset_info(df):
    """Print the ``DataFrame.info()`` summary of ``df``.

    The summary itself goes to standard output; the value handed back is
    whatever ``df.info()`` returns.
    """
    summary = df.info()
    return summary

dataset_info(df)

Cell 17: Missing Values


In [None]:
def missing_values(df):
    """Return, per column of ``df``, the number of missing entries."""
    missing_mask = df.isna()
    return missing_mask.sum()

print(missing_values(df))

Cell 18: Group by Location and Country


In [None]:
def group_by_location_country(df, countries=('USA', 'AUSTRALIA')):
    """Cross-tabulate incident counts by ``Location`` and ``Country``.

    Only rows whose ``Country`` is one of ``countries`` are counted. The
    result always has one column per requested country (alphabetical
    order, which is what ``unstack`` produces on its own). A country with
    no matching rows gets a column of zeros rather than being absent, so
    callers can index ``result[country]`` without a ``KeyError``.

    Args:
        df: Frame with ``Country`` and ``Location`` columns.
        countries: Country names to keep. Defaults to USA and AUSTRALIA.
            A single string is treated as one country.

    Returns:
        DataFrame indexed by location, one integer column per country.
    """
    if isinstance(countries, str):
        countries = [countries]
    wanted = sorted(set(countries))

    filtered_countries = df[df['Country'].isin(wanted)]
    grouped_location = (
        filtered_countries
        .groupby(['Location', 'Country'])
        .size()
        .unstack(fill_value=0)
    )
    # Guarantee a column for every requested country, even with no incidents.
    return grouped_location.reindex(columns=wanted, fill_value=0)

grouped_location = group_by_location_country(df)
print(grouped_location)

Cell 19: Pie Chart of Incidents


In [None]:
def plot_pie_chart(grouped_location):
    """Draw a pie chart comparing total incidents in the USA and Australia.

    Args:
        grouped_location: Table with ``"USA"`` and ``"AUSTRALIA"`` columns
            whose values are summed to size the two slices.
    """
    totals = {
        country: grouped_location[country].sum()
        for country in ("AUSTRALIA", "USA")
    }
    slice_labels = ['USA', 'AUSTRALIA']
    slice_sizes = [totals[country] for country in slice_labels]

    plt.figure(figsize=(10, 5))
    plt.pie(slice_sizes, labels=slice_labels, autopct='%1.1f%%', startangle=90)
    plt.title('Shark Attacks in USA and Australia')
    plt.show()

plot_pie_chart(grouped_location)

Cell 20: Filter Incidents in USA


In [None]:
def filter_incidents_usa(df):
    """Return an independent copy of the rows of ``df`` whose ``Country`` is ``'USA'``."""
    is_usa = df['Country'].eq('USA')
    return df[is_usa].copy()

df_usa = filter_incidents_usa(df)
print(get_first_impression(df_usa))

Cell 21: Species Count in USA


In [None]:
def species_count_usa(df_usa, top_n=10):
    """Return the ``top_n`` most frequent ``Species`` values with their counts.

    Args:
        df_usa: Frame with a ``Species`` column.
        top_n: Number of leading entries to keep (defaults to 10).
    """
    species_ranking = df_usa['Species'].value_counts()
    return species_ranking.head(top_n)

print(species_count_usa(df_usa))

Cell 22: Format Date Column


In [None]:
def format_date_column(df_usa):
    """Convert the ``Date`` column of ``df_usa`` to pandas datetimes.

    The column is overwritten on the frame that was passed in, and that same
    frame is returned. Both ``pd.to_datetime`` calls use ``errors='coerce'``,
    so values that cannot be parsed end up as ``NaT``.

    Args:
        df_usa: Frame with a ``Date`` column; every value is stringified
            before cleaning.

    Returns:
        ``df_usa`` with ``Date`` replaced by the parsed datetimes.
    """
    # Stringify each value and remove hyphens, whitespace and commas.
    df_usa['Date'] = df_usa['Date'].apply(lambda x: re.sub(r'[-\s,]', '', str(x)))
    # Append a hyphen after every run of four digits.
    # NOTE(review): presumably meant to set the year apart for the parser, but
    # the pattern matches any four consecutive digits - confirm against the
    # raw date formats.
    df_usa['Date'] = df_usa['Date'].apply(lambda x: re.sub(r'(\d{4})', r'\1-', str(x)))
    df_usa['Date'] = pd.to_datetime(df_usa['Date'], errors='coerce')
    # Round-trip through a 'dd-mm-yy' string. The format has no time fields
    # and '%y' is a two-digit year, so time-of-day and century are not carried
    # through.
    # NOTE(review): confirm this is intended for dates outside the window the
    # two-digit-year parser maps back to.
    df_usa['Date'] = df_usa['Date'].dt.strftime('%d-%m-%y')
    df_usa['Date'] = pd.to_datetime(df_usa['Date'], format='%d-%m-%y', errors='coerce')
    return df_usa

df_usa = format_date_column(df_usa)
print(get_first_impression(df_usa))

Cell 23: Add Season Column


In [None]:
def add_season_column(df_usa):
    """Return a copy of ``df_usa`` with a ``Season`` column derived from ``Date``.

    Months map to meteorological seasons (Dec-Feb Winter, Mar-May Spring,
    Jun-Aug Summer, Sep-Nov Fall). Missing dates are labelled ``'Unknown'``.
    The new column is attached with ``DataFrame.assign`` so the caller's
    frame is not modified.

    Args:
        df_usa: Frame with a ``Date`` column of datetimes (or values that
            ``pd.to_datetime`` can parse) and/or missing values.

    Returns:
        A new DataFrame with the additional ``Season`` column.
    """
    # A lookup table covers all twelve months, so there is no fall-through
    # branch that could silently yield None.
    month_to_season = {
        12: 'Winter', 1: 'Winter', 2: 'Winter',
        3: 'Spring', 4: 'Spring', 5: 'Spring',
        6: 'Summer', 7: 'Summer', 8: 'Summer',
        9: 'Fall', 10: 'Fall', 11: 'Fall',
    }

    def get_season(date):
        if pd.isna(date):
            return 'Unknown'
        return month_to_season[pd.to_datetime(date).month]

    return df_usa.assign(Season=df_usa['Date'].apply(get_season))

df_usa = add_season_column(df_usa)
print(get_first_impression(df_usa))

Cell 24: Group by Season and Year


In [None]:
def group_by_season_year(df_usa):
    """Count incidents per ``Season`` and ``Year`` and append a row total.

    The result has one row per season, one column per year and a final
    ``'Total sum'`` column. The ``'Unknown'`` season row is removed when
    present. When every date was parsed there is no such row, and dropping
    it unconditionally would raise ``KeyError``.

    Args:
        df_usa: Frame with ``Season`` and ``Year`` columns.

    Returns:
        DataFrame indexed by season with per-year counts and ``'Total sum'``.
    """
    grouped_season = df_usa.groupby(['Season', 'Year']).size().unstack(fill_value=0)
    grouped_season['Total sum'] = grouped_season.sum(axis=1)
    # errors='ignore' keeps this a no-op when no 'Unknown' row exists.
    return grouped_season.drop('Unknown', errors='ignore')

grouped_season = group_by_season_year(df_usa)
print(grouped_season)

Cell 25: Plot Incidents by Season


In [None]:
def plot_incidents_by_season(grouped_season):
    """Draw a bar chart of the ``'Total sum'`` column, one bar per season.

    Args:
        grouped_season: Table indexed by season that contains a
            ``'Total sum'`` column.
    """
    season_totals = grouped_season[['Total sum']]
    season_totals.plot.bar(stacked=True, figsize=(10, 5))

    plt.title('Shark Attacks in the USA by Season')
    plt.xlabel('Season')
    plt.ylabel('Number of Attacks')
    plt.xticks(rotation=0)
    plt.legend(loc='upper right')
    plt.show()

plot_incidents_by_season(grouped_season)

Cell 26: Standardize Species Names


In [None]:
def standardize_species_names(df_usa, top_n=7):
    """Return a copy of ``df_usa`` with ``Species`` collapsed to canonical names.

    Steps:
      1. Missing species become ``"Not Specified"``.
      2. Any value containing a known species name (case-insensitive) is
         replaced by that species' canonical spelling; the first matching
         entry of the lookup table wins.
      3. Every value outside the ``top_n`` most frequent names becomes
         ``"Others"``.

    The caller's frame is not modified. This also avoids the chained
    ``fillna(..., inplace=True)`` pattern, which pandas deprecates.

    Args:
        df_usa: Frame with a ``Species`` column.
        top_n: How many of the most frequent names to keep (defaults to 7).

    Returns:
        A new DataFrame with the cleaned ``Species`` column.
    """
    # Priority order matters. "sand tiger shark" must come before
    # "tiger shark", otherwise every sand tiger record matches the shorter
    # name and is mislabelled as a Tiger Shark.
    species_replacements = {
        "white shark": "White Shark",
        "sand tiger shark": "Sand Tiger Shark",
        "tiger shark": "Tiger Shark",
        "bull shark": "Bull Shark",
        "nurse shark": "Nurse Shark",
        "blacktip shark": "Blacktip Shark",
        "hammerhead shark": "Hammerhead Shark",
        "mako shark": "Mako Shark",
        "lemon shark": "Lemon Shark",
        "blue shark": "Blue Shark",
        "spinner shark": "Spinner Shark",
        "dusky shark": "Dusky Shark",
        "caribbean reef shark": "Caribbean Reef Shark",
        "galapagos shark": "Galapagos Shark",
        "zambesi shark": "Zambesi Shark",
    }

    def canonical_name(raw):
        # Single pass: an already-canonical value is never re-matched
        # against a later, shorter key.
        lowered = str(raw).lower()
        for fragment, canonical in species_replacements.items():
            if fragment in lowered:
                return canonical
        return raw

    species = df_usa['Species'].fillna("Not Specified").apply(canonical_name)

    common_species = set(species.value_counts().head(top_n).index)
    species = species.apply(
        lambda name: name if str(name) in common_species else "Others"
    )

    return df_usa.assign(Species=species)

df_usa = standardize_species_names(df_usa)

Cell 27: Plot Shark Attacks by Species


In [None]:
def plot_shark_attacks_by_species(df_usa):
    """Draw a horizontal count plot of ``Species``, ordered by frequency.

    Args:
        df_usa: Frame with a ``Species`` column.
    """
    species_order = df_usa["Species"].value_counts().index
    ax = sns.countplot(data=df_usa, y="Species", order=species_order)
    ax.set_title('Shark Attacks by Species')
    plt.show()

plot_shark_attacks_by_species(df_usa)

Cell 28: Final Data Check


In [None]:
def final_data_check(df_usa):
    """Print a last sanity check of ``df_usa`` and its ``Species`` column.

    Printed, in order: the first rows of the frame, the ten most frequent
    species, the number of missing species values, and the full species
    frequency table.
    """
    species = df_usa['Species']
    species_counts = species.value_counts()

    print(df_usa.head())
    print(species_counts.head(10))
    print(species.isna().sum())
    print(species_counts)

final_data_check(df_usa)

# Conclusion
In this project, we performed data cleaning, exploration, and visualization to understand shark attack incidents. We identified the most common species involved, analyzed the data by year and location, and created visualizations to represent the findings.