<a href="https://colab.research.google.com/github/MevrouwHelderder/Assignments/blob/main/Assignment_Shark_Attack.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%load_ext google.colab.data_table

In [441]:
# Importing the essentials
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re

# Load the raw records from the CSV export (the file is ISO-8859-1 encoded).
path = "/content/drive/MyDrive/attacks.csv"
attacks = pd.read_csv(path, encoding="ISO-8859-1")

# Columns that play no role in the rest of this notebook.
# The names are copied verbatim from the CSV header, including the trailing
# space in "Sex ".
columns_to_drop = [
    "Date", "Year", "Country", "Area", "Location", "Name", "Sex ", "Time",
    "Investigator or Source", "pdf", "href formula", "href",
    "Case Number.1", "Case Number.2", "original order",
    "Unnamed: 22", "Unnamed: 23",
]
attacks_drop_columns = attacks.drop(columns=columns_to_drop)

# Shorter, whitespace-free column names for the columns that remain.
# NOTE(review): rename() silently ignores keys that are not present, so the
# "Ages" entry only has an effect if such a column exists - confirm the name.
column_renames = {
    "Case Number": "Case",
    "Fatal (Y/N)": "Outcome",
    "Species ": "Species",
    "Ages": "Age",
}
attacks_renamed = attacks_drop_columns.rename(columns=column_renames)

# Drop rows in which every column except the first one is empty.
# The explicit .copy() prevents view-vs-copy issues when the frame is modified later.
relevant_columns = attacks_renamed.columns[1:].tolist()
attacks_drop_rows = attacks_renamed.dropna(
    subset=relevant_columns,
    how="all",
).copy()
# Preparing the functions for locating and adjusting the missing values.
def print_separator(sep, num, msg):
    """Print ``msg`` framed above and below by a rule made of ``sep`` repeated ``num`` times.

    Two empty lines are written first so that consecutive sections stay apart.
    """
    # print("\n") emits the literal newline plus print's own line terminator.
    print("\n")
    rule = sep * num
    for line in (rule, f"{msg}", rule):
        print(line)


# TACTIC A: find unique values
def look_at_unique_values(column):
    """Report on the distinct values of ``column``.

    Three cases are distinguished:

    * every value is distinct: only the count is printed;
    * fewer than 50 distinct values: the values are displayed, sorted when
      they can be compared with each other, unsorted otherwise;
    * anything else: only the count is printed.

    Parameters
    ----------
    column : pandas.Series
        The column to inspect.

    Returns
    -------
    None
        Everything is written to stdout / the notebook display.
    """
    unique_values_cutoff = 50
    unique_values = column.unique()
    num_unique_values = len(unique_values)
    if num_unique_values == len(column):
        print(f"Each value in the column is unique (total: {num_unique_values})")
    elif num_unique_values < unique_values_cutoff:
        print(f"Less than {unique_values_cutoff} unique values:")
        try:
            # Do not call the result `sorted`: that would shadow the builtin.
            sorted_values = np.sort(unique_values)
        except TypeError:
            # Mixed types (e.g. str and NaN) cannot be ordered. Only this
            # expected failure is handled; anything else should surface.
            print("Could not sort values")
            display(list(unique_values))
        else:
            print("Values are sorted")
            display(list(sorted_values))
    else:
        print(
            f"More than {unique_values_cutoff} unique values (total: {num_unique_values})"
        )


# TACTIC B: look at the edges
def look_at_edges(df, column_name):
    """Display the first and last 10 distinct values of ``df[column_name]``.

    The distinct values are sorted first. When sorting fails because missing
    values are mixed with other types, the missing values are filtered out and
    sorting is retried. If the remaining values still cannot be ordered
    (e.g. strings mixed with numbers), they are shown unsorted instead of
    aborting the whole report.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that holds the column.
    column_name : str
        Name of the column to inspect.

    Returns
    -------
    None
        Everything is written to stdout / the notebook display.
    """

    def show_head_and_tail(values):
        # Both slices are shown even when they overlap (short inputs).
        num_items_to_slice = 10
        as_list = list(values)
        display(as_list[:num_items_to_slice])
        display(as_list[-num_items_to_slice:])

    column = df[column_name]
    unique_values = column.unique()
    try:
        # Named `sorted_values` so the builtin `sorted` is not shadowed.
        sorted_values = np.sort(unique_values)
        print("Unique values sorted, head and tail:")
        show_head_and_tail(sorted_values)
    except TypeError as error:
        print(f"Could not sort values: {error}")
        print("..so let's try filtering NULL values and then sorting")
        print("..there could be a black sheep in the null values")
        non_null_uniques = df.loc[~df[column_name].isnull(), column_name].unique()
        try:
            show_head_and_tail(np.sort(non_null_uniques))
        except TypeError as second_error:
            # The non-null values themselves are of mixed, unorderable types.
            print(f"Still could not sort values: {second_error}")
            print("..showing the unsorted non-null values instead")
            show_head_and_tail(non_null_uniques)


# TACTIC C: casting to a type to see if all the values match the needed type
def cast_to_type(column, maybe_type):
    """Try to cast ``column`` to ``maybe_type`` and report whether that worked.

    The converted data is discarded; the cast only serves as a check that
    every value is compatible with the requested type.

    Parameters
    ----------
    column : pandas.Series
        The column to test.
    maybe_type : str or type
        Target dtype, passed straight to ``Series.astype``.

    Returns
    -------
    None
        The outcome is printed.
    """
    try:
        column.astype(maybe_type)
    except (ValueError, TypeError) as error:
        # astype raises ValueError for unparsable values and TypeError for
        # unsupported conversions or unknown dtypes; both mean "does not fit".
        print(f"Could not cast to {maybe_type}: {error}")
    else:
        print(f"Casting to {maybe_type} was successful")


# TACTIC D: display the value count of the column
def value_count(column):
    """Display how often each value occurs in ``column``.

    Missing values are not dropped, so they get their own entry in the result.
    """
    frequencies = column.value_counts(dropna=False)
    display(frequencies)


# FUNCTION TO CHECK THE DATAFRAME FOR ALL FOUR TACTICS
def find_non_default_missing_values(df, column_name, maybe_type):
    """Run the four inspection tactics (A-D) on one column and print a report.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that holds the column.
    column_name : str
        Name of the column to inspect.
    maybe_type : str or type
        Type the column is expected to be castable to (used by tactic C).
    """
    banner_width = 80
    section_width = 40

    def section(title):
        # Every tactic gets the same dashed sub-header.
        print_separator("-", section_width, title)

    # Report header
    print_separator(
        "*",
        banner_width,
        f'Finding non default missing values for column "{column_name}"',
    )
    print(f'Column "{column_name}" has datatype: {df.dtypes[column_name]}')
    column = df[column_name]

    # A
    section("A: Looking at unique values")
    look_at_unique_values(column)

    # B
    section("B: Sorting and looking at the edges")
    look_at_edges(df, column_name)

    # C
    section(f"C: Casting to type: {maybe_type}")
    cast_to_type(column, maybe_type)

    # D
    section(
        "D: Looking at frequency\nAll default-NULL values will be bunched together as NaN"
    )
    value_count(column)

    print("\n")


# Function to replace non-default NULL values with default NULL values.
# ⚠️ Mutates df
def replace_value(df, column_name, missing_old, missing_new):
    """Replace ``missing_old`` by ``missing_new`` in ``df[column_name]``.

    The column of ``df`` is overwritten; nothing is returned.
    """
    substitution = {missing_old: missing_new}
    updated_column = df[column_name].replace(substitution)
    df[column_name] = updated_column


# Function to display the default NULL values in the column.
def display_default_null_values(df, column_name):
    """Print how many rows of ``df`` have a default null value in ``column_name``."""
    is_missing = df[column_name].isnull()
    rows_with_null = df.loc[is_missing]
    null_count = len(rows_with_null)
    print(f'Number of default null values in "{column_name}": {null_count}')


# Easier to type
nat = np.datetime64("nat")


In [442]:
def _normalise_text(value):
    """Lower-case a string and trim surrounding spaces and double quotes.

    Anything that is not a string is passed through unchanged.
    """
    if not isinstance(value, str):
        return value
    return value.lower().strip('" ')


# Work on a deep copy so the frame produced by the previous step stays untouched,
# then normalise every string cell in a single pass.
attacks_clean = attacks_drop_rows.copy(deep=True).applymap(_normalise_text)

# Column: Species

First, let me apologize for the amount of work done here that is probably not all technically needed for this assignment ;-)
I had a blast cleaning up this column and used it to practice a lot of new skills.
Also: I recognise that in real life it would almost always be a waste of time to refine values that occur only a few times, but I appreciated the practice ;-)

In [None]:
# Checking how everthing looks and what stands out: 

find_non_default_missing_values(attacks_clean, 'Species', 'string')

In [443]:
# First steps cleaning up:
# Goal:
# extract the species from the string where possible, change null values
# where needed


def tidy(x):
    """Reduce a raw species description to "<word> shark" where possible.

    Returns
    -------
    None
        For missing values and strings that are empty after stripping.
    str
        The word directly in front of "shark" (if there is one) plus "shark"
        when the text mentions a shark; otherwise the original text prefixed
        with "check: " so it can be reviewed by hand later.
    """
    if pd.isna(x) or not x.strip():
        return None
    if "shark" not in x:
        return f"check: {x}"
    # Optional preceding word (with any whitespace after it) followed by "shark".
    return re.search(r"(\S+\s*)?shark", x).group()


attacks_clean["Species"] = attacks_clean["Species"].apply(tidy)

# Words that mark a value as "no species confirmed".
no_species = [
    "invalid", "unidentified", "questionable", "possibly", "not confirmed",
    "unconfirmed", "doubtful", "captive", "unknown", "several", "colored",
]

# A length in inches/feet in front of "shark":
# digits, one or more ' or ", any number of ], then shark (whitespace optional).
inches = r'\d+\s*(["\']{1,})\s*\]*\s*shark'

# The whole value is at most two letters/digits/hyphens
# followed by shark (whitespace optional).
small_string = r"^[a-z0-9-]{0,2}\s*shark$"

# A unit (kg, lb or foot), any number of ], then shark (whitespace optional).
measurements = r"(kg|lb|foot)\s*\]*\s*shark"


def tidy_more(x):
    """Turn values that do not name a species into ``None``.

    A value is discarded when it contains one of the ``no_species`` words or
    matches one of the ``inches``, ``small_string`` or ``measurements``
    patterns. All other values, including ``None``, are returned unchanged.
    """
    if x is None:
        return x
    if any(word in x for word in no_species):
        return None
    for pattern in (inches, small_string, measurements):
        if re.search(pattern, x):
            return None
    return x


attacks_clean["Species"] = attacks_clean["Species"].apply(tidy_more)

# Removing quotation marks from the species names.
# NOTE(review): the original note also mentions look-alike quotation characters,
# but the pattern below only matches the plain double quote - confirm whether
# other characters were meant to be part of it.
def remove_weirdos(x):
    """Return ``x`` without double-quote characters; ``None`` is passed through."""
    return None if x is None else re.sub(r'["]+', "", x)

attacks_clean["Species"] = attacks_clean["Species"].apply(remove_weirdos)

# Checking all values that I previously marked as 'check'
mask_species = attacks_clean["Species"].str.contains("check", na=False)
attacks_clean[mask_species]

# Species names that can be recovered from the values marked with "check".
# The order matters: the first name found in a value wins.
correct_species = [
    "blue pointer",
    "wobbegong",
    "whaler",
    "hammerhead",
    "porbeagle",
    "whitetip",
    "horn",
]


def correct_checks(x):
    """Resolve a value that was marked with "check".

    Values without the marker (and ``None``) are returned unchanged. A marked
    value becomes "<name> shark" for the first entry of ``correct_species``
    it contains, or ``None`` when it contains none of them.
    """
    if x is None or "check" not in x:
        return x
    recognised = next((name for name in correct_species if name in x), None)
    if recognised is None:
        return None
    return f"{recognised} shark"

attacks_clean["Species"] = attacks_clean["Species"].apply(lambda x: correct_checks(x))


# Last crumbs to clean up:
# attacks_clean['Species'].value_counts().head(50)
# attacks_clean['Species'].value_counts().tail(50)

# Extracted "<word> shark" values in which the word is a generic descriptor
# (size, colour, count, filler word) rather than a species name.
useless = [
    "large shark",
    "female shark",
    "grey shark",
    "two shark",
    "the shark",
    "from shark",
    "little shark",
    "larger shark",
    "red shark",
    "young shark",
    "for shark",
    "metre shark",
    "juvenile shark",
    "gray shark",
    "finned shark",
]
# dict.fromkeys maps every listed label to None, so each of them is replaced by None.
attacks_clean["Species"] = attacks_clean["Species"].replace(
    dict.fromkeys(useless, None)
)

# Merge spelling variants into one label per species.
# Some keys contain a double space; they are matched exactly as written.
attacks_clean["Species"] = attacks_clean["Species"].replace(
    dict.fromkeys(
        ["seven-gill shark", "7-gill shark", "sevengill  shark"], "sevengill shark"
    )
)
attacks_clean["Species"] = attacks_clean["Species"].replace(
    dict.fromkeys(["black-tipped shark", "blacktip  shark"], "blacktip shark")
)
# One-to-one corrections.
attacks_clean["Species"] = attacks_clean["Species"].replace(
    {"sand shark": "sandshark", "zambesi shark": "zambezi shark"}
)

# Those last values are probably useless but since there is no way to definitively know if they represent a real species or not I'll leave them as is for now.
# attacks_clean['Species'].value_counts().head(50)
# attacks_clean['Species'].value_counts().tail(50)

# Column: Case

In [None]:
# Checking how everything looks and what stands out: 
find_non_default_missing_values(attacks_clean, "Case", "string")

In [None]:
# There are some duplicate values. 
# Checking what they mean: 

# attacks_clean[attacks_clean.duplicated('Case', keep=False)]

# They might indicate cases where more than one person was attacked. 
# The columns regarding location and time might help here if needed, but since 
# there are few of them and they seem irrelevant to our questions I will leave them for now

# Column: Type

In [None]:
# Checking how everything looks and what stands out: 
find_non_default_missing_values(attacks_clean, "Type", "string")

In [444]:
# Replace a few different values that all mean the same plus replace np.nan.
attacks_clean["Type"] = attacks_clean["Type"].replace({"boating" : "boat", "boatomg" : "boat", np.nan: None})


# Column: Activity

In [None]:
# Column Activity
# Checking how everthing looks and what stands out: 
find_non_default_missing_values(attacks_clean, "Activity", "string")

In [445]:
# Replacing the some things that stand out right away
attacks_clean["Activity"] = attacks_clean["Activity"].replace({".": None, np.nan: None})

In [None]:
# Checking all non-unique values: 
# attacks_clean[attacks_clean.duplicated("Activity", keep=False)]

# Looking at the head and tail of the column to see what stands out:
# attacks_clean["Activity"].value_counts().head(60)
# attacks_clean["Activity"].value_counts().tail(50)

In [446]:
# Combining different spellings of the same value
# NOTE(review): "paddle boarding" / "paddle-boarding" are folded into
# "bodyboarding" here as well; they look like a different activity rather
# than a spelling variant - confirm that this merge is intended.
attacks_clean["Activity"] = attacks_clean["Activity"].replace(
    dict.fromkeys(["boogie boarding", "paddle boarding", "body-boarding", "body boarding", "paddle-boarding"], "bodyboarding")
)

# Taking the top 60 values as categories and dividing the other values, where possible,
# into those categories.

# value_counts() orders by descending frequency, so this list is ranked from
# the most to the least common activity.
top_activities = list(attacks_clean["Activity"].value_counts().head(60).index)

def activities_checks(x, categories=None):
    """Map an activity description onto one of the known categories.

    Matching rules, in order:

    1. A value that already is a category is kept as it is. Without this
       check a category such as "spearfishing" would be rewritten to a shorter
       category it happens to contain ("fishing").
    2. Otherwise the longest category contained in the value is returned, so
       "scuba diving for abalone" lands in "scuba diving" rather than "diving".
       On equal length the category that comes first in ``categories`` wins.
    3. Values that contain no category are returned unchanged.

    Parameters
    ----------
    x : str or None
        Activity description. Anything that is not a string (``None``, NaN)
        is returned unchanged.
    categories : sequence of str, optional
        Category labels, most important first. Defaults to the module-level
        ``top_activities``.

    Returns
    -------
    str or None
        The category, or ``x`` itself when no category applies.
    """
    if not isinstance(x, str):
        return x
    if categories is None:
        categories = top_activities
    if x in categories:
        return x
    # Empty labels would match every value, so they are skipped.
    contained = [word for word in categories if word and word in x]
    if not contained:
        return x
    # max() keeps the first of several equally long candidates.
    return max(contained, key=len)

attacks_clean["Activity"] = attacks_clean["Activity"].apply(lambda x: activities_checks(x))

# Column: Outcome (originally "Fatal (Y/N)")

In [None]:
# Checking how everything looks and what stands out: 
find_non_default_missing_values(attacks_clean, "Outcome", "string")

In [447]:
# Changing values to corresponding categories
attacks_clean["Outcome"] = attacks_clean["Outcome"].replace({"n" : "nonfatal", "y" : "fatal", "m" : "unknown", "2017": "unknown", np.nan : None})

# Column: Age

In [None]:
# Checking how everything looks and what stands out: 
find_non_default_missing_values(attacks_clean, "Age", "string")

In [448]:
# change NaN to None
attacks_clean["Age"] = attacks_clean["Age"].replace({np.nan : None, "": None})

# convert non-null values to string
non_null = attacks_clean["Age"].notnull()
attacks_clean.loc[non_null, "Age"] = attacks_clean.loc[non_null, "Age"].astype(str)

# Numeric ages per group, as strings because the column holds strings.
child = [str(i) for i in range(0, 13)]
adolescent = [str(i) for i in range(13, 18)]
adult = [str(i) for i in range(18, 130)]

# Ages that are described in words (or otherwise cannot be parsed as numbers),
# collected while checking what could not be classified automatically.
child_words = ["child", "2 to 3 months", "9 months", "18 months", "2½", "both 11", "6½"]
adolescent_words = ["teen", "teens"]
adult_words = ["adult" , "middle-age", "(adult)", "20s", "30s", "40s", "50s", "elderly", "60's", "60s", "mid-20s", "mid-30s", "ca. 33"]


def age_groups(age):
    """Classify an age description as "child", "adolescent" or "adult".

    The text is split on commas, ampersands, whitespace, question marks and
    the words "to" / "or", so values that mention several ages ("9 & 12",
    "13 or 14") can be handled. A group is assigned when *every* age that
    was found belongs to it, or when the whole text is one of the group's
    known descriptions. The groups are tried in the order child, adolescent,
    adult.

    Parameters
    ----------
    age : str or None
        Age description. ``None`` and other non-string values give ``None``.

    Returns
    -------
    str or None
        The group label, or ``None`` when the value is missing, contains no
        age at all, or mixes ages from different groups.
    """
    if not isinstance(age, str):
        return None

    # Split into separate ages where needed and drop the empty pieces.
    pieces = re.split(r'[,&\s?]|(?:\s*(?:to|or)\s*)', age)
    tokens = [piece for piece in pieces if piece]

    groups = (
        ("child", child, child_words),
        ("adolescent", adolescent, adolescent_words),
        ("adult", adult, adult_words),
    )
    for label, numeric_ages, descriptions in groups:
        # `tokens` must be non-empty: all() over an empty list is True, which
        # would classify a value made only of separators (e.g. "?") as a child.
        every_age_in_group = bool(tokens) and all(
            token in numeric_ages for token in tokens
        )
        if every_age_in_group or age in descriptions:
            return label
    return None

attacks_clean.insert(4, "Age_group", attacks_clean["Age"].apply(age_groups))

# Column: Injury

In [None]:
# Checking how everything looks and what stands out: 
find_non_default_missing_values(attacks_clean, "Injury", "string")

In [605]:
# temporary copy:
copy = attacks_clean.copy(deep=True)

In [606]:
# Changing Nan to None
copy["Injury"] = copy["Injury"].replace({np.nan : None, "": None})

In [607]:
# Making a new column and filling it with whith the values of "Injury".
copy.insert(5, "Injury_2", copy["Injury"])

In [608]:
# Removing some words for better sorting
# (side and severity qualifiers are deleted as whole words, then the result is trimmed)
copy["Injury_2"] = copy["Injury_2"].str.replace(r"\bleft\b|\bright\b|\bminor\b|\bsevere\b|\blower\b|\bto dorsum of\b", "", regex=True).str.strip()
# copy["Injury_2"] = copy["Injury_2"].str.replace(r"\bleft\b|\bright\b", "", regex=True).str.strip()

# Correcting some words for better sorting
# NOTE(review): Series.replace with a dict compares whole cell values, so only
# cells that are exactly "injury" change here; use .str.replace if the word
# should also be corrected inside longer descriptions - confirm the intent.
copy["Injury_2"] = copy["Injury_2"].replace({"injury":"injuries"})

# Replace values that contain "fatal" with just "fatal"
# NOTE(review): \bfatal\b also matches inside "non-fatal" (the hyphen is a word
# boundary), so such a value would be collapsed to "fatal" too - verify against the data.
copy["Injury_2"] = copy["Injury_2"].str.replace(r".*\bfatal\b.*", "fatal", regex=True)

# Replace values that contain "no injury" with just "no injury"
copy["Injury_2"] = copy["Injury_2"].str.replace(r".*\bno injury\b.*", "no injury", regex=True)

In [609]:
# Replace "laceration" with "lacerations"
copy["Injury_2"] = copy["Injury_2"].str.replace(r"\blaceration\b", "lacerations", regex=True)
# Replace "lacerated" with "lacerations to"
copy["Injury_2"] = copy["Injury_2"].str.replace(r"\blacerated\b", "lacerations to", regex=True)
# copy["Injury_2"] = copy["Injury_2"].str.replace(r'\b(lacerated|laceration)\b', 'lacerations to', regex=True)


In [610]:
# If "lacerations to" is present: place it at the front of the string

def move_lacerations_to_front(injury):
    """Move the phrase "lacerations to" to the start of an injury description.

    Parameters
    ----------
    injury : str or missing value
        Injury description. Values that are not strings (``None``, or the NaN
        that pandas string methods produce for missing entries) are returned
        unchanged instead of raising.

    Returns
    -------
    str or missing value
        The description with surrounding whitespace removed. When it contains
        "lacerations to", every occurrence is removed and the phrase is put in
        front once, e.g. "foot lacerations to" becomes "lacerations to foot".
    """
    phrase = "lacerations to"
    if not isinstance(injury, str):
        return injury
    if phrase not in injury:
        return injury.strip()
    remainder = injury.replace(phrase, "").strip()
    # Strip the final result as well, so that a description consisting of the
    # phrase alone does not end in a dangling space.
    return f"{phrase} {remainder}".strip()

# Apply the function to the "Injury_2" column
copy["Injury_2"] = copy["Injury_2"].apply(move_lacerations_to_front)


In [611]:
# Remove double whitespaces
copy["Injury_2"] = copy["Injury_2"].str.replace(r"\s{2,}", " ", regex=True)

In [612]:
copy["Injury_2"].value_counts().head(60
                                    )

fatal                                            1360
no injury                                         804
foot bitten                                       178
lacerations to foot                               139
leg bitten                                        120
lacerations to leg                                111
survived                                           97
lacerations to hand                                58
calf bitten                                        49
hand bitten                                        47
thigh bitten                                       46
lacerations to thigh                               45
no details                                         43
lacerations to calf                                41
arm bitten                                         41
lacerations to arm                                 37
injuries                                           37
ankle bitten                                       24
lacerations to forearm      

In [511]:
# Looking at the head and tail of the column to see what stands out:
copy["Injury_2"].value_counts().head(10)
# copy["Injury_2"].value_counts().tail(50)


fatal             815
no injury         758
foot bitten       178
leg bitten        109
survived           97
calf bitten        49
thigh bitten       46
hand bitten        46
no details         43
foot lacerated     43
Name: Injury_2, dtype: int64

In [None]:
# copy["Injury_2"] = copy["Injury_2"].str.replace

In [481]:
# Remove double whitespaces
# regex=True is spelled out because Series.str.replace treats the pattern as a
# literal string by default since pandas 2.0; without it this line would do nothing.
copy["Injury_2"] = copy["Injury_2"].str.replace(r"\s{2,}", " ", regex=True)

In [None]:
# Check column 'Injury' for missing values

# find_non_default_missing_values(attacks_clean, 'Injury', 'string')

# DONE for now