<a href="https://colab.research.google.com/github/MevrouwHelderder/Assignments/blob/main/Assignment_Shark_Attack.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%load_ext google.colab.data_table

In [2]:
# Importing the essentials
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re

# importing the dataframe
path = "/content/drive/MyDrive/attacks.csv"
attacks = pd.read_csv(path, encoding="ISO-8859-1")
# Dropping columns.
attacks_drop_columns = attacks.drop(
    columns=[
        "Date",
        "Year",
        "Country",
        "Area",
        "Location",
        "Name",
        "Sex ",
        "Time",
        "Investigator or Source",
        "pdf",
        "href formula",
        "href",
        "Case Number.1",
        "Case Number.2",
        "original order",
        "Unnamed: 22",
        "Unnamed: 23",
    ]
)
# Renaming columns.
attacks_renamed = attacks_drop_columns.rename(
    columns={"Case Number": "Case", "Fatal (Y/N)": "Outcome", "Species ": "Species", "Ages": "Age"}
)
# Dropping rows.
# Making a copy to prevent view vs copy issues later on.
relevant_columns = list(attacks_renamed.columns[1:])
attacks_drop_rows = attacks_renamed.dropna(subset=relevant_columns, how="all").copy()
# Preparing the functions for locating and adjusting the missing values.
def print_separator(sep, num, msg):
    print("\n")
    print(sep * num)
    print(f"{msg}")
    print(sep * num)


# TACTIC A: find unique values
def look_at_unique_values(column):
    unique_values_cutoff = 50
    unique_values = column.unique()
    num_unique_values = len(unique_values)
    if num_unique_values == len(column):
        print(f"Each value in the column is unique (total: {num_unique_values})")
    elif num_unique_values < unique_values_cutoff:
        print(f"Less than {unique_values_cutoff} unique values:")
        try:
            sorted = np.sort(unique_values)
            print("Values are sorted")
            display(list(sorted))
        except:
            print("Could not sort values")
            display(list(unique_values))
    else:
        print(
            f"More than {unique_values_cutoff} unique values (total: {num_unique_values})"
        )


# TACTIC B: look at the edges
def look_at_edges(df, column_name):
    # inner function
    def show_head_and_tail(values):
        num_items_to_slice = 10
        display(list(values)[:num_items_to_slice])
        display(list(values)[-num_items_to_slice:])

    column = df[column_name]
    unique_values = column.unique()
    try:
        sorted = np.sort(unique_values)
        print("Unique values sorted, head and tail:")
        show_head_and_tail(sorted)
    except TypeError as error:
        print(f"Could not sort values: {error}")
        print("..so let's try filtering NULL values and then sorting")
        print("..there could be a black sheep in the null values")
        non_null_uniques = df.loc[~df[column_name].isnull(), column_name].unique()
        sorted = np.sort(non_null_uniques)
        show_head_and_tail(sorted)


# TACTIC C: casting to a type to see if all the values match the needed type
def cast_to_type(column, maybe_type):
    try:
        column.astype(maybe_type)
        print(f"Casting to {maybe_type} was successful")
    except ValueError as error:
        print(f"Could not cast to {maybe_type}: {error}")


# TACTIC D: display the value count of the column
def value_count(column):
    display(column.value_counts(dropna=False))


# FUNCTION TO CHECK THE DATAFRAME FOR ALL FOUR TACTICS
def find_non_default_missing_values(df, column_name, maybe_type):
    long_separator_amount = 80
    short_separator_amount = 40
    # Print the header
    print_separator(
        "*",
        long_separator_amount,
        f'Finding non default missing values for column "{column_name}"',
    )
    print(f'Column "{column_name}" has datatype: {df.dtypes[column_name]}')
    column = df[column_name]
    # A
    print_separator("-", short_separator_amount, "A: Looking at unique values")
    look_at_unique_values(column)
    # B
    print_separator("-", short_separator_amount, "B: Sorting and looking at the edges")
    look_at_edges(df, column_name)
    # C
    print_separator("-", short_separator_amount, f"C: Casting to type: {maybe_type}")
    cast_to_type(column, maybe_type)
    # D
    print_separator(
        "-",
        short_separator_amount,
        "D: Looking at frequency\nAll default-NULL values will be bunched together as NaN",
    )
    value_count(column)
    print("\n")


# Function to replace non-default NULL values with default NULL values.
# ⚠️ Mutates df
def replace_value(df, column_name, missing_old, missing_new):
    df[column_name] = df[column_name].replace({missing_old: missing_new})


# Function to display the default NULL values in the column.
def display_default_null_values(df, column_name):
    nulls = df.loc[df[column_name].isnull()]
    print(f'Number of default null values in "{column_name}": {len(nulls)}')


# Easier to type
nat = np.datetime64("nat")


In [3]:
# Making a deep copy of the dataframe for more clarity while working on it
attacks_clean = attacks_drop_rows.copy(deep=True)

# Lowercase all strings and strip whitespace and/or quotationmarks around strings
attacks_clean = attacks_clean.applymap(lambda x: x.lower() if isinstance(x, str) else x)
attacks_clean = attacks_clean.applymap(lambda x: x.strip('" ') if isinstance(x, str) else x)

# Column: Species

First let me apologize for the amount stuff done that is probably technically not all needed for this assignment ;-)
I had a blast cleaning op this column and I used it to practice a lot of new skills.
Also: I recognise that in real life it would probably almost always be a waste of time to refine values that occur only a few times but I appreciated the practice ;-)

In [None]:
# Checking how everthing looks and what stands out: 

find_non_default_missing_values(attacks_clean, 'Species', 'string')

In [4]:
# First steps cleaning up:
# Goal:
# extract the species from the string where possible, change null values
# where needed


def tidy(x):
    if pd.isna(x):
        return None
    elif len(x.strip()) == 0:
        return None
    elif "shark" in x:
        return re.search(r"(\S+\s*)?shark", x).group()
    else:
        return f"check: {x}"


attacks_clean["Species"] = attacks_clean["Species"].apply(tidy)

# Things we can safely change to "no species confirmed":
no_species = [
    "invalid",
    "unidentified",
    "questionable",
    "possibly",
    "not confirmed",
    "unconfirmed",
    "doubtful",
    "captive",
    "unknown",
    "several",
    "colored",
]

# one or more digits followed by ' or " followed by
# zero or more ] followed by shark, whitespaces optional
inches = r'\d+\s*(["\']{1,})\s*\]*\s*shark'

# string containing two or less letters or digits or -
# followed by shark, whitespaces optional
small_string = r"^[a-z0-9-]{0,2}\s*shark$"

# lb or kg or foot followed by zero or more ] followed by shark, whitespaces optional
measurements = r"(kg|lb|foot)\s*\]*\s*shark"


def tidy_more(x):
    if x is not None and (
        any(word in x for word in no_species)
        or re.search(inches, x)
        or re.search(small_string, x)
        or re.search(measurements, x)
    ):
        return None
    else:
        return x


attacks_clean["Species"] = attacks_clean["Species"].apply(tidy_more)

# removing quotation marks and this weird little fellas that look the same but are different:  
def remove_weirdos(x):
    if x is None:
        return None
    else:
        return re.sub(r'["]+', "", x)

attacks_clean["Species"] = attacks_clean["Species"].apply(remove_weirdos)

# Checking all values that I previously marked as 'check'
mask_species = attacks_clean["Species"].str.contains("check", na=False)
attacks_clean[mask_species]

# Changing to the right species where possible
correct_species = [
    "blue pointer",
    "wobbegong",
    "whaler",
    "hammerhead",
    "porbeagle",
    "whitetip",
    "horn",
]

def correct_checks(x):
    if x is not None and ("check" in x):
        for word in correct_species:
            if word in x:
                return f"{word} shark"
        else:
            return None
    else:
        return x

attacks_clean["Species"] = attacks_clean["Species"].apply(lambda x: correct_checks(x))


# Last crumbs to clean up:
# attacks_clean['Species'].value_counts().head(50)
# attacks_clean['Species'].value_counts().tail(50)

useless = [
    "large shark",
    "female shark",
    "grey shark",
    "two shark",
    "the shark",
    "from shark",
    "little shark",
    "larger shark",
    "red shark",
    "young shark",
    "for shark",
    "metre shark",
    "juvenile shark",
    "gray shark",
    "finned shark",
]
attacks_clean["Species"] = attacks_clean["Species"].replace(
    dict.fromkeys(useless, None)
)

attacks_clean["Species"] = attacks_clean["Species"].replace(
    dict.fromkeys(
        ["seven-gill shark", "7-gill shark", "sevengill  shark"], "sevengill shark"
    )
)
attacks_clean["Species"] = attacks_clean["Species"].replace(
    dict.fromkeys(["black-tipped shark", "blacktip  shark"], "blacktip shark")
)
attacks_clean["Species"] = attacks_clean["Species"].replace(
    {"sand shark": "sandshark", "zambesi shark": "zambezi shark"}
)

# Those last values are probably useless but since there is no way to definitively know if they represent a real species or not I'll leave them as is for now.
# attacks_clean['Species'].value_counts().head(50)
# attacks_clean['Species'].value_counts().tail(50)

# Column: Case

In [None]:
# Checking how everything looks and what stands out: 
find_non_default_missing_values(attacks_clean, "Case", "string")

In [None]:
# There are some duplicate values. 
# Checking what they mean: 

attacks_clean[attacks_clean.duplicated('Case', keep=False)]

# They might indicatie cases where more then one person was attacked. 
# The columns regarding location and time might help here if needed but since 
# there are few and they seem irrelevant to our questions I will leave them for now

# Column: Type

In [None]:
# Checking how everything looks and what stands out: 
find_non_default_missing_values(attacks_clean, "Type", "string")

In [5]:
# Replace a few different values that all mean the same plus replace np.nan.
attacks_clean["Type"] = attacks_clean["Type"].replace({"boating" : "boat", "boatomg" : "boat", np.nan: None})


# Column: Activity

In [None]:
# Column Activity
# Checking how everthing looks and what stands out: 
find_non_default_missing_values(attacks_clean, "Activity", "string")

In [6]:
# Replacing the some things that stand out right away
attacks_clean["Activity"] = attacks_clean["Activity"].replace({".": None, np.nan: None})

In [None]:
# Checking all none - unique values: 
attacks_clean[attacks_clean.duplicated("Activity", keep=False)]

# Looking at the head and tail of the column to see what stands out:
attacks_clean["Activity"].value_counts().head(60)
# attacks2["Activity"].value_counts().tail(50)

In [7]:
# First: a few different types of describing the same activity make up for a pretty
# big group.
# Let's group them together.
attacks_clean["Activity"] = attacks_clean["Activity"].replace(
    dict.fromkeys(["boogie boarding", "paddle boarding", "body-boarding", "body boarding", "paddle-boarding"], "bodyboarding")
)

In [8]:
# Taking the top 60 values and dividing the other values, where possible,
# into those categories.

top_activities = list(attacks_clean["Activity"].value_counts().head(60).index)

def activities_checks(x):
    if x is not None:
      for word in top_activities:
          if word in x:
              return word
      else:
          return x

attacks_clean["Activity"] = attacks_clean["Activity"].apply(lambda x: activities_checks(x))

In [None]:
# Quickly scanninghecking the rows that have a value count of 1 for column "Activity"
# to see if anything stands out

value_counts = attacks_clean["Activity"].value_counts()
mask_activity = attacks_clean["Activity"].isin(value_counts[value_counts == 1].index)
attacks_clean[mask_activity]

# There seem to be no activities that can be grouped together. Everything is 
# pretty subjective so for now I'll leave it.

# Column: Fatal

In [None]:
# Checking how everything looks and what stands out: 
find_non_default_missing_values(attacks_clean, "Outcome", "string")

In [9]:
# Changing values to corresponding categories
attacks_clean["Outcome"] = attacks_clean["Outcome"].replace({"n" : "nonfatal", "y" : "fatal", "m" : "unknown", "2017": "unknown", np.nan : None})

# Column: Age

In [None]:
# Checking how everything looks and what stands out: 
find_non_default_missing_values(attack_next, "Age", "string")
# Since one of the questions is about children vs adults I will add a column
# with the corresponding age group



In [104]:
# Making temporary copy for easier work
attack_next = attacks_clean.copy(deep=True)

In [106]:
# change NaN to None
attack_next["Age"] = attack_next["Age"].replace({np.nan : None})

# convert non-null values to string
non_null = attack_next["Age"].notnull()
attack_next.loc[non_null, "Age"] = attack_next.loc[non_null, "Age"].astype(str)

In [109]:
all_strings = attack_next.loc[non_null, "Age"].dtype == 'object' and attack_next.loc[non_null, "Age"].apply(lambda x: isinstance(x, str)).all()
print(f"All values in Age column are strings: {all_strings}")

All values in Age column are strings: True


In [85]:
child = ["child"] + [str(i) for i in range(0, 13)]
adolescent = ["teen"] + ["teens"] +  [str(i) for i in range(13, 19)]
adult = ["adult"] + ["middle-age"] + ["(adult)"] [str(i) for i in range(19, 66)]
elderly = ["elderly"] + [str(i) for i in range(66, 120)]
checkwords = ["or", ">", "<", "?", "&"]

In [110]:
def categorize_age(x):
  if x is None:
    return x
  elif x in child:
    return "child"
  elif x in adolescent:
    return "adolescent"
  elif x in adult:
    return "adult"
  elif x in elderly:
    return "elderly"
  else:
    return "check: " + str(x)


attack_next.insert(4, "Age_group", attack_next["Age"].apply(categorize_age))

In [None]:
# KEEP FOR A BIT IN CASE SOMETHING GOES WRONG

# def categorize_age(x):
#   if x is None:
#     return x
#   if isinstance(x, int):
#     if x < 13:
#       return "child"
#     elif x < 18:
#       return "adolescent"
#     elif x < 65:
#       return "adult"
#     elif x >= 65:
#       return "elderly"
#     else: 
#       return x
#   elif isinstance(x, str):
#     if x in child:
#       return "child"
#     elif x in adolescent:
#         return "adolescent"
#     elif x in adult:
#         return "adult"
#     elif x in elderly:
#         return "elderly"
#     else:
#       return "check: " + x
#   else:
#     return "check: " + str(x)

# attack_next.insert(4, "Age_group", attack_next["Age"].apply(categorize_age))

In [None]:
"or", ">", "<", "?", "&"

In [111]:
attack_next

Unnamed: 0,Case,Type,Activity,Age,Age_group,Injury,Outcome,Species
0,2018.06.25,boat,paddling,57,adult,"no injury to occupant, outrigger canoe and pad...",nonfatal,white shark
1,2018.06.18,unprovoked,standing,11,child,minor injury to left thigh,nonfatal,
2,2018.06.09,invalid,surfing,48,adult,injury to left lower leg from surfboard skeg,nonfatal,
3,2018.06.08,unprovoked,surfing,,,minor injury to lower leg,nonfatal,
4,2018.06.04,provoked,diving,,,lacerations to leg & hand shark provoked incident,nonfatal,tiger shark
...,...,...,...,...,...,...,...,...
6297,nd.0005,unprovoked,diving,,,fatal,fatal,
6298,nd.0004,unprovoked,diving,,,fatal,fatal,
6299,nd.0003,unprovoked,swimming,,,fatal,fatal,
6300,nd.0002,unprovoked,,,,fatal,fatal,


In [61]:
# Checking all values that I previously marked as 'check'
mask_age = attack_next["Age_group"].str.contains("check", na=False)
attack_next[mask_age]

Unnamed: 0,Case,Type,Activity,Age,Age_group,Injury,Outcome,Species
1824,2002.09.27.b,provoked,fishing,,check:,left thumb lacerated provoked incident,nonfatal,blacktip shark
1825,2002.09.27.a,provoked,swimming,,check:,thigh lacerated provoked incident,nonfatal,tiger shark
3108,1978.12.29,unprovoked,,,check:,survived,nonfatal,
4052,1959.10.07,unprovoked,swimming,x,check: x,"fatal, died in lourenco marques hospital",fatal,
4390,1953.01.08,boat,fishing,make line green,check: make line green,"no injury to occupant, shark charged boat",nonfatal,
4447,1951.03.24,unprovoked,fishing,young,check: young,severe lacerations of chest & thigh,nonfatal,
4676,1943.05.14,sea disaster,hospital ship centaur torpedoed & sunk by the ...,f,check: f,fatal,fatal,
4886,1936.02.23,provoked,touching the mouth of a supposedly dead shark,young,check: young,finger bitten provoked incident,nonfatal,
5084,1929.01.27,unprovoked,swimming,,check:,"fatal, right buttock lacerated, left arm sever...",fatal,
5170,1925.03.10,unprovoked,swimming,a.m.,check: a.m.,fatal,fatal,


In [None]:
# if & or or: check

In [None]:
# Cleaning up the rest



In [None]:
# Check column 'Injury' for missing values

# find_non_default_missing_values(attacks_clean, 'Injury', 'string')

# DONE for now