<a href="https://colab.research.google.com/github/MevrouwHelderder/Assignments/blob/main/Assignment_Shark_Attack.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%load_ext google.colab.data_table

In [2]:
# Importing the essentials
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
# Read the raw attacks CSV (ISO-8859-1 encoded) from the path below.
path = '/content/drive/MyDrive/attacks.csv'
attacks = pd.read_csv(path, encoding="ISO-8859-1")

# Columns that are removed before the analysis starts.
# Note that 'Sex ' is spelled with a trailing space in the raw header.
irrelevant_columns = [
    'Date', 'Year', 'Country', 'Area', 'Location', 'Name', 'Sex ', 'Time',
    'Investigator or Source', 'pdf', 'href formula', 'href',
    'Case Number.1', 'Case Number.2', 'original order',
    'Unnamed: 22', 'Unnamed: 23',
]
attacks_drop_columns = attacks.drop(columns=irrelevant_columns)

# Shorter column names; 'Species ' also has a trailing space in the raw header.
column_renames = {'Case Number': 'Case', 'Fatal (Y/N)': 'Fatal', 'Species ': 'Species'}
attacks_renamed = attacks_drop_columns.rename(columns=column_renames)

# Drop rows in which every column except the first one is missing.
# The explicit .copy() avoids view-vs-copy issues when the frame is edited later.
relevant_columns = attacks_renamed.columns[1:].tolist()
attacks_drop_rows = attacks_renamed.dropna(how='all', subset=relevant_columns).copy()
# Preparing the functions for locating and adjusting the missing values.
def print_separator(sep, num, msg):
  """Print two blank lines, then `msg` framed above and below by `sep` repeated `num` times."""
  rule = sep * num
  # Joining the four pieces with newlines yields the same text as four separate print calls.
  print("\n", rule, f"{msg}", rule, sep="\n")
# TACTIC A: find unique values
def look_at_unique_values(column, unique_values_cutoff=50):
  """Print a summary of the distinct values in `column`.

  Parameters
  ----------
  column : pandas.Series
      The column to inspect.
  unique_values_cutoff : int, default 50
      The distinct values are only listed when there are fewer than this many.

  Behaviour
  ---------
  * Every value distinct: prints the total only.
  * Fewer distinct values than the cutoff: displays them, sorted when possible.
  * Otherwise: prints the total only.
    NOTE: a count exactly equal to the cutoff is also reported with the "More than" message.
  """
  unique_values = column.unique()
  num_unique_values = len(unique_values)
  if num_unique_values == len(column):
    print(f"Each value in the column is unique (total: {num_unique_values})")
  elif num_unique_values < unique_values_cutoff:
    print(f"Less than {unique_values_cutoff} unique values:")
    try:
      # Mixed types (e.g. strings next to NaN/None) cannot be ordered and raise TypeError.
      sorted_values = np.sort(unique_values)
    except TypeError:
      # Only the sorting failure is handled; any other error is allowed to surface.
      print("Could not sort values")
      display(list(unique_values))
    else:
      print("Values are sorted")
      display(list(sorted_values))
  else:
    print(f"More than {unique_values_cutoff} unique values (total: {num_unique_values})")
# TACTIC B: look at the edges
def look_at_edges(df, column_name):
  """Display the first and last ten sorted distinct values of `df[column_name]`.

  If the distinct values cannot be sorted (TypeError), the reason is printed and
  the sort is retried on the non-null values only.
  """
  edge_size = 10

  def show_head_and_tail(values):
      # Display the leading slice first, then the trailing slice.
      as_list = list(values)
      display(as_list[:edge_size])
      display(as_list[-edge_size:])

  distinct = df[column_name].unique()
  try:
      ordered = np.sort(distinct)
  except TypeError as error:
      print(f"Could not sort values: {error}")
      print("..so let's try filtering NULL values and then sorting")
      print("..there could be a black sheep in the null values")
      not_null = df[column_name].notnull()
      ordered = np.sort(df.loc[not_null, column_name].unique())
  else:
      print("Unique values sorted, head and tail:")
  show_head_and_tail(ordered)
# TACTIC C: casting to a type to see if all the values match the needed type
def cast_to_type(column, maybe_type):
  """Report whether every value in `column` can be cast to `maybe_type`.

  The cast is only attempted; its result is discarded and `column` is not
  modified. The outcome is printed rather than returned.

  Parameters
  ----------
  column : pandas.Series
      The column to probe.
  maybe_type : str or type
      Anything accepted by `Series.astype`.
  """
  try:
    column.astype(maybe_type)
  except (ValueError, TypeError) as error:
    # astype raises ValueError for unparsable values (e.g. 'abc' -> int) and
    # TypeError for unsupported ones (e.g. None -> int); both mean "does not fit".
    print(f"Could not cast to {maybe_type}: {error}")
  else:
    print(f"Casting to {maybe_type} was successful")
# TACTIC D: display the value count of the column
def value_count(column):
  """Display how often each value occurs in `column`, counting missing values as well."""
  frequencies = column.value_counts(dropna=False)
  display(frequencies)
# FUNCTION TO CHECK THE DATAFRAME FOR ALL FOUR TACTICS
def find_non_default_missing_values(df, column_name, maybe_type):
  """Run tactics A-D on `df[column_name]` and print a report.

  A lists the unique values, B shows the sorted edges, C tries a cast to
  `maybe_type`, and D shows value frequencies. Each tactic is preceded by its
  own separator heading. Nothing is returned.
  """
  banner_width = 80
  section_width = 40

  # Header: which column is inspected and its current dtype.
  print_separator("*", banner_width, f"Finding non default missing values for column \"{column_name}\"")
  print(f"Column \"{column_name}\" has datatype: {df.dtypes[column_name]}")

  column = df[column_name]

  # Each entry pairs the section heading with the tactic to run under it.
  tactics = (
    ("A: Looking at unique values",
     lambda: look_at_unique_values(column)),
    ("B: Sorting and looking at the edges",
     lambda: look_at_edges(df, column_name)),
    (f"C: Casting to type: {maybe_type}",
     lambda: cast_to_type(column, maybe_type)),
    ("D: Looking at frequency\nAll default-NULL values will be bunched together as NaN",
     lambda: value_count(column)),
  )
  for heading, run_tactic in tactics:
    print_separator("-", section_width, heading)
    run_tactic()
  print("\n")
# Function to replace non-default NULL values with default NULL values.
# ⚠️ Mutates df
def replace_value(df, column_name, missing_old, missing_new):
  """Replace `missing_old` with `missing_new` in `df[column_name]`, assigning the result back onto `df`."""
  substitution = {missing_old: missing_new}
  cleaned = df[column_name].replace(substitution)
  df[column_name] = cleaned
# Function to display the default NULL values in the column.
def display_default_null_values(df, column_name):
  """Print how many rows of `df` hold a null value in `column_name`."""
  null_count = int(df[column_name].isnull().sum())
  print(f"Number of default null values in \"{column_name}\": {null_count}")
# Easier to type
nat = np.datetime64('nat')
# Making a deep copy of the dataframe so the un-cleaned frame stays available for reference.
attacks_clean = attacks_drop_rows.copy(deep=True)
# Lowercase all strings and strip whitespace and/or quotation marks around them.
# Non-string cells (numbers, NaN) are passed through untouched.
attacks_clean = attacks_clean.applymap(
    lambda x: x.lower().strip('" ') if isinstance(x, str) else x
)
# Column Type
# Every string was lowercased above, so the spelling variants must be matched in
# lowercase; capitalised keys such as 'Boating' can no longer occur in the data
# and would make this replacement a silent no-op.
attacks_clean = attacks_clean.replace({'Type' : {np.nan : None, 'boating' : 'boat', 'boatomg' : 'boat'}})
# Column Activity
# A lone '.' is used as a placeholder for "no activity recorded".
attacks_clean['Activity'] = attacks_clean['Activity'].replace({'.': None})

# Column: Species

First let me apologize for the amount of stuff done that is probably technically not all needed for this assignment ;-)
I had a blast cleaning up this column and I used it to practice a lot of new skills.
Also: I recognise that in real life it would probably almost always be a waste of time to refine values that occur only a few times but I appreciated the practice ;-)

In [3]:
# First steps cleaning up: 
# Goal: 
# extract the species from the string where possible, change null values
# Steps: 
# If null value return: "no species confirmed"
# If there already is "no species confirmed": return the original value
# If the word shark is present: return that word plus the preceding word
# If there is anything else: add "check: " and return the original value

def tidy(x):
  """Reduce a raw species description to a short label.

  * Missing or blank input gives "no species confirmed".
  * Text containing "shark" gives the first "shark" together with the
    non-whitespace run directly before it (if any).
  * Text already containing "no species confirmed" is returned unchanged.
  * Anything else is returned prefixed with "check: " for manual review.
  """
  placeholder = "no species confirmed"
  # pd.isna must be tested first: NaN has no .strip().
  if pd.isna(x) or not x.strip():
    return placeholder
  if "shark" in x:
    return re.search(r"(\S+\s*)?shark", x).group()
  return x if placeholder in x else f'check: {x}'

# First pass: condense every raw description with tidy().
attacks_clean['Species'] = attacks_clean['Species'].map(tidy)

# Substrings that mark a description as carrying no usable species information.
no_species = [
    "invalid", "unidentified", "questionable", "possibly", "not confirmed",
    "unconfirmed", "doubtful", "captive", "unknown", "several", "colored",
]

# Digits, then one or more ' or ", then optional ] characters, then "shark"
# (whitespace optional between the parts), i.e. a length in feet/inches.
inches = r'\d+\s*(["\']{1,})\s*\]*\s*shark'

# The whole string is at most two letters, digits or hyphens followed by "shark".
small_string = r'^[a-z0-9-]{0,2}\s*shark$'

# A unit (kg, lb or foot), then optional ] characters, then "shark".
measurements = r'(kg|lb|foot)\s*\]*\s*shark'

def tidy_more(x):
  """Return "no species confirmed" when `x` is vague or is only a size/weight; otherwise return `x`."""
  has_vague_word = any(word in x for word in no_species)
  is_size_only = any(
      re.search(pattern, x) for pattern in (inches, small_string, measurements)
  )
  return "no species confirmed" if (has_vague_word or is_size_only) else x

# Second pass: collapse vague or size-only labels, then remove any double quotes
# still embedded in the value.
# NOTE(review): the earlier comment here mentioned look-alike quote characters,
# but the pattern only contains the plain ASCII double quote - confirm no
# characters were lost from the character class.
attacks_clean['Species'] = (
    attacks_clean['Species']
    .apply(tidy_more)
    .apply(lambda x: re.sub(r'["]+', '', x))
)

# Select the values that were flagged with "check" for manual review.
# This bare expression is not the last statement of the cell, so it is
# evaluated but nothing is rendered.
mask = attacks_clean.Species.str.contains("check").values
attacks_clean.loc[mask, "Species"]

# Changing to the right species where possible
correct_species = [
    'blue pointer', 'wobbegong', 'whaler', 'hammerhead',
    'porbeagle', 'whitetip', 'horn',
]

def correct_checks(x):
  """Resolve a value flagged with 'check'.

  Values without 'check' are returned unchanged. For flagged values, the first
  entry of `correct_species` found in the text is returned as '<entry> shark';
  if none is found the result is "no species confirmed".
  """
  if 'check' not in x:
    return x
  recognised = next((word for word in correct_species if word in x), None)
  return "no species confirmed" if recognised is None else f'{recognised} shark'

# Resolve every value that is still flagged with "check".
attacks_clean['Species'] = attacks_clean['Species'].apply(correct_checks)

# Last crumbs to clean up:
# attacks_clean['Species'].value_counts().head(50)
# attacks_clean['Species'].value_counts().tail(50)

# Labels where the word before "shark" is not a species.
useless = [
    'large shark', 'female shark', 'grey shark', 'two shark', 'the shark',
    'from shark', 'little shark', 'larger shark', 'red shark', 'young shark',
    'for shark', 'metre shark', 'juvenile shark', 'gray shark',
]

# One combined mapping: non-species labels become the placeholder, and spelling
# variants are merged. No replacement value is itself a key, so a single pass
# gives the same result as applying the replacements one after another.
species_replacements = dict.fromkeys(useless, 'no species confirmed')
species_replacements.update({
    'seven-gill shark': 'sevengill shark',
    '7-gill shark': 'sevengill shark',
    'black-tipped shark': 'blacktip shark',
})
attacks_clean['Species'] = attacks_clean['Species'].replace(species_replacements)

# Those last values are probably useless but since there is no way to definitively know if they represent a real species or not I'll leave them as is for now.

In [None]:

 # Column fatal
# Check column 'Fatal' for missing values
# find_non_default_missing_values(attacks_clean, 'Fatal', 'bool')
# display(attacks_clean.loc[(attacks_clean['Fatal']== False) | (attacks_clean['Fatal']== True)])
# Replacing values that are clear about their meaning with their boolean equivalents 
# attacks_clean['Fatal'] = attacks_clean['Fatal'].replace(dict.fromkeys(['y'], True))
# attacks_clean['Fatal'] = attacks_clean['Fatal'].replace(dict.fromkeys(['n'], False))
# replace_value(attacks_clean, 'Fatal', 'y', True)
# replace_value(attacks_clean, 'Fatal', 'n', False)
# Change values that are not clear to None for now, might remove them later.
# df[column_name].replace({missing_old: missing_new})
# non_bool = attacks_clean.loc[~((attacks_clean['Fatal'] == False) | (attacks_clean['Fatal'] == True))]
# attacks_clean['Fatal'] = attacks_clean['Fatal'].replace({missing_old: missing_new})
# attacks_clean['Fatal'] = attacks_clean.loc[~((attacks_clean['Fatal'] == False) | (attacks_clean['Fatal'] == True))] = None

In [None]:
# Column Age
# Check column 'Age' for missing values
# find_non_default_missing_values(attacks_clean, 'Age', 'string')
# Strip the leading/trailing whitespace and quotation marks.
# attacks_clean['Age'] = attacks_clean['Age'].str.strip('"\' \()\\')
# Replace \xa0 and np.nan with None
# attacks_clean['Age'] = attacks_clean['Age'].replace(dict.fromkeys(['\xa0',np.nan], None))
# There are a lot of unclear ages. Since we want to know if there is a difference 
# between children and adults I think it is best to divide them into adult and child
# Transform the column so all clear numericals are in the right categories
# attacks_clean.transform({'Age': str.capitalize, "price": lambda price: round(price * 1.1)})





---



---






#Take [this](https://www.kaggle.com/felipeesc/shark-attack-dataset) dataset. 
Use all the skills you've learned up until now to answer the following questions as well as possible. 

* What are the most dangerous types of sharks to humans? 
* Are children more likely to be attacked by sharks? 
* Are shark attacks where sharks were provoked more or less dangerous? 
* Are certain activities more likely to result in a shark attack? 

If you feel you can't answer a question based on the dataset alone, feel free to find other datasets and use them in answering the questions.

For each answer you give not only answer the question but also write about the assumptions you made in answering the question. If an assumption or decision possibly created a bias please write about this as well.

In [None]:
# # Importing the essentials
# import numpy as np
# import pandas as pd
# import matplotlib.pyplot as plt
# import re

# %load_ext google.colab.data_table

In [None]:
# # importing the dataframe
# path = '/content/drive/MyDrive/attacks.csv'

In [None]:
# attacks = pd.read_csv(path, encoding="ISO-8859-1")

## **Part 1: Rows and columns**

**COLUMNS**: There are columns regarding the date, time and location that seem to be irrelevant to the questions asked.

However, I'm not sure if I might need them later on, for example during imputation.

For now I will remove all seemingly irrelevant columns but I might come back to this later.<br><br>
**ROWS**: There seem to be a lot of rows where all values, or all values except the 'Case Number', are NaN.

Since only a case number but no other data is useless in this case let's remove those rows.

In [None]:
# # Dropping columns.

# attacks_drop_columns = attacks.drop(columns=['Date', 'Year','Country', 'Area', 'Location','Name', 'Sex ', 'Time', 'Investigator or Source', 'pdf', 'href formula', 'href',
#        'Case Number.1', 'Case Number.2', 'original order', 'Unnamed: 22',
#        'Unnamed: 23'])

In [None]:
# # Checking the names of the columns to see if anything needs to be adjusted.

# attacks_drop_columns.columns

In [None]:
# # Renaming columns.

# attacks_renamed = attacks_drop_columns.rename(columns={'Case Number': 'Case', 'Fatal (Y/N)': 'Fatal', 'Species ': 'Species' })

# # Checking the names of the columns to see if everything went well.

# display (attacks_renamed.columns)

In [None]:
# # Dropping rows.
# # Making a copy to prevent view vs copy issues later on.

# relevant_columns = list(attacks_renamed.columns[1:])

# attacks_drop_rows = attacks_renamed.dropna(subset=relevant_columns, how='all').copy()


## **Part 2: Missing values**
Time to find default and non-default missing values. 
I will use the tactics and functions we also used in an earlier exercise. 

I realize those functions aren't my own but I figured it would be useless to change a bit and pretend it is my own when it is much more important to show I know how and when to use the functions.
So... credit to Winc and off to the next part!

In [None]:
# # Preparing the functions for locating and adjusting the missing values.

# def print_separator(sep, num, msg):
#   print("\n")
#   print(sep * num)
#   print(f"{msg}")
#   print(sep * num)

# # TACTIC A: find unique values
# def look_at_unique_values(column): 
#   unique_values_cutoff = 50
#   unique_values = column.unique()
#   num_unique_values = len(unique_values)
#   if num_unique_values == len(column):
#     print(f"Each value in the column is unique (total: {num_unique_values})")
#   elif num_unique_values < unique_values_cutoff: 
#     print(f"Less than {unique_values_cutoff} unique values:")
#     try:
#       sorted = np.sort(unique_values)
#       print("Values are sorted")
#       display(list(sorted))
#     except:
#       print("Could not sort values")
#       display(list(unique_values))
#   else:
#     print(f"More than {unique_values_cutoff} unique values (total: {num_unique_values})")

# # TACTIC B: look at the edges
# def look_at_edges(df, column_name):
#   # inner function
#   def show_head_and_tail(values):
#       num_items_to_slice = 10
#       display(list(values)[:num_items_to_slice]) 
#       display(list(values)[-num_items_to_slice:]) 
#   column = df[column_name]
#   unique_values = column.unique()
#   try: 
#       sorted = np.sort(unique_values)
#       print("Unique values sorted, head and tail:")
#       show_head_and_tail(sorted)
#   except TypeError as error:
#       print(f"Could not sort values: {error}")
#       print("..so let's try filtering NULL values and then sorting")
#       print("..there could be a black sheep in the null values")
#       non_null_uniques = df.loc[~df[column_name].isnull(), column_name].unique()
#       sorted = np.sort(non_null_uniques)
#       show_head_and_tail(sorted)

# # TACTIC C: casting to a type to see if all the values match the needed type
# def cast_to_type(column, maybe_type):
#   try:
#     column.astype(maybe_type)
#     print(f"Casting to {maybe_type} was successful")
#   except ValueError as error:
#     print(f"Could not cast to {maybe_type}: {error}")

# # TACTIC D: display the value count of the column
# def value_count(column):
#   display(column.value_counts(dropna=False))

# # FUNCTION TO CHECK THE DATAFRAME FOR ALL FOUR TACTICS
# def find_non_default_missing_values(df, column_name, maybe_type):
#   long_separator_amount = 80
#   short_separator_amount = 40

#   # Print the header
#   print_separator("*", long_separator_amount, f"Finding non default missing values for column \"{column_name}\"")

#   print(f"Column \"{column_name}\" has datatype: {df.dtypes[column_name]}")

#   column = df[column_name]  

#   # A
#   print_separator("-", short_separator_amount, "A: Looking at unique values")
#   look_at_unique_values(column)

#   # B
#   print_separator("-", short_separator_amount, "B: Sorting and looking at the edges")
#   look_at_edges(df, column_name)

#   # C
#   print_separator("-", short_separator_amount, f"C: Casting to type: {maybe_type}")
#   cast_to_type(column, maybe_type)

#   # D
#   print_separator("-", short_separator_amount, "D: Looking at frequency\nAll default-NULL values will be bunched together as NaN")
#   value_count(column)
#   print("\n")

# # Function to replace non-default NULL values with default NULL values.
# # ⚠️ Mutates df
# def replace_value(df, column_name, missing_old, missing_new):
#   df[column_name] = df[column_name].replace({missing_old: missing_new})

# # Function to display the default NULL values in the column.
# def display_default_null_values(df, column_name):
#   nulls = df.loc[df[column_name].isnull()]
#   print(f"Number of default null values in \"{column_name}\": {len(nulls)}")

# # Easier to type
# nat = np.datetime64('nat')

In [None]:
# Making a deep copy of the dataframe.
# I understand that it is usually not advised to make a copy but instead work in the original dataframe.
# However, for this assignment I would like to be able to use this later for reference. 
# Having a copy makes it easier to later on see what is changed and what not.

# attacks_clean = attacks_drop_rows.copy(deep=True)

In [None]:
# attacks_clean

In [None]:
# Displaying information on the columns, their types and the count of non-null values.

# attacks_clean.info()



In [None]:
# # Lowercase all strings and strip whitespace and/or quotationmarks around strings

# attacks_clean = attacks_clean.applymap(lambda x: x.lower() if isinstance(x, str) else x)
# attacks_clean = attacks_clean.applymap(lambda x: x.strip('" ') if isinstance(x, str) else x)

In [None]:
# Check column 'Case' for missing values

# find_non_default_missing_values(attacks_clean, 'Case', 'string')

# Check the non-unique values to see what is going on.

# attacks_clean[attacks_clean.duplicated('Case', keep=False)]

# They might indicate cases where more than one person was attacked. 
# The columns regarding location and time might help here
# Since there are few and they seem irrelevant to our questions I will leave them for now

# DONE

In [None]:
# Check column 'Type' for missing values

# find_non_default_missing_values(attacks_clean, 'Type', 'string')

# Changing NaN to None and the different spellings of boat accidents to boat.

# attacks_clean = attacks_clean.replace({'Type' : {np.nan : None, 'Boating' : 'Boat', 'Boatomg' : 'Boat'}})

# DONE

In [None]:
# Check column 'Activity' for missing values

# find_non_default_missing_values(attacks_clean, 'Activity', 'string')

# Replacing with default values where needed
# attacks_clean['Activity'] = attacks_clean['Activity'].replace({'.': None})

# DONE


In [None]:
# Check column 'Injury' for missing values

# find_non_default_missing_values(attacks_clean, 'Injury', 'string')

# DONE for now

In [None]:
# Check column 'Species' for missing values

# find_non_default_missing_values(attacks_clean, 'Species', 'string')

In [None]:
# Condensing all values to only the species

# All unique values in this column: 
# species_old = attacks_clean['Species'].unique()
# species_old

In [None]:
# Thinking steps
# I need to fill a new column with: 
# if the word 'shark' occurs once: extract the word before 'shark' and display that plus 'shark'
# else: display 'needs looking at'


In [None]:
# Return nurse shark
# string = 'said to involve a grey nurse shark that leapt out of the water and  seized the boy but species identification is questionable'
# word7 = "(\S+\s)?shark" # nurse shark

# species = re.search(word7, string)
# species.group()

In [None]:
# Check column 'Fatal' for missing values

# find_non_default_missing_values(attacks_clean, 'Fatal', 'bool')

# display(attacks_clean.loc[(attacks_clean['Fatal']== False) | (attacks_clean['Fatal']== True)])

# Replacing values that are clear about their meaning with their boolean equivalents 

# attacks_clean['Fatal'] = attacks_clean['Fatal'].replace(dict.fromkeys(['y'], True))
# attacks_clean['Fatal'] = attacks_clean['Fatal'].replace(dict.fromkeys(['n'], False))
# replace_value(attacks_clean, 'Fatal', 'y', True)
# replace_value(attacks_clean, 'Fatal', 'n', False)

# Change values that are not clear to None for now, might remove them later.
# df[column_name].replace({missing_old: missing_new})

# non_bool = attacks_clean.loc[~((attacks_clean['Fatal'] == False) | (attacks_clean['Fatal'] == True))]
# attacks_clean['Fatal'] = attacks_clean['Fatal'].replace({missing_old: missing_new})
# attacks_clean['Fatal'] = attacks_clean.loc[~((attacks_clean['Fatal'] == False) | (attacks_clean['Fatal'] == True))] = None



In [None]:
# Check column 'Age' for missing values

# find_non_default_missing_values(attacks_clean, 'Age', 'string')

# Strip the leading/trailing whitespace and quotation marks.

# attacks_clean['Age'] = attacks_clean['Age'].str.strip('"\' \()\\')

# Replace \xa0 and np.nan with None
# attacks_clean['Age'] = attacks_clean['Age'].replace(dict.fromkeys(['\xa0',np.nan], None))

# There are a lot of unclear ages. Since we want to know if there is a difference 
# between children and adults I think it is best to divide them into adult and child

# Transform the column so all clear numericals are in the right categories
# attacks_clean.transform({'Age': str.capitalize, "price": lambda price: round(price * 1.1)})

