In [27]:
import pandas as pd
import numpy as np


In [28]:
shark_attacks = pd.read_csv('attacks.csv', encoding='cp1252')
shark_attacks.iloc[0:2].T

Unnamed: 0,0,1
Case Number,2018.06.25,2018.06.18
Date,25-Jun-2018,18-Jun-2018
Year,2018.0,2018.0
Type,Boating,Unprovoked
Country,USA,USA
Area,California,Georgia
Location,"Oceanside, San Diego County","St. Simon Island, Glynn County"
Activity,Paddling,Standing
Name,Julie Wolfe,Adyson McNeely
Sex,F,F


In [29]:
def print_separator(sep, num, msg):
  print("\n")
  print(sep * num)
  print(f"{msg}")
  print(sep * num)

In [30]:
def unique_values(df, column):
    print_separator('*', 50, "Unique Values")
    unique_vals = df[column].unique()
    try:
        sorted = np.sort(unique_vals)
        print('Sorting succesfull')
        display(list(sorted))
    except TypeError as error:
        print(f'Sorting failed: {error}')
        display(list(unique_vals))

In [31]:
def exploring_data(df, column):
    print_separator('-', 50, f'Checking column #{column}')
    unique_values(df, column)
    print_separator('*', 50, "Value counts")
    display(df[column].value_counts(dropna=False))

In [32]:
def cast_to_type(df, column, type):
  try:
    df[column].astype(type)
    print(f"Casting to {type} was successful")
  except ValueError as error:
    print(f"Could not cast to {type}: {error}")

In [33]:
#exploring_data(shark_attacks, 'Unnamed: 22')
#exploring_data(shark_attacks, 'Unnamed: 23')

#Columns Unnamed:23 and Unnamed: 22 contain mainly nan. Dropping from dataframe for irrelevance.
def clean_data():
    data = pd.read_csv('attacks.csv', encoding='cp1252')
    data = data.drop(columns=['Unnamed: 22', 'Unnamed: 23'])
    return data

Assumption / Bias: 
1. Name of Shark is in front of the word 'Shark|shark'.
2. The name of a shark doesn't contain a digit
3. The of the shark is only one word
3. One cell contains only one shark name
4. All NA are marked as unknown shark

**What are the most dangerous types of sharks to humans?** 

White shark, Tiger Shark, Bull shark, Nurse Shark, Whaler Shark

In [34]:
import re

shark_clean = clean_data()
display(shark_clean.columns)
shark_clean.rename(columns={'Species ': 'Species'}, inplace=True)
shark_clean['Name of Shark'] = shark_clean['Species'].str.extract('(\w+(?=\s+shark|Shark))', flags=re.IGNORECASE)
shark_clean['Name of Shark'].fillna('unknown shark', inplace=True)
shark_clean['Name of Shark'] = shark_clean['Name of Shark'].str.lower()
series_shark = shark_clean['Name of Shark'].loc[(~shark_clean['Name of Shark'].str.contains('\\d', regex=True)) & (shark_clean['Name of Shark'].str.len() > 2)]

dang_shark = pd.DataFrame(series_shark.value_counts().keys().tolist(), columns=['Shark'])
dang_shark['Count'] = series_shark.value_counts().values.tolist()

display(dang_shark.iloc[1:6])

Index(['Case Number', 'Date', 'Year', 'Type', 'Country', 'Area', 'Location',
       'Activity', 'Name', 'Sex ', 'Age', 'Injury', 'Fatal (Y/N)', 'Time',
       'Species ', 'Investigator or Source', 'pdf', 'href formula', 'href',
       'Case Number.1', 'Case Number.2', 'original order'],
      dtype='object')

Unnamed: 0,Shark,Count
1,white,634
2,tiger,260
3,bull,177
4,nurse,97
5,whaler,66


Assumption / Bias: 
1. There are no human aged >= 1000
2. Adults are 18 or older
3. Only one age per cell 

**are children more likely to be attacked by sharks?**

No, Adults are more likely to be attacked.

In [35]:
df_age = clean_data()[['Case Number', 'Age']].copy()
df_age['Normal'] = df_age['Age'].loc[(df_age['Age'].str.len() < 4)]

#casting everything to float
df_age['Normal'].replace({'40s': 40, '20s': 20, '60s':60, '30s': 30, '50s': 50, r'^\s*$': np.nan, '6½': 6.5, '20?':20, '>50':50, 'X': np.nan, 'F': np.nan}, inplace=True, regex = True)
#cast_to_type(df_age, 'Normal', float)
df_age['Normal'] = df_age['Normal'].astype(float)
# exploring_data(df_age, 'Normal')

#divide data into child or adult
df_age['Age Group'] = np.select([df_age['Normal'] >=18, df_age['Normal'] <18], ['Adult', 'Child'])
Child = pd.DataFrame(df_age['Age Group'].value_counts())

display(Child[1:])




Casting to <class 'float'> was successful


Unnamed: 0,Age Group
Adult,2560
Child,833
