In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import regex

In [None]:
shark_attack= pd.read_csv('/Users/maria.perensin/Documents/Data Analytics Bootcamp/Week 7/Projeto/attacks.csv', encoding = 'cp1251')
shark_attack.head()

In [None]:
shark_attack.isna().sum(axis=0).sort_values(ascending=False).head()
shark_attack_cleaned=shark_attack.drop(['pdf', 'href', 'href formula', 'Unnamed: 22', 'Unnamed: 23', 'original order','Investigator or Source', 'Time', 'Name'], axis=1)

After checking empty/useless columns (for this analysis), I decided to drop a few of them as per code above. Below, I'll be droping empty/ useless rows.

In [None]:
shark_attack_cleaned = shark_attack_cleaned.drop_duplicates()

In [None]:
shark_attack_cleaned = shark_attack_cleaned.drop(25722)
shark_attack_cleaned = shark_attack_cleaned.drop(8702)
shark_attack_cleaned = shark_attack_cleaned.drop(6302)

In [None]:
shark_attack_cleaned.shape

Now, 'Country' and 'Activity' columns will be standardized by the function below. Afterwards, column 'CONTRY_CLEANED' and 'ACTIVITY_CLEANED' will be placed in the beggining of the table.

In [None]:
def clean_country(value):
    pattern= '[^A-Za-z]'
    pattern1 = ' '
    try:
        value=value.lstrip().rstrip()
        value=value.replace(pattern, '_')
        value=value.replace(pattern1, '_')
        value= value.replace(',', '_')
        return value.upper()

    except:
        return np.nan

In [None]:
shark_attack_cleaned['COUNTRY_CLEANED'] = shark_attack_cleaned['Country'].map(clean_country)

In [None]:
shark_attack_cleaned['ACTIVITY_CLEANED'] = shark_attack_cleaned['Activity'].map(clean_country)

In [None]:
first_column = shark_attack_cleaned.pop('COUNTRY_CLEANED')
shark_attack_cleaned.insert(0, 'COUNTRY_CLEANED', first_column)

In [None]:
second_column = shark_attack_cleaned.pop('ACTIVITY_CLEANED')
shark_attack_cleaned.insert(1, 'ACTIVITY_CLEANED', second_column)

Now all cells that contain the words 'surf' or 'swim' will be replaced by the words 'surfing' or 'swiming'. All the other rows will be deleted as they are not interesting to this analysis.

In [None]:
def clean_activity_type(activity):
    pattern1 = 'SURF'
    pattern2 = 'SWIM'
    try:
        if pattern1 in activity:
            return 'SURFING'
        elif pattern2 in activity:
            return 'SWIMING'
        else:
            return np.nan
    except:
        return np.nan

In [None]:
shark_attack_cleaned['ACTIVITY_CLEANED'] = shark_attack_cleaned['ACTIVITY_CLEANED'].map(clean_activity_type)

In [None]:
shark_attack_cleaned = shark_attack_cleaned.dropna(subset = ['ACTIVITY_CLEANED'])

Now, I'll review what country had the highest number of fatal cases.

In order to achive that, a mask will be used:

fatal_or_not - in which the code accepts 'Yes' only, described by 'Y' in the main source.

In [None]:
fatal_or_not = shark_attack_cleaned['Fatal (Y/N)']== 'Y'
activity_per_country = shark_attack_cleaned[fatal_or_not].groupby(['ACTIVITY_CLEANED','COUNTRY_CLEANED']).count()

In [None]:
pd.set_option('display.max_rows',None)
activity_per_country = activity_per_country[['Fatal (Y/N)']].sort_values(ascending=False, by=['ACTIVITY_CLEANED','Fatal (Y/N)'])

In [None]:
activity_per_country.head(100)

In [None]:
dict_activity_per_country_swim = pd.DataFrame({'COUNTRY': ['AUSTRALIA', 'USA','SOUTH_AFRICA','BRAZIL'], 'ACTIVITIES': [82, 65, 45, 21]})
dict_activity_per_country_surf = pd.DataFrame({'COUNTRY': ['AUSTRALIA','SOUTH_AFRICA','USA','BRAZIL'], 'ACTIVITIES': [24, 13, 11, 4]})

In [None]:
swim_chart = dict_activity_per_country_swim.plot.bar(x='COUNTRY', y='ACTIVITIES', rot=0)
surf_chart = dict_activity_per_country_surf.plot.bar(x='COUNTRY', y='ACTIVITIES', rot=0)

Based on this analysis, we can assume that, in total, the higher number of fatal accidents happened in Australia.

Below, we are going to analyse when each attack (fatal or not) happened anytime after 1990.

In [None]:
mask_australia_years = (shark_attack_cleaned['COUNTRY_CLEANED'] == 'AUSTRALIA') & (shark_attack_cleaned['Year']>=1990.0)

australia_activity_per_year = shark_attack_cleaned[mask_australia_years].groupby(['ACTIVITY_CLEANED', 'Year']).count()

In [None]:
australia_activity_per_year = australia_activity_per_year[['COUNTRY_CLEANED']].sort_values(ascending=False, by=['ACTIVITY_CLEANED','COUNTRY_CLEANED'])

In [None]:
australia_activity_per_year.head(30)

In [None]:
shark_attack_cleaned[mask_australia_years].groupby('Area')['Area'].count().sort_values(ascending= False)

In [None]:
mask_south_wales = shark_attack_cleaned['Area'] == 'New South Wales'

We now know that New South Wales is the place where attacks happen the most in Australia - now it's time to choose what beach sould have a shark net installed. So we'll be looking for bay beaches.

In [None]:
shark_attack_cleaned[mask_south_wales].groupby('Location')['Location'].count().sort_values(ascending = False)

In [None]:
def is_bay(place):
    pattern = 'BAY'
    try:
        if pattern in place.upper():
            return place
        else:
            return 'Not in a bay' 
    except:
        return 'Not in a bay' 

In [101]:
shark_attack_cleaned['LOCATION_CLEANED'] = shark_attack_cleaned['Location'].map(is_bay)

In [102]:
mask_in_bay = shark_attack_cleaned['LOCATION_CLEANED'] != 'Not in a bay'

In [103]:
shark_attack_cleaned['LOCATION_CLEANED'].nunique()

167

We know that 167 locations are in bay areas. 

In [104]:
bay_decision = shark_attack_cleaned[['LOCATION_CLEANED']].groupby('LOCATION_CLEANED')['LOCATION_CLEANED'].count()

In [105]:
bay_decision_australia = shark_attack_cleaned[mask_south_wales][['LOCATION_CLEANED']]

In [108]:
shark_attack_cleaned[mask_in_bay & mask_australia].groupby('LOCATION_CLEANED')['LOCATION_CLEANED'].count()

LOCATION_CLEANED
Alma Bay, Magnetic Island, Townsville                                   1
Arakoon's Little Bay                                                    1
Bateman's Bay                                                           1
Belongil Beach, Byron Bay                                               1
Between Bay Rock & Magnetic Island, Cleveland Bay                       1
Binalong Bay                                                            1
Blue Bay                                                                1
Boarding School Bay                                                     1
Byron Bay                                                               2
Byron Bay                                                               2
Cheviot Beach, Portsea, Port Phillip Bay                                1
Chimney Point, George’s Bay                                             1
Clarkes Beach, Byron Bay                                                1
Clarks Beach, Byron B

The location name will now be standardized by the function below.

In [109]:
def clean_location(value):
    pattern= '[^A-Za-z]'
    pattern1 = ' '
    try:
        value=value.lstrip().rstrip()
        value=value.replace(pattern, '_')
        value=value.replace(pattern1, '_')
        value= value.replace(',', '_')
        return value.upper()

    except:
        return np.nan

In [110]:
shark_attack_cleaned['LOCATION_CLEANED'] = shark_attack_cleaned['LOCATION_CLEANED'].map(clean_location)

In [111]:
shark_attack_cleaned[mask_in_bay & mask_australia].groupby('LOCATION_CLEANED')['LOCATION_CLEANED'].count()

LOCATION_CLEANED
ALMA_BAY__MAGNETIC_ISLAND__TOWNSVILLE                                   1
ARAKOON'S_LITTLE_BAY                                                    1
BATEMAN'S_BAY                                                           1
BELONGIL_BEACH__BYRON_BAY                                               1
BETWEEN_BAY_ROCK_&_MAGNETIC_ISLAND__CLEVELAND_BAY                       1
BINALONG_BAY                                                            1
BLUE_BAY                                                                1
BOARDING_SCHOOL_BAY                                                     1
BYRON_BAY                                                               4
CHEVIOT_BEACH__PORTSEA__PORT_PHILLIP_BAY                                1
CHIMNEY_POINT__GEORGE’S_BAY                                             1
CLARKES_BEACH__BYRON_BAY                                                1
CLARKS_BEACH__BYRON_BAY                                                 1
COFFIN_BAY           

After standardizing location names and analysing the results, I could noticed the word 'Byron Bay' is listed more frequently than any other location. Based on that, I decided to confirm my visual assumption by tracking results containing 'Byron Bay'.

In [115]:
def is_byron_bay(value):
    try:
        if 'BYRON_BAY' in value:
            return 'BYRON_BAY'
        else:
            return value
    except:
        return value

In [116]:
shark_attack_cleaned['LOCATION_CLEANED'] = shark_attack_cleaned['LOCATION_CLEANED'].map(is_byron_bay)

In [122]:
shark_attack_cleaned[mask_in_bay & mask_australia].groupby('LOCATION_CLEANED')['LOCATION_CLEANED'].count()

LOCATION_CLEANED
ALMA_BAY__MAGNETIC_ISLAND__TOWNSVILLE                                    1
ARAKOON'S_LITTLE_BAY                                                     1
BATEMAN'S_BAY                                                            1
BETWEEN_BAY_ROCK_&_MAGNETIC_ISLAND__CLEVELAND_BAY                        1
BINALONG_BAY                                                             1
BLUE_BAY                                                                 1
BOARDING_SCHOOL_BAY                                                      1
BYRON_BAY                                                               12
CHEVIOT_BEACH__PORTSEA__PORT_PHILLIP_BAY                                 1
CHIMNEY_POINT__GEORGE’S_BAY                                              1
COFFIN_BAY                                                               1
COOGEE_BAY__NEAR_SYDNEY                                                  1
CORIO_BAY__PORT_PHILLIP                                                  1
COWARAMU

By far, 'Byron Bay' seems to be the place with the highest number of shark attacks in Australia (the country with the highest rate of fatal shark attacks). That being said, this was the place chosen by my 'company' to install protection nets.