**Importing libraries**

In [None]:
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

**DataFrame**

In [None]:
# Location of the raw CSV. It can be overridden with the ATTACKS_CSV environment
# variable so the notebook is not tied to one machine's directory layout; when the
# variable is not set, the path below is used.
attacks_csv_path = os.environ.get('ATTACKS_CSV', '/Users/victor/maria/ironhack_labs/attacks.csv')
# Read the file, decoding it as cp1252
attacks = pd.read_csv(attacks_csv_path, encoding='cp1252')
# Work on a copy so the frame as loaded stays available
attacks_df = attacks.copy()

In [None]:
# Getting some basic info
attacks_df.info()

In [None]:
# Drop the rows in which every column is empty
attacks_df = attacks_df.dropna(axis=0, how='all')
# Normalise the column titles. Whitespace is stripped BEFORE capitalising:
# str.capitalize() upper-cases only the very first character, so a title with a
# leading blank would otherwise end up entirely in lower case.
attacks_df.columns = attacks_df.columns.str.strip().str.capitalize()
# Shorter name for the fatality flag
attacks_df = attacks_df.rename(columns={'Fatal (y/n)': 'Fatal'})
# Drop the rows (not columns) whose case number is the literal '0' or 'xx'
is_placeholder = attacks_df['Case number'].isin(['0', 'xx'])
attacks_df = attacks_df.drop(index=attacks_df.loc[is_placeholder].index)
# Cap how much of a frame gets rendered: at most 30 rows and 24 columns
pd.set_option('display.max_rows', 30)
pd.set_option('display.max_columns', 24)

In [None]:
# Look for a column that could serve as a row identifier: one line is printed
# per candidate column, True meaning that none of its values repeats
for candidate in ('Case number.1', 'Original order', 'Href', 'Pdf'):
    print(attacks_df[candidate].is_unique)

In [None]:
# Before a new ID is set, make sure no row is repeated in full
is_repeated = attacks_df.duplicated()
duplicateRows = attacks_df[is_repeated]
print(duplicateRows)

In [None]:
# Columns left out of the analysis, each with the reason for discarding it
columns_to_discard = [
    'Case number',             # not unique, and gives no relevant information
    'Area',                    # categorical with many different options; Country is used instead
    'Name',                    # gives no relevant information about the attack
    'Time',                    # incomplete variable
    'Investigator or source',  # gives no relevant information about the attack
    'Pdf',                     # gives no relevant information
    'Href formula',            # gives no relevant information
    'Href',                    # gives no relevant information
    'Case number.2',           # repeated column
    'Original order',          # could help to derive the date (like Case number), but Date and Year are used instead
    'Unnamed: 22',             # empty column
    'Unnamed: 23',             # empty column
]
attacks_df = attacks_df.drop(columns=columns_to_discard)

In [None]:
# New row identifier: consecutive integers starting at 1, labelled 'Attacks_ID'.
# len(attacks_df) is the row count; there is no need to build attacks_df.values
# (a full array of the data) just to measure it.
attacks_df.index = pd.RangeIndex(start=1, stop=len(attacks_df) + 1, name='Attacks_ID')

In [None]:
attacks_df.info()

# Demographics

In [None]:
# Checking the status of the demographic columns
attacks_df[['Sex','Age']]

## Age

In [None]:
attacks_df.Age.describe()

In [None]:
attacks_df.Age.value_counts(dropna=False)

We have 640 valid values in Age; let's try to raise that number a little.

In [None]:
attacks_df.Age.unique()

**If we assume:**
- Teenagers are between 13 and 19; so the mean will be *16*
- Young people are between 20 and 30; so the mean will be *25*
- Adults are from 31 to 65; so the mean is *48*
- Elderly are from 66 to 100+; the mean is around *88*

In [None]:
# Cleaning Age column in order to get valuable data.
# Every rule is a regular expression and the rules are applied in the order given,
# so each call passes regex=True explicitly (with regex=False, str.replace treats
# the pattern as plain text and none of these rules would match anything).
# Unusable values are marked with the text 'NaN'.
AGE_RULES = [
    (r'(^.*een.*$)', '16'),         # anything containing "een" (teen, Teens, ...)
    (r'(^.*oun.*$)', '25'),         # anything containing "oun" (young, ...)
    (r'(^.*dul.*$)', '48'),         # anything containing "dul" (adult, ...)
    (r'(^.*lderl.*$)', '88'),       # anything containing "lderl" (elderly, ...)
    (r'(^.*middle-a.*$)', '37'),    # anything containing "middle-a"
    (r'(^.*onth.*$)', '1'),         # ages expressed in months
    (r'(.*)s.*', r'\1'),            # cut from the last "s" on ("20s" -> "20")
    (r'^\s$', 'NaN'),               # a single whitespace character
    (r'nan', 'NaN'),                # 'nan' is what astype(str) produces for a missing value
    (r'(.*)\s&.*', r'\1'),          # "a & b"  -> "a"
    (r'(.*)\sor.*', r'\1'),         # "a or b" -> "a"
    (r'(.*)\sto.*', r'\1'),         # "a to b" -> "a"
    (r'^\s(\d\d)', r'\1'),          # drop one leading blank in front of two digits
    (r'^(\d\d).*', r'\1'),          # starts with two digits: keep just those
    (r'^\D*(\d\d)', r'\1'),         # drop non-digits in front of two digits
    (r'^\xa0$', 'NaN'),             # a lone non-breaking space
    (r'(\d)\s\s\s.*', r'\1'),       # digit followed by three blanks: drop the blanks and the rest
    (r'^\s\s.*', 'NaN'),            # starts with two blanks
    (r'^\D{1,}$', 'NaN'),           # no digit at all
    # Half years are truncated to whole years (2½ is two years old, not 25)
    (r'^2½$', '2'),
    (r'^6½$', '6'),
]

age_text = attacks_df['Age'].astype(str)
for pattern, replacement in AGE_RULES:
    age_text = age_text.str.replace(pattern, replacement, regex=True)
# Assign the result back: the cleaned values only exist in age_text until this line
attacks_df['Age'] = age_text

In [None]:
# Checking we have the expected output
attacks_df.Age.unique()

In [None]:
attacks_df['Age'].describe()

In [None]:
attacks_df.Age.value_counts(dropna=False)

In [None]:
attacks_df.Age.value_counts(normalize=True).head(15)

In [None]:
# Basic statistics of Age.
# The column is converted to numbers first; whatever cannot be parsed becomes NaN.
attacks_df['Age'] = pd.to_numeric(attacks_df['Age'], errors='coerce')
age_values = attacks_df['Age']
age_mean = age_values.mean()
age_median = age_values.median()
age_mode = age_values.mode()  # a Series: there can be more than one mode
print('Age mean is: ', age_mean)
print('Age median is: ', age_median)
print('Age mode is: ', age_mode)

## Sex

In [None]:
attacks_df.Sex.describe()

In [None]:
attacks_df.Sex.value_counts(dropna=False)

In [None]:
# Checking the current column status
attacks_df.Sex.unique()

In [None]:
# Cleaning Sex column in order to get valuable data
# Remove surrounding whitespace so that e.g. 'M ' becomes 'M'
sex_clean = attacks_df['Sex'].str.strip()
# 'lli', 'N' and '.' are not valid values: turn them into real missing values (NaN),
# which value_counts(normalize=True) leaves out of its proportions
INVALID_SEX_VALUES = ['lli', 'N', '.']
attacks_df['Sex'] = sex_clean.mask(sex_clean.isin(INVALID_SEX_VALUES))

In [None]:
# Checking we have the expected output
attacks_df.Sex.unique()

In [None]:
attacks_df.Sex.value_counts(normalize=True)

In [1]:
# Age distribution of the victims, one colour per sex.
# Both variables are taken from the same frame so every age is paired with the
# sex recorded in the same row.
plot_ages = sns.displot(data=attacks_df, x='Age', hue='Sex', bins=30)
# displot returns a FacetGrid, which has no set_title(); title and axis labels
# are set through FacetGrid.set(). The y axis of a histogram is a count.
plot_ages.set(title='Shark attacks distribution by age and distinguished by sex',
              xlabel='Age of the victims',
              ylabel='Number of attacks');

NameError: name 'attacks_df' is not defined

> <span style="color:green"> Top model 1: __Demographics__ </span>

>_Procedure_
>- General dataframe cleaning (empty rows, non-valuable columns, rename columns, setting an autoincremental ID)
>    - Finally we have 6302 informative cases to work with
>- Cleaning Age series (from 3471 filled rows distribution in 157 unique values, to non-empty cells, normalizing the values to 80 unique values)
>- Cleaning Sex series (from 5737 filled rows distributed in 6 unique values, to non-empty cells in two unique values)
 
>_Conclusions_
>- Most victims of shark attacks were teenagers or young adults, around 22 years old, in both sexes.
>- Even if it seems that men are most likely to be attacked by sharks (88%), it's important to keep in mind that with this data it is not possible to know if, for example, women come less close to the beach or if they are more cautious


# Space-time Statistics

In [None]:
attacks_df[['Date','Location', 'Country']]

## Date

In [None]:
# Checking the current column status
attacks_df.Date.describe()

In [None]:
attacks_df.Date.value_counts(dropna=False)

In [None]:
attacks_df.Date.sort_values(ascending=False)

In [None]:
# Collapse repeated hyphens into a single one ("--" -> "-")
date_text = attacks_df['Date'].str.replace(r'(-)\1+', r'\1', regex=True)
# Remove surrounding blanks and a leading word "Reported". A prefix pattern is
# needed here: str.lstrip('Reported ') would strip any leading run of the
# characters R, e, p, o, r, t, d and blank, not the word itself.
date_text = date_text.str.strip().str.replace(r'(?i)^reported\s+', '', regex=True)
# Parse the text as datetime; entries that cannot be parsed become NaT
attacks_df['Date'] = pd.to_datetime(date_text, errors='coerce')
# Discard the rows that are left without a date
attacks_df = attacks_df.dropna(subset=['Date'])

In [None]:
# Checking we have the expected output
attacks_df['Date'].dt.month

In [None]:
attacks_df['Date'].isna().sum()

In [None]:
# Month and year of every attack, taken from the parsed Date column
attack_dates = attacks_df['Date'].dt
attacks_df['Months'] = attack_dates.month
attacks_df['Years'] = attack_dates.year

In [None]:
attacks_df.Years.value_counts(dropna=False).head(30)

## Location

In [None]:
attacks_df.Location.describe()

In [None]:
attacks_df.Location.value_counts(dropna=False)

In [None]:
# There are too many unique locations to work with, so each one is reduced to the
# text after its last comma (the first group is greedy, so "Beach" takes everything
# before that comma). Locations without a comma become NaN.
new_Location = attacks_df['Location'].str.extract(r'(?P<Beach>.+),(?P<County>.+)')
# Strip the blank that follows the comma, so that equal names are counted together
attacks_df['Location'] = new_Location['County'].str.strip()

In [None]:
# The Location column is not useful for visual inspection:
# with 807 categories no conclusion can be drawn from a chart, unless a world heatmap is used
attacks_df.Location.describe()

In [None]:
# The locations with a higher shark attacks register are: 
attacks_df.Location.value_counts(dropna=False).head(11)

> Top model 2: __Space-time statistics__

>_Procedure_
>- Parse date colum to datetime format, allowing us to study year and month information independently
>- Cleaning Location series in order to get a clustered sample
 
>_Conclusions_
>- Attacks are mostly registered in Florida, Hawaii, South Africa and Australia
>- Most shark attacks happened in January and in July–August, probably driven by summertime in the two hemispheres
>- Shark attack records have been gradually increasing until our days, highlighting a peak in the 1960s

>_(These data are not normalized, so they should be interpreted carefully: the number of records has probably increased in recent years thanks to better technology)_

In [None]:
# Number of attacks per calendar month
month_colours = ['#432371', "#FAAE7B"]
plot_months = sns.countplot(x=attacks_df['Months'], palette=month_colours)
plot_months.set_xlabel('Months')
plot_months.set_ylabel('Attacks');

In [None]:
# Number of registered attacks per year
plot_years = sns.countplot(x=attacks_df['Years'], palette='Blues')
plot_years.set_xlabel('Years')
plot_years.set_ylabel('Attacks registered');

# Free project

## Country

In [None]:
attacks_df.Country.describe()

In [None]:
attacks_df.Country.unique()

In [None]:
# Normalise the country labels. Missing countries stay as real missing values (NaN).
# The patterns are regular expressions applied in order, hence regex=True.
COUNTRY_PATTERNS = [
    (r'^(.*)\s/.*', r'\1'),                          # "A / B" -> "A"
    (r'^(.*?)\s*\?$', r'\1'),                        # drop a trailing question mark ("X?" -> "X")
    (r'^Between\s+(.+?)\s+(?:&|and)\s.*$', r'\1'),   # "Between A & B" -> "A"
    # Each of the two rules below replaces the WHOLE value, not only the part up to the keyword
    (r'(?i)^.*british.*$', 'UNITED KINGDOM'),
    # Anchored on "TURKS" so that "TURKEY" is left untouched; the Turks and Caicos
    # Islands are a separate territory, so they get their own label
    (r'^.*TURKS.*$', 'TURKS AND CAICOS'),
]
# Exact labels that are renamed
COUNTRY_ALIASES = {
    'PALESTINIAN TERRITORIES': 'PALESTINA',
    'UNITED ARAB EMIRATES (UAE)': 'UNITED ARAB EMIRATES',
    'GULF OF ADEN': 'SOMALIA',
    'ST. MAARTIN': 'NETHERLANDS',
    'CEYLON (SRI LANKA)': 'SRI LANKA',
}

country = attacks_df['Country'].str.strip()
for pattern, replacement in COUNTRY_PATTERNS:
    country = country.str.replace(pattern, replacement, regex=True)
# A dict maps each old label to its new one
attacks_df['Country'] = country.replace(COUNTRY_ALIASES)


In [None]:
attacks_df.Country.describe()

In [None]:
attacks_df.Country.value_counts(dropna=False).head(11)

## Type

In [None]:
attacks_df.Type.value_counts()

In [None]:
# Labels that do not describe a usable attack type are turned into real missing
# values (NaN); an empty label counts as missing too.
INVALID_TYPES = ['', 'Boat', 'Boatomg', 'Boating', 'Questionable', 'Sea Disaster', 'Invalid']
attack_type = attacks_df['Type'].str.strip()
# NOTE(review): the rows themselves are kept, only their Type is blanked. If boating
# incidents should be excluded from the dataset altogether, filter the rows with
# attack_type.isin(...) instead — confirm which one is wanted.
attacks_df['Type'] = attack_type.mask(attack_type.isin(INVALID_TYPES))

In [None]:
attacks_df.Type.value_counts(normalize = True)

In [None]:
sns.displot(x = attacks_df.Type);

## Activity

In [None]:
attacks_df.Activity.value_counts()

In [None]:
# Group the free-text activities into a few categories.
# Each rule is a regular expression (hence regex=True) that replaces the WHOLE value
# with its category; rules are applied in order, so the first one that matches wins.
ACTIVITY_RULES = [
    (r'^.*[dD]iving.*$', 'Diving'),
    (r'^.*oogie.*$', 'Body boarding'),
    (r'^.*Walking.*$', 'Standing'),
    (r'^.*wimming.*$', 'Swimming'),
    (r'^.*[bB]athing.*$', 'Swimming'),
    (r'^.*[fF]ish.*$', 'Sailing'),
    (r'^.*sail.*$', 'Sailing'),
    (r'^.*surf.*$', 'Surfing'),
    (r'^.*Treading.*$', 'Standing'),
    (r'^.*Surf skiing.*$', 'Surf-skiing'),
]

activity_raw = attacks_df['Activity']
activity = activity_raw.astype(str)
for pattern, category in ACTIVITY_RULES:
    activity = activity.str.replace(pattern, category, regex=True)

# Values that still contain a run of 13 word characters are treated as missing.
# NOTE(review): the whole value is blanked so that no fragment of the word is left
# behind — confirm that this is the intended meaning of the 13-character rule.
has_long_word = activity.str.contains(r'\w{13}', regex=True)
# Keep real missing values (NaN) for what was missing from the start or is unusable
attacks_df['Activity'] = activity.where(activity_raw.notna() & ~has_long_word)

In [None]:
attacks_df.Activity.value_counts(dropna=False).head(15)

## Injury

In [None]:
attacks_df['Injury'].describe() 

In [None]:
attacks_df.Injury.value_counts().head(15)

In [None]:
# Reduce the free-text injury descriptions to three categories: FATAL, Bitten, No injury.
# Each rule is a regular expression (hence regex=True) that replaces the whole value;
# rules are applied in order.
INJURY_RULES = [
    (r'^.*FATAL.*$', 'FATAL'),
    (r'^.*Fatal.*$', 'FATAL'),
    (r'^.*bitten.*$', 'Bitten'),
    (r'^.*No injury.*$', 'No injury'),
    (r'^.*acerat.*$', 'Bitten'),
    # "No injury" itself contains "njur": the lookahead keeps that label from
    # being turned into "Bitten" by this rule
    (r'^(?!No injury$).*njur.*$', 'Bitten'),
    (r'^.*evere.*$', 'Bitten'),
    (r'^.*wound.*$', 'Bitten'),
    (r'^.*brasio.*$', 'Bitten'),
]

injury_raw = attacks_df['Injury']
injury = injury_raw.astype(str)
for pattern, category in INJURY_RULES:
    injury = injury.str.replace(pattern, category, regex=True)

# Treated as missing: 'No details' and any value made of exactly 13 non-digit characters.
# NOTE(review): the reason for the 13-character rule is not visible here — confirm it.
is_unusable = injury.str.contains(r'^\D{13}$', regex=True) | injury.eq('No details')
# Keep real missing values (NaN) for what was missing from the start or is unusable
attacks_df['Injury'] = injury.where(injury_raw.notna() & ~is_unusable)

In [None]:
attacks_df['Injury'].describe() 

## Fatal

In [None]:
attacks_df['Fatal'].describe() 

In [None]:
attacks_df['Fatal'].unique()

In [None]:
# Normalise the fatality flag to 'Y' / 'N'.
# Surrounding blanks and letter case are fixed first (' N', 'N ' -> 'N'; 'y' -> 'Y'),
# so rows holding a valid answer are kept.
fatal_flag = attacks_df['Fatal'].str.strip().str.upper()
# Anything that is not Y or N ('M', 'UNKNOWN', '2017', ...) is not a valid flag and
# becomes a real missing value (NaN), which value_counts(normalize=True) leaves out
# of its proportions. The rows are kept; only the flag is blanked.
attacks_df['Fatal'] = fatal_flag.where(fatal_flag.isin(['Y', 'N']))


In [None]:
attacks_df['Fatal'].value_counts() 

In [None]:
attacks_df.Fatal.unique()

In [None]:
attacks_df['Fatal'].value_counts(normalize = True) 

In [None]:
sns.displot(x = attacks_df.Fatal);

In [None]:
attacks_df.Fatal.value_counts(normalize=True)
# 24% of shark attacks are fatal

In [None]:
attacks_df.Species.describe()

In [None]:
# Columns that are not kept in the final dataset
columns_not_exported = ['Date', 'Year', 'Location', 'Species', 'Case number.1']
attacks_df = attacks_df.drop(columns=columns_not_exported)

In [None]:
# Saving clean dataframe
# NOTE(review): to_pickle writes pickle data, so despite its ".csv" suffix this file
# has to be read back with pd.read_pickle. Confirm whether to_csv (or a ".pkl" name)
# was intended before changing it: anything that reads this file depends on both
# the name and the format.
attacks_df.to_pickle("sharks_df.csv")