# Global Shark Attack Incidents Data Analysis

Using the data set in <https://www.kaggle.com/teajay/global-shark-attacks/> (Version 7)

After cleaning and analyzing the data, we want to answer the following questions:

* Attacks per year 1900+ (Total, Fatal, Non Fatal) ?


* Relation of provoked and unprovoked attacks/ fatal or non fatal (1900+)


* Relation of Man or Woman in Total attacks (1900+) 


## Import and cleaning data set

### Import libraries and data set

In [None]:
import pandas as pd
import numpy as np
import requests

In [None]:
# Load the raw CSV once as the untouched backup, then work on an independent copy
df_backup = pd.read_csv('attacks.csv', sep=',', encoding='latin-1')
df = df_backup.copy()

### Clean data frame

#### work with column names

In [None]:
# Normalise every header in one pass: trim surrounding whitespace, lower-case,
# then replace the remaining inner spaces with underscores
columns_name = [label.strip().lower().replace(' ', '_') for label in df.columns]
df.columns = columns_name

In [None]:
# Display the column labels to check the result of the renaming
df.columns

#### Work with duplicate rows / columns with missing values

In [None]:
# Remove rows that are identical across every column, then report the new size
df = df.drop_duplicates(subset=df.columns)
df.shape

In [None]:
# Count the missing values in each row (kept as a standalone Series so no
# temporary helper column has to be added to, and later removed from, df)
missing_per_row = df.isnull().sum(axis=1)
# Keep only the rows with fewer than 20 missing values. The explicit .copy()
# makes df an independent frame, so later column assignments/drops do not
# operate on a slice of the previous frame (avoids SettingWithCopyWarning).
df = df.loc[missing_per_row < 20, :].copy()
df.shape

In [None]:
# Number of missing values per column
df.isna().sum()

In [None]:
# Show the rows that actually hold a value in column 'unnamed:_22'
df.loc[df['unnamed:_22'].notnull(), :]

In [None]:
# Show the rows that actually hold a value in column 'unnamed:_23'
df.loc[df['unnamed:_23'].notnull(), :]

In [None]:
# Remove the two unnamed columns from the frame
df = df.drop(columns=['unnamed:_22', 'unnamed:_23'])

#### Work with the columns href and href_formula

In [None]:
# Count how many rows have identical values in href and href_formula
df['href'].eq(df['href_formula']).value_counts()

In [None]:
# List the rows where href and href_formula disagree
df.loc[df['href'] != df['href_formula'], ['href', 'href_formula']]

In [None]:
# In most cases one of the two columns has a link; use a function to check whether one of them holds a working link

def clean_href(lst, timeout=10):
    """
    Choose a usable link from a pair of candidate URLs.

    If both values are equal, that value is returned without any network
    access. Otherwise each candidate is requested in order with an HTTP GET
    and the first one that answers with status code 200 (OK) is returned.

    Parameters
    ----------
    lst : sequence
        Two-item sequence holding the two candidate values
        (``lst[0]`` is tried before ``lst[1]``).
    timeout : float, optional
        Seconds to wait for each HTTP request before giving up on that
        candidate. Without a timeout a single unresponsive server could
        block the whole ``apply`` indefinitely.

    Returns
    -------
    The chosen value, or ``False`` when the values differ and neither
    candidate could be fetched with status 200.
    """
    first, second = lst[0], lst[1]
    if first == second:
        return first

    for candidate in (first, second):
        # Missing values (e.g. NaN) are not strings and cannot be requested
        if not isinstance(candidate, str):
            continue
        try:
            response = requests.get(candidate, verify=True, timeout=timeout)
        except requests.RequestException:
            # A broken first link must not prevent trying the second one
            continue
        if response.status_code == 200:
            return candidate
    return False
# Pair the two link columns row by row, keeping the frame's index
href_pairs = pd.Series(list(zip(df['href'], df['href_formula'])), index=df.index)
# Let clean_href decide, for every pair, which link to keep
df['href_new'] = href_pairs.apply(clean_href)
df['href_new']

In [None]:
# The two original link columns are no longer needed once href_new exists
df = df.drop(columns=['href', 'href_formula'])
df.head()

#### Work with 'Fatal (Y/N)' column

In [None]:
# List the distinct raw values of the column before cleaning
df['fatal_(y/n)'].unique()

In [None]:
# Normalise the answers: trim surrounding whitespace, then lower-case
df['fatal_(y/n)'] = df['fatal_(y/n)'].str.strip().str.lower()
df['fatal_(y/n)'].unique()

In [None]:
# Keep only the answers 'y' and 'n'; every other response becomes a missing value
is_yes_or_no = df['fatal_(y/n)'].isin(['y', 'n'])
df['fatal_(y/n)'] = df['fatal_(y/n)'].where(is_yes_or_no)
df['fatal_(y/n)'].unique()

#### Work with type column

In [None]:
# List the distinct raw values of the column before cleaning
df['type'].unique()

In [None]:
# Normalise the labels: lower-case first, then trim surrounding whitespace
df["type"] = df["type"].str.lower().str.strip()
# Boating-related categories are counted as unprovoked attacks
boating_labels = ["boating", "sea disaster", "boat", "boatomg"]
df["type"] = df["type"].mask(df["type"].isin(boating_labels), "unprovoked")
# Anything that is neither provoked nor unprovoked becomes a missing value
df["type"] = df["type"].where(df["type"].isin(["provoked", "unprovoked"]))

df["type"].value_counts(dropna=False)

#### Work with 'Sex' column

In [None]:
# List the distinct raw values of the column before cleaning
df['sex'].unique()

In [None]:
# Normalise the values: trim surrounding whitespace, then lower-case
df['sex'] = df['sex'].str.strip().str.lower()
df['sex'].unique()

In [None]:
# Keep only 'f' and 'm'; every other response becomes a missing value
is_f_or_m = df['sex'].isin(['f', 'm'])
df['sex'] = df['sex'].where(is_f_or_m)
df['sex'].unique()

## Answer the questions

### Attacks per year 1900+ (Total, Fatal, Non Fatal) ?

#### Create a data frame with the values to answer the question

In [None]:
# Keep only the columns needed for this question and discard rows where
# either of them is missing
df_answer_1 = df[['year', 'fatal_(y/n)']].dropna()
# Restrict to 1900 onwards; .copy() gives an independent frame so the column
# assignments below do not act on a slice of the previous frame
df_answer_1 = df_answer_1.loc[df_answer_1['year'] >= 1900, :].copy()
# Boolean flag columns, one per outcome, so that summing them counts the attacks
df_answer_1['fatals'] = df_answer_1['fatal_(y/n)'] == 'y'
df_answer_1['non_fatals'] = df_answer_1['fatal_(y/n)'] == 'n'
# The original text column is no longer needed
df_answer_1 = df_answer_1.drop(columns='fatal_(y/n)')
# Count fatal and non-fatal attacks per year
df_answer_1 = df_answer_1.groupby(by='year', as_index=False).sum()
# Total attacks per year is the sum of both outcomes
df_answer_1['Total'] = df_answer_1['fatals'] + df_answer_1['non_fatals']
# Convert only the year column to int (assigning the whole converted
# DataFrame to a single column, as before, is an error)
df_answer_1['year'] = df_answer_1['year'].astype(int)

#### Table of results

In [None]:
# Allow up to 119 rows to be displayed so the per-year table is not truncated
pd.options.display.max_rows = 119
df_answer_1

#### Plot results

In [None]:
# Plot the fatal and non-fatal counts against the year
df_answer_1.plot(x ='year', y=['fatals','non_fatals'])

### Relation of provoked and unprovoked attacks/ fatal or non fatal (1900+)

#### Create a data frame with what we need to answer the question

In [None]:
# Keep only the columns needed for this question and discard rows where
# any of them is missing
df_answer_2 = df[['year', 'fatal_(y/n)', 'type']].dropna()
# Restrict to 1900 onwards
df_answer_2 = df_answer_2.loc[df_answer_2['year'] >= 1900, :]
# Elementary conditions, combined below into the four outcome/type categories
is_fatal = df_answer_2['fatal_(y/n)'] == 'y'
is_non_fatal = df_answer_2['fatal_(y/n)'] == 'n'
is_provoked = df_answer_2['type'] == 'provoked'
is_unprovoked = df_answer_2['type'] == 'unprovoked'
# One boolean column per category, so that summing a column counts its attacks
df_answer_2['fatals_provoked'] = is_fatal & is_provoked
df_answer_2['fatals_unprovoked'] = is_fatal & is_unprovoked
df_answer_2['non_fatals_provoked'] = is_non_fatal & is_provoked
df_answer_2['non_fatals_unprovoked'] = is_non_fatal & is_unprovoked
# Only the four category columns are kept
df_answer_2 = df_answer_2.drop(columns=['fatal_(y/n)', 'year', 'type'])

#### Answer with the values 

In [None]:
# The answer: number of attacks per category (column-wise sum of the boolean
# category columns of df_answer_2), sorted from smallest to largest.
# `total` has to be computed here; it was previously read before being defined.
total = df_answer_2.sum().sort_values()
total

In [None]:
# Express each count as a percentage of the overall sum, smallest first
total_perc = (100 * total / total.sum()).sort_values()
total_perc

#### Plot the answer

In [None]:
# Bar chart of the values held in `total`
total.plot(kind='bar')

### Relation of Man or Woman in Total attacks (1900+)

#### Create a data frame with what we need to answer the question

In [None]:
# Keep only the columns needed for this question and discard rows where
# either of them is missing
df_answer_3 = df[['year', 'sex']].dropna()
# Restrict to 1900 onwards and keep only the sex column (year is not needed afterwards)
df_answer_3 = df_answer_3.loc[df_answer_3['year'] >= 1900, ['sex']]

#### Answer with the values 

In [None]:
# Count the occurrences of each value in the sex column
total = df_answer_3['sex'].value_counts()
total

In [None]:
# Express each count as a percentage of the overall sum, smallest first
total_perc = (100 * total / total.sum()).sort_values()
total_perc

#### Plot the answer

In [None]:
# Bar chart of the values held in `total`
total.plot(kind='bar')