## 1. Exploring the Dataset

In [297]:
import pandas as pd
import numpy as np

In [305]:
def get_data(
    file_path='/Users/mairagutierrez/Documents/Ironhack/PROJECTS/project--I/data/attacks.csv',
    encoding='latin1',
):
    """Load the attacks CSV into a DataFrame.

    Parameters
    ----------
    file_path : str or path-like, optional
        Location of the CSV file. The default is the original author's
        absolute path, kept only so that a bare ``get_data()`` call keeps
        working; pass a path explicitly on any other machine.
    encoding : str, optional
        Text encoding handed to ``pd.read_csv``. Defaults to ``'latin1'``,
        the encoding this notebook has always used for the file.

    Returns
    -------
    pandas.DataFrame
        The file contents as parsed by ``pd.read_csv``.
    """
    return pd.read_csv(file_path, encoding=encoding)

# Load the raw dataset once; the cells below read from `data`.
data = get_data()
# data.head(5)

In [226]:
# Initial shape of the raw dataset as (rows, columns).
data.shape

(25723, 24)

In [227]:
# Data type pandas inferred for each column of the raw dataset.
data.dtypes

Case Number                object
Date                       object
Year                      float64
Type                       object
Country                    object
Area                       object
Location                   object
Activity                   object
Name                       object
Sex                        object
Age                        object
Injury                     object
Fatal (Y/N)                object
Time                       object
Species                    object
Investigator or Source     object
pdf                        object
href formula               object
href                       object
Case Number.1              object
Case Number.2              object
original order            float64
Unnamed: 22                object
Unnamed: 23                object
dtype: object

In [228]:
# List the column names to help formulate research questions.
# NOTE: in the recorded output 'Sex ' and 'Species ' carry a trailing space.
data.columns

Index(['Case Number', 'Date', 'Year', 'Type', 'Country', 'Area', 'Location',
       'Activity', 'Name', 'Sex ', 'Age', 'Injury', 'Fatal (Y/N)', 'Time',
       'Species ', 'Investigator or Source', 'pdf', 'href formula', 'href',
       'Case Number.1', 'Case Number.2', 'original order', 'Unnamed: 22',
       'Unnamed: 23'],
      dtype='object')

## 2. Pre-processing

In [306]:
# Evaluate the missing values in each column to guide the initial cleaning. Results are sorted in descending order.

def inspect_missing_values(data):
    """Summarise the columns of ``data`` that contain missing values.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per column that has at least one null, with the columns
        "Total values missing" (null count) and "Percentage" (null count as
        a percentage of the number of rows), sorted by the count, largest first.
    """
    null_counts = data.isnull().sum()
    summary = pd.DataFrame(
        {
            "Total values missing": null_counts,
            "Percentage": (null_counts / len(data)) * 100,
        }
    )

    # Keep only the columns that actually have something missing.
    has_missing = summary["Total values missing"] > 0
    return summary[has_missing].sort_values(by="Total values missing", ascending=False)

# Summarise which columns of the raw dataset contain missing values, then display the table.
missing_data = inspect_missing_values(data)
missing_data

Unnamed: 0,Total values missing,Percentage
Unnamed: 22,25722,99.996112
Unnamed: 23,25721,99.992225
Time,22775,88.539439
Species,22259,86.533453
Age,22252,86.50624
Sex,19986,77.697003
Activity,19965,77.615364
Location,19961,77.599813
Fatal (Y/N),19960,77.595926
Area,19876,77.26937


In [307]:
def clean_data(data, columns_to_drop=None):
    """Drop unwanted columns, then drop rows that are empty in every remaining column.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw dataset. It is not modified; a new frame is returned.
    columns_to_drop : iterable of str, optional
        Column labels to remove. When omitted, the notebook's original list is
        used: the almost entirely empty columns plus those not relevant to the
        research questions. Labels must match the header exactly, which is why
        'Species ' keeps its trailing space.

    Returns
    -------
    pandas.DataFrame
        ``data`` without the dropped columns and without rows whose remaining
        values are all NaN.

    Raises
    ------
    KeyError
        If any label in ``columns_to_drop`` is not a column of ``data``.
    """
    if columns_to_drop is None:
        columns_to_drop = [
            'Unnamed: 22', 'Unnamed: 23', 'Time', 'Species ', 'Age', 'Name',
            'Investigator or Source', 'href formula', 'pdf', 'href',
            'original order', 'Case Number.1', 'Case Number.2', 'Case Number',
        ]

    # `columns=` already selects the axis, so no separate `axis` argument is needed.
    trimmed = data.drop(columns=list(columns_to_drop))

    # Drop only the rows that have NaN in every remaining column.
    return trimmed.dropna(how="all")

# Build the working frame from the raw data; the last line displays it.
cleaned_data = clean_data(data)
cleaned_data

Unnamed: 0,Date,Year,Type,Country,Area,Location,Activity,Sex,Injury,Fatal (Y/N)
0,25-Jun-2018,2018.0,Boating,USA,California,"Oceanside, San Diego County",Paddling,F,"No injury to occupant, outrigger canoe and pad...",N
1,18-Jun-2018,2018.0,Unprovoked,USA,Georgia,"St. Simon Island, Glynn County",Standing,F,Minor injury to left thigh,N
2,09-Jun-2018,2018.0,Invalid,USA,Hawaii,"Habush, Oahu",Surfing,M,Injury to left lower leg from surfboard skeg,N
3,08-Jun-2018,2018.0,Unprovoked,AUSTRALIA,New South Wales,Arrawarra Headland,Surfing,M,Minor injury to lower leg,N
4,04-Jun-2018,2018.0,Provoked,MEXICO,Colima,La Ticla,Free diving,M,Lacerations to leg & hand shark PROVOKED INCIDENT,N
...,...,...,...,...,...,...,...,...,...,...
6297,Before 1903,0.0,Unprovoked,AUSTRALIA,Western Australia,Roebuck Bay,Diving,M,FATAL,Y
6298,Before 1903,0.0,Unprovoked,AUSTRALIA,Western Australia,,Pearl diving,M,FATAL,Y
6299,1900-1905,0.0,Unprovoked,USA,North Carolina,Ocracoke Inlet,Swimming,M,FATAL,Y
6300,1883-1889,0.0,Unprovoked,PANAMA,,"Panama Bay 8ºN, 79ºW",,M,FATAL,Y


In [308]:
def standardize_fatal_column(data):
    """Return ``data`` with the 'Fatal (Y/N)' column renamed to 'Fatal'.

    Only the column label changes; the values are passed through as they are.
    """
    # Shorter label that is easier to type in later cells.
    column_aliases = {'Fatal (Y/N)': 'Fatal'}
    return data.rename(columns=column_aliases)

# Rebind `cleaned_data` to the frame with the renamed fatality column, then display it.
cleaned_data = standardize_fatal_column(cleaned_data)
cleaned_data

Unnamed: 0,Date,Year,Type,Country,Area,Location,Activity,Sex,Injury,Fatal
0,25-Jun-2018,2018.0,Boating,USA,California,"Oceanside, San Diego County",Paddling,F,"No injury to occupant, outrigger canoe and pad...",N
1,18-Jun-2018,2018.0,Unprovoked,USA,Georgia,"St. Simon Island, Glynn County",Standing,F,Minor injury to left thigh,N
2,09-Jun-2018,2018.0,Invalid,USA,Hawaii,"Habush, Oahu",Surfing,M,Injury to left lower leg from surfboard skeg,N
3,08-Jun-2018,2018.0,Unprovoked,AUSTRALIA,New South Wales,Arrawarra Headland,Surfing,M,Minor injury to lower leg,N
4,04-Jun-2018,2018.0,Provoked,MEXICO,Colima,La Ticla,Free diving,M,Lacerations to leg & hand shark PROVOKED INCIDENT,N
...,...,...,...,...,...,...,...,...,...,...
6297,Before 1903,0.0,Unprovoked,AUSTRALIA,Western Australia,Roebuck Bay,Diving,M,FATAL,Y
6298,Before 1903,0.0,Unprovoked,AUSTRALIA,Western Australia,,Pearl diving,M,FATAL,Y
6299,1900-1905,0.0,Unprovoked,USA,North Carolina,Ocracoke Inlet,Swimming,M,FATAL,Y
6300,1883-1889,0.0,Unprovoked,PANAMA,,"Panama Bay 8ºN, 79ºW",,M,FATAL,Y


In [309]:
def convert_date_format(data):
    """Return a copy of ``data`` whose 'Date' column is parsed to datetime.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a 'Date' column. It is left untouched, so re-running the
        cell or inspecting the input afterwards shows the same values.

    Returns
    -------
    pandas.DataFrame
        New frame with 'Date' converted by ``pd.to_datetime``. Entries that
        cannot be parsed (e.g. 'Before 1903') become ``NaT`` because of
        ``errors='coerce'``.

    Raises
    ------
    KeyError
        If ``data`` has no 'Date' column.
    """
    parsed_dates = pd.to_datetime(data['Date'], errors='coerce')

    # assign() builds a new frame instead of writing into the caller's object.
    return data.assign(Date=parsed_dates)

# Parse the 'Date' column; `cleaned_data` is rebound to the returned frame and displayed.
cleaned_data = convert_date_format(cleaned_data)
cleaned_data

Unnamed: 0,Date,Year,Type,Country,Area,Location,Activity,Sex,Injury,Fatal
0,2018-06-25,2018.0,Boating,USA,California,"Oceanside, San Diego County",Paddling,F,"No injury to occupant, outrigger canoe and pad...",N
1,2018-06-18,2018.0,Unprovoked,USA,Georgia,"St. Simon Island, Glynn County",Standing,F,Minor injury to left thigh,N
2,2018-06-09,2018.0,Invalid,USA,Hawaii,"Habush, Oahu",Surfing,M,Injury to left lower leg from surfboard skeg,N
3,2018-06-08,2018.0,Unprovoked,AUSTRALIA,New South Wales,Arrawarra Headland,Surfing,M,Minor injury to lower leg,N
4,2018-06-04,2018.0,Provoked,MEXICO,Colima,La Ticla,Free diving,M,Lacerations to leg & hand shark PROVOKED INCIDENT,N
...,...,...,...,...,...,...,...,...,...,...
6297,NaT,0.0,Unprovoked,AUSTRALIA,Western Australia,Roebuck Bay,Diving,M,FATAL,Y
6298,NaT,0.0,Unprovoked,AUSTRALIA,Western Australia,,Pearl diving,M,FATAL,Y
6299,NaT,0.0,Unprovoked,USA,North Carolina,Ocracoke Inlet,Swimming,M,FATAL,Y
6300,NaT,0.0,Unprovoked,PANAMA,,"Panama Bay 8ºN, 79ºW",,M,FATAL,Y


In [310]:
def casting_year(data):
    """Return a copy of ``data`` whose 'Year' column holds integers.

    Missing years are replaced by the placeholder 0 before the cast, because
    a plain integer column cannot hold NaN. Rows that already carry 0 as
    their year cannot be told apart from the filled ones afterwards.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a numeric 'Year' column. It is left untouched.

    Returns
    -------
    pandas.DataFrame
        New frame with 'Year' as an integer column.

    Raises
    ------
    KeyError
        If ``data`` has no 'Year' column.
    """
    year_as_int = data['Year'].fillna(0).astype(int)

    # assign() builds a new frame instead of writing into the caller's object.
    return data.assign(Year=year_as_int)

# Cast 'Year' to integers; `cleaned_data` is rebound to the returned frame and displayed.
cleaned_data = casting_year(cleaned_data)
cleaned_data

Unnamed: 0,Date,Year,Type,Country,Area,Location,Activity,Sex,Injury,Fatal
0,2018-06-25,2018,Boating,USA,California,"Oceanside, San Diego County",Paddling,F,"No injury to occupant, outrigger canoe and pad...",N
1,2018-06-18,2018,Unprovoked,USA,Georgia,"St. Simon Island, Glynn County",Standing,F,Minor injury to left thigh,N
2,2018-06-09,2018,Invalid,USA,Hawaii,"Habush, Oahu",Surfing,M,Injury to left lower leg from surfboard skeg,N
3,2018-06-08,2018,Unprovoked,AUSTRALIA,New South Wales,Arrawarra Headland,Surfing,M,Minor injury to lower leg,N
4,2018-06-04,2018,Provoked,MEXICO,Colima,La Ticla,Free diving,M,Lacerations to leg & hand shark PROVOKED INCIDENT,N
...,...,...,...,...,...,...,...,...,...,...
6297,NaT,0,Unprovoked,AUSTRALIA,Western Australia,Roebuck Bay,Diving,M,FATAL,Y
6298,NaT,0,Unprovoked,AUSTRALIA,Western Australia,,Pearl diving,M,FATAL,Y
6299,NaT,0,Unprovoked,USA,North Carolina,Ocracoke Inlet,Swimming,M,FATAL,Y
6300,NaT,0,Unprovoked,PANAMA,,"Panama Bay 8ºN, 79ºW",,M,FATAL,Y


## 3. Transforming the Data

In [None]:
def 

In [245]:
# Distinct values found in the 'Year' column.
# NOTE(review): the recorded output still shows floats and nan, so it appears to
# predate the casting_year cell — re-run from a fresh kernel to confirm.
looking = cleaned_data['Year'].unique()
looking

array([2018., 2017.,   nan, 2016., 2015., 2014., 2013., 2012., 2011.,
       2010., 2009., 2008., 2007., 2006., 2005., 2004., 2003., 2002.,
       2001., 2000., 1999., 1998., 1997., 1996., 1995., 1984., 1994.,
       1993., 1992., 1991., 1990., 1989., 1969., 1988., 1987., 1986.,
       1985., 1983., 1982., 1981., 1980., 1979., 1978., 1977., 1976.,
       1975., 1974., 1973., 1972., 1971., 1970., 1968., 1967., 1966.,
       1965., 1964., 1963., 1962., 1961., 1960., 1959., 1958., 1957.,
       1956., 1955., 1954., 1953., 1952., 1951., 1950., 1949., 1948.,
       1848., 1947., 1946., 1945., 1944., 1943., 1942., 1941., 1940.,
       1939., 1938., 1937., 1936., 1935., 1934., 1933., 1932., 1931.,
       1930., 1929., 1928., 1927., 1926., 1925., 1924., 1923., 1922.,
       1921., 1920., 1919., 1918., 1917., 1916., 1915., 1914., 1913.,
       1912., 1911., 1910., 1909., 1908., 1907., 1906., 1905., 1904.,
       1903., 1902., 1901., 1900., 1899., 1898., 1897., 1896., 1895.,
       1894., 1893.,

In [246]:
# Number of distinct years (nunique() leaves NaN out of the count by default).
# NOTE(review): `lenght` is a misspelling of "length"; the name is kept in case other cells use it.
lenght = cleaned_data['Year'].nunique()
lenght

249

In [282]:
# How many rows carry each distinct year.
value_counts = cleaned_data['Year'].value_counts()
# Rows whose year is the placeholder 0. get() falls back to 0 instead of raising
# KeyError when no such row exists.
zero_counts = value_counts.get(0, 0)
zero_counts

125

In [275]:
# Count the rows whose 'Year' is missing.
# NOTE(review): casting_year fills missing years with 0, so the recorded non-zero
# result presumably comes from a run made before that cell — confirm on a fresh run.
nan_count = cleaned_data['Year'].isna().sum()
nan_count

2

In [263]:
# NOTE(review): `drop_cols` is not defined in any visible cell, so this cell fails
# with the NameError recorded below; define the list first or delete the cell.
data = data.drop(drop_cols, axis =1)

NameError: name 'drop_cols' is not defined