# Data Cleaning

## Import Tools & Data Frame

- Import: Pandas, Seaborn, Matplotlib, Numpy.
- Import: attacks.csv
- View DataFrame

In [1]:
#Tools Import

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from matplotlib import rcParams
pd.options.mode.chained_assignment = None

In [2]:
#Data Import 

df = pd.read_csv ('./data/attacks.csv', encoding='unicode_escape')

In [3]:
df

Unnamed: 0,Case Number,Date,Year,Type,Country,Area,Location,Activity,Name,Sex,...,Species,Investigator or Source,pdf,href formula,href,Case Number.1,Case Number.2,original order,Unnamed: 22,Unnamed: 23
0,2018.06.25,25-Jun-2018,2018.0,Boating,USA,California,"Oceanside, San Diego County",Paddling,Julie Wolfe,F,...,White shark,"R. Collier, GSAF",2018.06.25-Wolfe.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.25,2018.06.25,6303.0,,
1,2018.06.18,18-Jun-2018,2018.0,Unprovoked,USA,Georgia,"St. Simon Island, Glynn County",Standing,Adyson McNeely,F,...,,"K.McMurray, TrackingSharks.com",2018.06.18-McNeely.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.18,2018.06.18,6302.0,,
2,2018.06.09,09-Jun-2018,2018.0,Invalid,USA,Hawaii,"Habush, Oahu",Surfing,John Denges,M,...,,"K.McMurray, TrackingSharks.com",2018.06.09-Denges.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.09,2018.06.09,6301.0,,
3,2018.06.08,08-Jun-2018,2018.0,Unprovoked,AUSTRALIA,New South Wales,Arrawarra Headland,Surfing,male,M,...,2 m shark,"B. Myatt, GSAF",2018.06.08-Arrawarra.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.08,2018.06.08,6300.0,,
4,2018.06.04,04-Jun-2018,2018.0,Provoked,MEXICO,Colima,La Ticla,Free diving,Gustavo Ramos,M,...,"Tiger shark, 3m",A .Kipper,2018.06.04-Ramos.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.04,2018.06.04,6299.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25718,,,,,,,,,,,...,,,,,,,,,,
25719,,,,,,,,,,,...,,,,,,,,,,
25720,,,,,,,,,,,...,,,,,,,,,,
25721,,,,,,,,,,,...,,,,,,,,,,


## Create new Data Frame to clean

- Delete irrelevant columns.
- Revise DataFrame & remove further columns if need be. 
- View shape and sample data 


In [4]:
df_clean = pd.DataFrame(df)

In [5]:
df_clean.shape

(25723, 24)

In [6]:
#Drop irelavant columns 

df_clean = df_clean.drop(columns=["Unnamed: 23","Unnamed: 22","original order","Case Number.2","Case Number.1","href","href formula","pdf", "Investigator or Source", "Name"])



In [7]:
#We can see that the DF has 12 less columns 

df_clean.shape

(25723, 14)

In [8]:
#Revised DataFrame
df_clean

Unnamed: 0,Case Number,Date,Year,Type,Country,Area,Location,Activity,Sex,Age,Injury,Fatal (Y/N),Time,Species
0,2018.06.25,25-Jun-2018,2018.0,Boating,USA,California,"Oceanside, San Diego County",Paddling,F,57,"No injury to occupant, outrigger canoe and pad...",N,18h00,White shark
1,2018.06.18,18-Jun-2018,2018.0,Unprovoked,USA,Georgia,"St. Simon Island, Glynn County",Standing,F,11,Minor injury to left thigh,N,14h00 -15h00,
2,2018.06.09,09-Jun-2018,2018.0,Invalid,USA,Hawaii,"Habush, Oahu",Surfing,M,48,Injury to left lower leg from surfboard skeg,N,07h45,
3,2018.06.08,08-Jun-2018,2018.0,Unprovoked,AUSTRALIA,New South Wales,Arrawarra Headland,Surfing,M,,Minor injury to lower leg,N,,2 m shark
4,2018.06.04,04-Jun-2018,2018.0,Provoked,MEXICO,Colima,La Ticla,Free diving,M,,Lacerations to leg & hand shark PROVOKED INCIDENT,N,,"Tiger shark, 3m"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25718,,,,,,,,,,,,,,
25719,,,,,,,,,,,,,,
25720,,,,,,,,,,,,,,
25721,,,,,,,,,,,,,,


- Interesting distinction between "Fatal (Y/N" & "Injury" in terms of fatalities 

In [9]:
df_clean["Fatal (Y/N)"].value_counts()

N          4293
Y          1388
UNKNOWN      71
 N            7
M             1
2017          1
N             1
y             1
Name: Fatal (Y/N), dtype: int64

In [10]:
df_clean["Injury"].value_counts()

FATAL                                                                                     802
Survived                                                                                   97
Foot bitten                                                                                87
No injury                                                                                  82
Leg bitten                                                                                 72
                                                                                         ... 
Cut foot, but injury caused by fishing line, not the shark                                  1
Left foot bitten after he accidentally stepped on the shark         PROVOKED INCIDENT       1
2 puncture wounds in left leg                                                               1
PROVOKED INCIDENT    Knee bitten by shark trapped in net                                    1
FATAL. "Shark bit him in half, carrying away the lower extre

In [11]:
#Drop further irelavant columns 

df_clean = df_clean.drop(columns=["Case Number","Type","Area","Location", "Time"])


In [12]:
#Revised DF

df_clean

Unnamed: 0,Date,Year,Country,Activity,Sex,Age,Injury,Fatal (Y/N),Species
0,25-Jun-2018,2018.0,USA,Paddling,F,57,"No injury to occupant, outrigger canoe and pad...",N,White shark
1,18-Jun-2018,2018.0,USA,Standing,F,11,Minor injury to left thigh,N,
2,09-Jun-2018,2018.0,USA,Surfing,M,48,Injury to left lower leg from surfboard skeg,N,
3,08-Jun-2018,2018.0,AUSTRALIA,Surfing,M,,Minor injury to lower leg,N,2 m shark
4,04-Jun-2018,2018.0,MEXICO,Free diving,M,,Lacerations to leg & hand shark PROVOKED INCIDENT,N,"Tiger shark, 3m"
...,...,...,...,...,...,...,...,...,...
25718,,,,,,,,,
25719,,,,,,,,,
25720,,,,,,,,,
25721,,,,,,,,,


In [13]:


df_clean['Date'].isna().sum()

19421

In [14]:
# Revised DF sample

df_clean.sample()

Unnamed: 0,Date,Year,Country,Activity,Sex,Age,Injury,Fatal (Y/N),Species
7618,,,,,,,,,


## Incomplete data

- Count number of NaN
- Create new DF to clean 
- Action Clean 

In [15]:
#Count of NaN under an entire DataFrame

df_clean.isnull().sum().sum()

182186

In [16]:
df_clean_nan = pd.DataFrame(df_clean)

In [17]:
#Removing rows with "all" nans

df_clean_nan.dropna(axis=0, inplace=True, how="all")

In [18]:
#Revised DF

df_clean_nan

Unnamed: 0,Date,Year,Country,Activity,Sex,Age,Injury,Fatal (Y/N),Species
0,25-Jun-2018,2018.0,USA,Paddling,F,57,"No injury to occupant, outrigger canoe and pad...",N,White shark
1,18-Jun-2018,2018.0,USA,Standing,F,11,Minor injury to left thigh,N,
2,09-Jun-2018,2018.0,USA,Surfing,M,48,Injury to left lower leg from surfboard skeg,N,
3,08-Jun-2018,2018.0,AUSTRALIA,Surfing,M,,Minor injury to lower leg,N,2 m shark
4,04-Jun-2018,2018.0,MEXICO,Free diving,M,,Lacerations to leg & hand shark PROVOKED INCIDENT,N,"Tiger shark, 3m"
...,...,...,...,...,...,...,...,...,...
6297,Before 1903,0.0,AUSTRALIA,Diving,M,,FATAL,Y,
6298,Before 1903,0.0,AUSTRALIA,Pearl diving,M,,FATAL,Y,
6299,1900-1905,0.0,USA,Swimming,M,,FATAL,Y,
6300,1883-1889,0.0,PANAMA,,M,,FATAL,Y,


In [19]:
df_clean_nan

Unnamed: 0,Date,Year,Country,Activity,Sex,Age,Injury,Fatal (Y/N),Species
0,25-Jun-2018,2018.0,USA,Paddling,F,57,"No injury to occupant, outrigger canoe and pad...",N,White shark
1,18-Jun-2018,2018.0,USA,Standing,F,11,Minor injury to left thigh,N,
2,09-Jun-2018,2018.0,USA,Surfing,M,48,Injury to left lower leg from surfboard skeg,N,
3,08-Jun-2018,2018.0,AUSTRALIA,Surfing,M,,Minor injury to lower leg,N,2 m shark
4,04-Jun-2018,2018.0,MEXICO,Free diving,M,,Lacerations to leg & hand shark PROVOKED INCIDENT,N,"Tiger shark, 3m"
...,...,...,...,...,...,...,...,...,...
6297,Before 1903,0.0,AUSTRALIA,Diving,M,,FATAL,Y,
6298,Before 1903,0.0,AUSTRALIA,Pearl diving,M,,FATAL,Y,
6299,1900-1905,0.0,USA,Swimming,M,,FATAL,Y,
6300,1883-1889,0.0,PANAMA,,M,,FATAL,Y,


In [20]:
year_counts = df_clean_nan["Year"].value_counts()
print(year_counts.to_string())

2015.0    143
2017.0    136
2016.0    130
2011.0    128
2014.0    127
0.0       125
2013.0    122
2008.0    122
2009.0    120
2012.0    117
2007.0    112
2006.0    103
2005.0    103
2010.0    101
2000.0     97
1960.0     93
1959.0     93
2003.0     92
2004.0     92
2001.0     92
2002.0     88
1962.0     86
1961.0     78
1995.0     76
1964.0     66
1999.0     66
1998.0     65
1996.0     61
1963.0     61
1966.0     58
1997.0     57
1993.0     56
1992.0     56
1994.0     56
1988.0     55
1958.0     54
2018.0     53
1989.0     53
1956.0     51
1965.0     51
1983.0     50
1981.0     49
1975.0     49
1967.0     48
1968.0     47
1950.0     43
1955.0     43
1954.0     42
1970.0     42
1942.0     41
1984.0     41
1957.0     41
1982.0     40
1986.0     39
1976.0     39
1974.0     38
1990.0     38
1991.0     38
1929.0     38
1985.0     37
1953.0     36
1980.0     35
1987.0     35
1972.0     35
1935.0     32
1951.0     32
1936.0     32
1949.0     31
1944.0     31
1947.0     30
1969.0     30
1937.0

In [21]:
df_clean_nan.drop(df_clean_nan[df_clean_nan.Year < 1900].index, inplace=True)
df_clean_nan = df_clean_nan[df_clean_nan['Year'].notna()]

In [22]:
df_clean_nan["Year"].value_counts()

2015.0    143
2017.0    136
2016.0    130
2011.0    128
2014.0    127
         ... 
1921.0     12
1917.0     11
1903.0     10
1901.0      9
1918.0      5
Name: Year, Length: 119, dtype: int64

In [23]:
df_clean_nan

Unnamed: 0,Date,Year,Country,Activity,Sex,Age,Injury,Fatal (Y/N),Species
0,25-Jun-2018,2018.0,USA,Paddling,F,57,"No injury to occupant, outrigger canoe and pad...",N,White shark
1,18-Jun-2018,2018.0,USA,Standing,F,11,Minor injury to left thigh,N,
2,09-Jun-2018,2018.0,USA,Surfing,M,48,Injury to left lower leg from surfboard skeg,N,
3,08-Jun-2018,2018.0,AUSTRALIA,Surfing,M,,Minor injury to lower leg,N,2 m shark
4,04-Jun-2018,2018.0,MEXICO,Free diving,M,,Lacerations to leg & hand shark PROVOKED INCIDENT,N,"Tiger shark, 3m"
...,...,...,...,...,...,...,...,...,...
5559,14-Jul-1900,1900.0,USA,Hunting seashells,M,,"Believed drowned. Uhlbrechts foot, and the pe...",,Questionable
5560,Late Jul-1900,1900.0,USA,,,,"No injury to occupants. They shot shark, then ...",N,
5561,28-Jan-1900,1900.0,AUSTRALIA,"Standing, gathering oysters",M,,Right posterior thigh bitten,N,
5562,Early 1900s,1900.0,USA,,M,,Severe abrasion when shark swam between his legs,N,


In [24]:
df_clean_nan["Year"] = df_clean_nan["Year"].astype("int")

In [25]:
df_clean_nan

Unnamed: 0,Date,Year,Country,Activity,Sex,Age,Injury,Fatal (Y/N),Species
0,25-Jun-2018,2018,USA,Paddling,F,57,"No injury to occupant, outrigger canoe and pad...",N,White shark
1,18-Jun-2018,2018,USA,Standing,F,11,Minor injury to left thigh,N,
2,09-Jun-2018,2018,USA,Surfing,M,48,Injury to left lower leg from surfboard skeg,N,
3,08-Jun-2018,2018,AUSTRALIA,Surfing,M,,Minor injury to lower leg,N,2 m shark
4,04-Jun-2018,2018,MEXICO,Free diving,M,,Lacerations to leg & hand shark PROVOKED INCIDENT,N,"Tiger shark, 3m"
...,...,...,...,...,...,...,...,...,...
5559,14-Jul-1900,1900,USA,Hunting seashells,M,,"Believed drowned. Uhlbrechts foot, and the pe...",,Questionable
5560,Late Jul-1900,1900,USA,,,,"No injury to occupants. They shot shark, then ...",N,
5561,28-Jan-1900,1900,AUSTRALIA,"Standing, gathering oysters",M,,Right posterior thigh bitten,N,
5562,Early 1900s,1900,USA,,M,,Severe abrasion when shark swam between his legs,N,


In [26]:
df_clean_nan["Species "].value_counts()

White shark                                           156
Shark involvement prior to death was not confirmed    105
Invalid                                                91
Shark involvement not confirmed                        86
Tiger shark                                            69
                                                     ... 
White shark, 6m [20']                                   1
2.1 m to 2.4 m [7' to 8'] shark                         1
3' blacktip shark                                       1
1 m  shark                                              1
Tiger shark, 3.9 m                                      1
Name: Species , Length: 1504, dtype: int64

In [32]:
df_clean_date = df_clean_nan(["Date"].str.lower().str.extract(r'-(\w{3})-'),inplace=True)

AttributeError: 'list' object has no attribute 'str'