# Data Cleaning Project

A data cleaning project from Iron Hack's Data Analysis bootcamp.

In [43]:
# Libraries
import pandas as pd
pd.set_option('display.max_columns', None)

import numpy as np

import warnings
warnings.filterwarnings('ignore')

from IPython.display import HTML

In [2]:
# Load the raw CSV once. `df_ori` is kept untouched so the cleaned result can
# be compared against it later; every cleaning step works on the copy `df`.
df_ori = pd.read_csv(filepath_or_buffer='../src/attacks.csv', encoding='latin1')
df = df_ori.copy(deep=True)

### DataFrame reading

In [3]:
# Preview the first rows of the working copy
df.head()

Unnamed: 0,Case Number,Date,Year,Type,Country,Area,Location,Activity,Name,Sex,Age,Injury,Fatal (Y/N),Time,Species,Investigator or Source,pdf,href formula,href,Case Number.1,Case Number.2,original order,Unnamed: 22,Unnamed: 23
0,2018.06.25,25-Jun-2018,2018.0,Boating,USA,California,"Oceanside, San Diego County",Paddling,Julie Wolfe,F,57.0,"No injury to occupant, outrigger canoe and pad...",N,18h00,White shark,"R. Collier, GSAF",2018.06.25-Wolfe.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.25,2018.06.25,6303.0,,
1,2018.06.18,18-Jun-2018,2018.0,Unprovoked,USA,Georgia,"St. Simon Island, Glynn County",Standing,Adyson McNeely,F,11.0,Minor injury to left thigh,N,14h00 -15h00,,"K.McMurray, TrackingSharks.com",2018.06.18-McNeely.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.18,2018.06.18,6302.0,,
2,2018.06.09,09-Jun-2018,2018.0,Invalid,USA,Hawaii,"Habush, Oahu",Surfing,John Denges,M,48.0,Injury to left lower leg from surfboard skeg,N,07h45,,"K.McMurray, TrackingSharks.com",2018.06.09-Denges.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.09,2018.06.09,6301.0,,
3,2018.06.08,08-Jun-2018,2018.0,Unprovoked,AUSTRALIA,New South Wales,Arrawarra Headland,Surfing,male,M,,Minor injury to lower leg,N,,2 m shark,"B. Myatt, GSAF",2018.06.08-Arrawarra.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.08,2018.06.08,6300.0,,
4,2018.06.04,04-Jun-2018,2018.0,Provoked,MEXICO,Colima,La Ticla,Free diving,Gustavo Ramos,M,,Lacerations to leg & hand shark PROVOKED INCIDENT,N,,"Tiger shark, 3m",A .Kipper,2018.06.04-Ramos.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.04,2018.06.04,6299.0,,


In [4]:
# Per-column dtype and non-null count; memory_usage='deep' asks pandas to
# inspect object columns for their real memory footprint
df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25723 entries, 0 to 25722
Data columns (total 24 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Case Number             8702 non-null   object 
 1   Date                    6302 non-null   object 
 2   Year                    6300 non-null   float64
 3   Type                    6298 non-null   object 
 4   Country                 6252 non-null   object 
 5   Area                    5847 non-null   object 
 6   Location                5762 non-null   object 
 7   Activity                5758 non-null   object 
 8   Name                    6092 non-null   object 
 9   Sex                     5737 non-null   object 
 10  Age                     3471 non-null   object 
 11  Injury                  6274 non-null   object 
 12  Fatal (Y/N)             5763 non-null   object 
 13  Time                    2948 non-null   object 
 14  Species                 3464 non-null 

In [5]:
# (rows, columns) of the working copy
df.shape

(25723, 24)

In [6]:
# Raw column labels; note that 'Sex ' and 'Species ' carry a trailing space
df.columns

Index(['Case Number', 'Date', 'Year', 'Type', 'Country', 'Area', 'Location',
       'Activity', 'Name', 'Sex ', 'Age', 'Injury', 'Fatal (Y/N)', 'Time',
       'Species ', 'Investigator or Source', 'pdf', 'href formula', 'href',
       'Case Number.1', 'Case Number.2', 'original order', 'Unnamed: 22',
       'Unnamed: 23'],
      dtype='object')

### Normalization of column names

In [7]:
# Snapshot of the column labels exactly as they come from the CSV
ori_names = df.columns.tolist()

# peek at the first five labels
ori_names[:5]

['Case Number', 'Date', 'Year', 'Type', 'Country']

In [8]:
# Inspect the label that sits fourth from the end of the list
ori_names[-4]

'Case Number.2'

In [9]:
# First general pass over the labels: split on whitespace, re-join the pieces
# with '_' and lowercase the result. str.split() with no argument also discards
# leading/trailing whitespace, so labels such as 'Sex ' come out clean.
better_names = ['_'.join(label.split()).lower() for label in ori_names]

better_names[:5]

['case_number', 'date', 'year', 'type', 'country']

In [10]:
# Targeted fixes on top of the generic normalisation. They are keyed by label
# instead of by list position, so they still hit the intended columns if the
# CSV's column order or count changes, and re-running the cell is harmless:
#   * 'fatal_(y/n)'                   -> 'fatal'
#   * 'unnamed:_22' / 'unnamed:_23'   -> 'unnamed_1' / 'unnamed_2'
#   * the '.' in 'case_number.1' / 'case_number.2' -> '_'
special_names = {
    'fatal_(y/n)': 'fatal',
    'unnamed:_22': 'unnamed_1',
    'unnamed:_23': 'unnamed_2',
}

# slice assignment keeps the same list object, like the original in-place edits
better_names[:] = [
    special_names.get(name, name).replace('.', '_') for name in better_names
]

better_names

['case_number',
 'date',
 'year',
 'type',
 'country',
 'area',
 'location',
 'activity',
 'name',
 'sex',
 'age',
 'injury',
 'fatal',
 'time',
 'species',
 'investigator_or_source',
 'pdf',
 'href_formula',
 'href',
 'case_number_1',
 'case_number_2',
 'original_order',
 'unnamed_1',
 'unnamed_2']

In [11]:
# Spot-check the fifth label from the end of the cleaned list
better_names[-5]

'case_number_1'

In [12]:
# Apply the cleaned labels to the working copy (df_ori keeps the raw labels)
df.columns = better_names

# preview the frame with its new headers
df.head()

Unnamed: 0,case_number,date,year,type,country,area,location,activity,name,sex,age,injury,fatal,time,species,investigator_or_source,pdf,href_formula,href,case_number_1,case_number_2,original_order,unnamed_1,unnamed_2
0,2018.06.25,25-Jun-2018,2018.0,Boating,USA,California,"Oceanside, San Diego County",Paddling,Julie Wolfe,F,57.0,"No injury to occupant, outrigger canoe and pad...",N,18h00,White shark,"R. Collier, GSAF",2018.06.25-Wolfe.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.25,2018.06.25,6303.0,,
1,2018.06.18,18-Jun-2018,2018.0,Unprovoked,USA,Georgia,"St. Simon Island, Glynn County",Standing,Adyson McNeely,F,11.0,Minor injury to left thigh,N,14h00 -15h00,,"K.McMurray, TrackingSharks.com",2018.06.18-McNeely.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.18,2018.06.18,6302.0,,
2,2018.06.09,09-Jun-2018,2018.0,Invalid,USA,Hawaii,"Habush, Oahu",Surfing,John Denges,M,48.0,Injury to left lower leg from surfboard skeg,N,07h45,,"K.McMurray, TrackingSharks.com",2018.06.09-Denges.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.09,2018.06.09,6301.0,,
3,2018.06.08,08-Jun-2018,2018.0,Unprovoked,AUSTRALIA,New South Wales,Arrawarra Headland,Surfing,male,M,,Minor injury to lower leg,N,,2 m shark,"B. Myatt, GSAF",2018.06.08-Arrawarra.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.08,2018.06.08,6300.0,,
4,2018.06.04,04-Jun-2018,2018.0,Provoked,MEXICO,Colima,La Ticla,Free diving,Gustavo Ramos,M,,Lacerations to leg & hand shark PROVOKED INCIDENT,N,,"Tiger shark, 3m",A .Kipper,2018.06.04-Ramos.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.04,2018.06.04,6299.0,,


## Null Values

In [13]:
def napercent(frame=None):
    """Display the percentage of missing values per column, highest first.

    Only columns that contain at least one missing value are shown.

    Parameters
    ----------
    frame : pandas.DataFrame, optional
        Frame to inspect. When omitted, the notebook-level ``df`` is looked up
        at call time, so existing ``napercent()`` calls keep working.

    Returns
    -------
    None
        The resulting Series is rendered with ``display`` rather than returned.
    """
    target = df if frame is None else frame
    # mean() of the boolean NaN mask is the fraction of missing values per column
    result = target.isna().mean().sort_values(ascending=False) * 100
    display(result[result > 0])
    
# Percentage of missing values per column of df, before any cleaning
napercent()

unnamed_1                 99.996112
unnamed_2                 99.992225
time                      88.539439
species                   86.533453
age                       86.506240
sex                       77.697003
activity                  77.615364
location                  77.599813
fatal                     77.595926
area                      77.269370
name                      76.316915
country                   75.694903
injury                    75.609377
investigator_or_source    75.566614
type                      75.516075
year                      75.508300
href_formula              75.504412
date                      75.500525
pdf                       75.500525
href                      75.500525
case_number_1             75.500525
case_number_2             75.500525
original_order            75.473312
case_number               66.170353
dtype: float64

In [14]:
# Absolute number of missing values per column
nan_cols = df.isnull().sum()

# show only the columns that actually have gaps
nan_cols.loc[nan_cols.gt(0)]

case_number               17021
date                      19421
year                      19423
type                      19425
country                   19471
area                      19876
location                  19961
activity                  19965
name                      19631
sex                       19986
age                       22252
injury                    19449
fatal                     19960
time                      22775
species                   22259
investigator_or_source    19438
pdf                       19421
href_formula              19422
href                      19421
case_number_1             19421
case_number_2             19421
original_order            19414
unnamed_1                 25722
unnamed_2                 25721
dtype: int64

In [15]:
# Drop every ROW (dropna works on rows by default) that has fewer non-NaN
# values than half the number of columns. `thresh` is the minimum count of
# non-NaN values a row needs in order to be kept and is documented as an int,
# hence the integer division.
df.dropna(thresh=len(df.columns) // 2, inplace=True)

In [16]:
# Size of the working copy after dropping rows vs. the untouched original
df.shape, df_ori.shape

((6302, 24), (25723, 24))

In [17]:
# Re-check the missing-value percentages now that the sparse rows are gone
napercent()

unnamed_1                 99.984132
unnamed_2                 99.968264
time                      53.221200
species                   45.033323
age                       44.922247
sex                        8.965408
activity                   8.632180
location                   8.568708
fatal                      8.552840
area                       7.219930
name                       3.332275
country                    0.793399
injury                     0.444303
investigator_or_source     0.269756
type                       0.063472
year                       0.031736
href_formula               0.015868
case_number                0.015868
dtype: float64

### Filling unnamed columns with 0s

In [18]:
# The two unnamed columns are almost entirely empty: replace their NaNs with 0
unnamed_cols = ['unnamed_1', 'unnamed_2']
df[unnamed_cols] = df[unnamed_cols].fillna(value=0)

### Checking specific columns

In [19]:
# Missing-value percentages again, after filling the unnamed columns
napercent()

time                      53.221200
species                   45.033323
age                       44.922247
sex                        8.965408
activity                   8.632180
location                   8.568708
fatal                      8.552840
area                       7.219930
name                       3.332275
country                    0.793399
injury                     0.444303
investigator_or_source     0.269756
type                       0.063472
year                       0.031736
href_formula               0.015868
case_number                0.015868
dtype: float64

In [19]:
# Rows without a year: the year of the incident is unknown because the `date`
# field only records when it was reported
year_missing = df['year'].isna()
df.loc[year_missing]

Unnamed: 0,case_number,date,year,type,country,area,location,activity,name,sex,age,injury,fatal,time,species,investigator_or_source,pdf,href_formula,href,case_number_1,case_number_2,original_order,unnamed_1,unnamed_2
187,2017.01.08.R,Reported 08-Jan-2017,,Invalid,AUSTRALIA,Queensland,,Spearfishing,Kerry Daniel,M,35.0,"No attack, shark made a threat display",,,Bull shark,Liquid Vision 1/8/2017,2017.01.08.R-KerryDaniel.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2017.01.08.R,2017.01.08.R,6116.0,0,0
6079,1836.08.19.R,Reported 19-Aug-1836,,Unprovoked,ENGLAND,Cumberland,Whitehaven,Swimming,a boy,M,,FATAL,Y,,,"C. Moore, GSAF",1835.08.19.R-Whitehaven.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,1836.08.19.R,1836.08.19.R,224.0,0,0


### Last Columns

Hypothesis: the NaN entries still left in the remaining columns most probably represent unknown values.

In [50]:
# Label every value that is still missing as 'unknown' (mutates df in place)
df.fillna(value='unknown', inplace=True)

In [51]:
# Verify that no column has missing values left
napercent()

Series([], dtype: float64)

## Check incorrect values