# 2. Cleaning the Dataset

## 2.1 Preliminar Cleaning

In [1372]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
from fuzzywuzzy import process
import pycountry


In [1373]:
def get_data():
    
    #Importing CSV
    file_path = '/Users/mairagutierrez/Documents/Ironhack/PROJECTS/project--I/data/attacks.csv'
    
    # Try reading the file with a different encoding
    data = pd.read_csv(file_path, encoding='latin1')

    return data
    
data = get_data()
data.head(3)

Unnamed: 0,Case Number,Date,Year,Type,Country,Area,Location,Activity,Name,Sex,...,Species,Investigator or Source,pdf,href formula,href,Case Number.1,Case Number.2,original order,Unnamed: 22,Unnamed: 23
0,2018.06.25,25-Jun-2018,2018.0,Boating,USA,California,"Oceanside, San Diego County",Paddling,Julie Wolfe,F,...,White shark,"R. Collier, GSAF",2018.06.25-Wolfe.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.25,2018.06.25,6303.0,,
1,2018.06.18,18-Jun-2018,2018.0,Unprovoked,USA,Georgia,"St. Simon Island, Glynn County",Standing,Adyson McNeely,F,...,,"K.McMurray, TrackingSharks.com",2018.06.18-McNeely.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.18,2018.06.18,6302.0,,
2,2018.06.09,09-Jun-2018,2018.0,Invalid,USA,Hawaii,"Habush, Oahu",Surfing,John Denges,M,...,,"K.McMurray, TrackingSharks.com",2018.06.09-Denges.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.09,2018.06.09,6301.0,,


In [1374]:
def pre_cleaning(data):
    
    #dropping columns with 99% missing values and the ones that aren't relevant for my research
    pre_cleaned_data = data.drop(columns = ['Unnamed: 22', 'Unnamed: 23', 'Case Number.1', 'Case Number.2'], axis = 1)
    
    #dropping all rows with NaN in every column
    pre_cleaned_data = pre_cleaned_data.dropna(how="all")
    
    #Remove spaces in column titles
    pre_cleaned_data.columns = pre_cleaned_data.columns.str.replace(' ','_')
    
    return pre_cleaned_data

pre_cleaned_data = pre_cleaning(data)
pre_cleaned_data.head(3)

Unnamed: 0,Case_Number,Date,Year,Type,Country,Area,Location,Activity,Name,Sex_,Age,Injury,Fatal_(Y/N),Time,Species_,Investigator_or_Source,pdf,href_formula,href,original_order
0,2018.06.25,25-Jun-2018,2018.0,Boating,USA,California,"Oceanside, San Diego County",Paddling,Julie Wolfe,F,57,"No injury to occupant, outrigger canoe and pad...",N,18h00,White shark,"R. Collier, GSAF",2018.06.25-Wolfe.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,6303.0
1,2018.06.18,18-Jun-2018,2018.0,Unprovoked,USA,Georgia,"St. Simon Island, Glynn County",Standing,Adyson McNeely,F,11,Minor injury to left thigh,N,14h00 -15h00,,"K.McMurray, TrackingSharks.com",2018.06.18-McNeely.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,6302.0
2,2018.06.09,09-Jun-2018,2018.0,Invalid,USA,Hawaii,"Habush, Oahu",Surfing,John Denges,M,48,Injury to left lower leg from surfboard skeg,N,07h45,,"K.McMurray, TrackingSharks.com",2018.06.09-Denges.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,6301.0


In [1375]:
#New data columns names without spaces
pre_cleaned_data.columns

Index(['Case_Number', 'Date', 'Year', 'Type', 'Country', 'Area', 'Location',
       'Activity', 'Name', 'Sex_', 'Age', 'Injury', 'Fatal_(Y/N)', 'Time',
       'Species_', 'Investigator_or_Source', 'pdf', 'href_formula', 'href',
       'original_order'],
      dtype='object')

In [1376]:
cleaned_data = data.copy()

## 2.2 Cleaning Year 

In [1377]:
null_values_count = cleaned_data['Year'].isnull().sum()
null_values_count

19423

In [1378]:
# Casting Year data type from float to integer
pre_cleaned_data['Year'] = pre_cleaned_data['Year'].fillna(0).astype(int) 
cleaned_data = pre_cleaned_data

# Filter dataframe with data over 1900 as this is data relevant to my research
cleaned_data = pre_cleaned_data[(cleaned_data['Year'] >= 1900) & (cleaned_data['Year']<= 2018) ]
cleaned_data.head(3)

Unnamed: 0,Case_Number,Date,Year,Type,Country,Area,Location,Activity,Name,Sex_,Age,Injury,Fatal_(Y/N),Time,Species_,Investigator_or_Source,pdf,href_formula,href,original_order
0,2018.06.25,25-Jun-2018,2018,Boating,USA,California,"Oceanside, San Diego County",Paddling,Julie Wolfe,F,57,"No injury to occupant, outrigger canoe and pad...",N,18h00,White shark,"R. Collier, GSAF",2018.06.25-Wolfe.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,6303.0
1,2018.06.18,18-Jun-2018,2018,Unprovoked,USA,Georgia,"St. Simon Island, Glynn County",Standing,Adyson McNeely,F,11,Minor injury to left thigh,N,14h00 -15h00,,"K.McMurray, TrackingSharks.com",2018.06.18-McNeely.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,6302.0
2,2018.06.09,09-Jun-2018,2018,Invalid,USA,Hawaii,"Habush, Oahu",Surfing,John Denges,M,48,Injury to left lower leg from surfboard skeg,N,07h45,,"K.McMurray, TrackingSharks.com",2018.06.09-Denges.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,6301.0


## 2.3 Cleaning Case Number

In [1380]:
#Removing any letter at the end of the date and any spaces.
cleaned_data['Case_Number'] = cleaned_data['Case_Number'].str.strip().str.replace(r'[A-Za-z]$', '', regex=True)

# Extracting the year from the 'Case_Number' column and creating a new column 'Extracted_Year'
#cleaned_data = cleaned_data.assign(Extracted_Year=cleaned_data['Case_Number'].str[:4])

# Extracting the month from the 'Case_Number' column
cleaned_data = cleaned_data.assign(Extracted_Month=cleaned_data['Case_Number'].str.extract(r'\.(\d{2})\.'))


# Reordering columns 
cleaned_data = cleaned_data[['Case_Number','Date', 'Year','Extracted_Month','Type', 'Country', 'Area', 'Location',
       'Activity', 'Name', 'Sex_', 'Age', 'Injury', 'Fatal_(Y/N)', 'Time','Species_', 'Investigator_or_Source', 'pdf', 'href_formula', 'href','original_order']]

cleaned_data.head(3)                                                              

Unnamed: 0,Case_Number,Date,Year,Extracted_Month,Type,Country,Area,Location,Activity,Name,...,Age,Injury,Fatal_(Y/N),Time,Species_,Investigator_or_Source,pdf,href_formula,href,original_order
0,2018.06.25,25-Jun-2018,2018,6,Boating,USA,California,"Oceanside, San Diego County",Paddling,Julie Wolfe,...,57,"No injury to occupant, outrigger canoe and pad...",N,18h00,White shark,"R. Collier, GSAF",2018.06.25-Wolfe.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,6303.0
1,2018.06.18,18-Jun-2018,2018,6,Unprovoked,USA,Georgia,"St. Simon Island, Glynn County",Standing,Adyson McNeely,...,11,Minor injury to left thigh,N,14h00 -15h00,,"K.McMurray, TrackingSharks.com",2018.06.18-McNeely.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,6302.0
2,2018.06.09,09-Jun-2018,2018,6,Invalid,USA,Hawaii,"Habush, Oahu",Surfing,John Denges,...,48,Injury to left lower leg from surfboard skeg,N,07h45,,"K.McMurray, TrackingSharks.com",2018.06.09-Denges.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,6301.0


## 2.4 Cleaning Type

In [1381]:
#TYPE
null_values_type = cleaned_data['Type'].isnull().sum()
nat_count_type = cleaned_data['Type'].isna().sum()
uniq_val_type = cleaned_data['Type'].unique()
num_uniq_type = cleaned_data['Type'].nunique()
print(null_values_type, nat_count_type, uniq_val_type, num_uniq_type)


3 3 ['Boating' 'Unprovoked' 'Invalid' 'Provoked' 'Questionable' 'Sea Disaster'
 nan 'Boat' 'Boatomg'] 8


In [1382]:
# 1) Removing spaces at the beginning and the end of the word
cleaned_data['Type'] = cleaned_data['Type'].str.strip()

# 2) Replacing null values with most common values
most_common_value = cleaned_data['Type'].mode()[0]
cleaned_data['Type'] = cleaned_data['Type'].fillna(most_common_value)

# 3) Converting all values to lower case
cleaned_data['Type'] = cleaned_data['Type'].str.lower()

# 4) Grouping the values by specified categories
cleaned_data['Type'] = cleaned_data['Type'].replace({
    r'.*boat.*': 'boating',  # Including all values that have "boat" anywhere in the string
    'invalid': 'invalid',
    'provoked': 'provoked',
    'questionable': 'invalid',
    'unprovoked': 'unprovoked',
    'sea disaster': 'sea disaster'
}, regex=True)

cleaned_data['Type'] = cleaned_data['Type'].str.capitalize()
cleaned_data.head(3)


Unnamed: 0,Case_Number,Date,Year,Extracted_Month,Type,Country,Area,Location,Activity,Name,...,Age,Injury,Fatal_(Y/N),Time,Species_,Investigator_or_Source,pdf,href_formula,href,original_order
0,2018.06.25,25-Jun-2018,2018,6,Boating,USA,California,"Oceanside, San Diego County",Paddling,Julie Wolfe,...,57,"No injury to occupant, outrigger canoe and pad...",N,18h00,White shark,"R. Collier, GSAF",2018.06.25-Wolfe.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,6303.0
1,2018.06.18,18-Jun-2018,2018,6,Unprovoked,USA,Georgia,"St. Simon Island, Glynn County",Standing,Adyson McNeely,...,11,Minor injury to left thigh,N,14h00 -15h00,,"K.McMurray, TrackingSharks.com",2018.06.18-McNeely.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,6302.0
2,2018.06.09,09-Jun-2018,2018,6,Invalid,USA,Hawaii,"Habush, Oahu",Surfing,John Denges,...,48,Injury to left lower leg from surfboard skeg,N,07h45,,"K.McMurray, TrackingSharks.com",2018.06.09-Denges.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,6301.0


## 2.5 Cleaning Country

In [1383]:
#AREA
null_values_country = cleaned_data['Type'].isnull().sum()
nat_count_area = cleaned_data['Area'].isna().sum()
num_uniq_area = cleaned_data['Area'].nunique()
print(null_values_country, nat_count_area, num_uniq_area)

0 313 719


In [1384]:
#COUNTRY
nat_count_country = cleaned_data['Country'].isna().sum()
uniq_val_country = cleaned_data['Country'].unique()
num_uniq_country = cleaned_data['Country'].nunique()
print(nat_count_country, uniq_val_country, num_uniq_country)

24 ['USA' 'AUSTRALIA' 'MEXICO' 'BRAZIL' 'ENGLAND' 'SOUTH AFRICA' 'THAILAND'
 'COSTA RICA' 'MALDIVES' 'BAHAMAS' 'NEW CALEDONIA' 'ECUADOR' 'MALAYSIA'
 'LIBYA' nan 'CUBA' 'MAURITIUS' 'NEW ZEALAND' 'SPAIN' 'SAMOA'
 'SOLOMON ISLANDS' 'JAPAN' 'EGYPT' 'ST HELENA, British overseas territory'
 'COMOROS' 'REUNION' 'FRENCH POLYNESIA' 'UNITED KINGDOM'
 'UNITED ARAB EMIRATES' 'PHILIPPINES' 'INDONESIA' 'CHINA' 'COLUMBIA'
 'CAPE VERDE' 'Fiji' 'DOMINICAN REPUBLIC' 'CAYMAN ISLANDS' 'ARUBA'
 'MOZAMBIQUE' 'FIJI' 'PUERTO RICO' 'ITALY' 'ATLANTIC OCEAN' 'GREECE'
 'ST. MARTIN' 'FRANCE' 'PAPUA NEW GUINEA' 'TRINIDAD & TOBAGO' 'KIRIBATI'
 'ISRAEL' 'DIEGO GARCIA' 'TAIWAN' 'JAMAICA' 'PALESTINIAN TERRITORIES'
 'GUAM' 'SEYCHELLES' 'BELIZE' 'NIGERIA' 'TONGA' 'SCOTLAND' 'CANADA'
 'CROATIA' 'SAUDI ARABIA' 'CHILE' 'ANTIGUA' 'KENYA' 'RUSSIA'
 'TURKS & CAICOS' 'UNITED ARAB EMIRATES (UAE)' 'AZORES' 'SOUTH KOREA'
 'MALTA' 'VIETNAM' 'MADAGASCAR' 'PANAMA' 'SOMALIA' 'NEVIS'
 'BRITISH VIRGIN ISLANDS' 'NORWAY' 'SENEGAL' 'YEMEN'

In [1385]:
# Step 1: Replace NaN values with 'Unknown'
cleaned_data['Country'] = cleaned_data['Country'].fillna('Unknown')

# Step 2: Extract everything before "/"
cleaned_data['Country'] = cleaned_data['Country'].str.split('/').str[0]

# Step 3: Trim spaces, remove "?", and capitalize
cleaned_data['Country'] = cleaned_data['Country'].str.strip().str.replace('?', '').str.lower()


In [1386]:
uniq_val_country = cleaned_data['Country'].unique()
uniq_val_country

array(['usa', 'australia', 'mexico', 'brazil', 'england', 'south africa',
       'thailand', 'costa rica', 'maldives', 'bahamas', 'new caledonia',
       'ecuador', 'malaysia', 'libya', 'unknown', 'cuba', 'mauritius',
       'new zealand', 'spain', 'samoa', 'solomon islands', 'japan',
       'egypt', 'st helena, british overseas territory', 'comoros',
       'reunion', 'french polynesia', 'united kingdom',
       'united arab emirates', 'philippines', 'indonesia', 'china',
       'columbia', 'cape verde', 'fiji', 'dominican republic',
       'cayman islands', 'aruba', 'mozambique', 'puerto rico', 'italy',
       'atlantic ocean', 'greece', 'st. martin', 'france',
       'papua new guinea', 'trinidad & tobago', 'kiribati', 'israel',
       'diego garcia', 'taiwan', 'jamaica', 'palestinian territories',
       'guam', 'seychelles', 'belize', 'nigeria', 'tonga', 'scotland',
       'canada', 'croatia', 'saudi arabia', 'chile', 'antigua', 'kenya',
       'russia', 'turks & caicos', 'united 

In [1387]:
from fuzzywuzzy import process
import pycountry

def match_country(country):
    # Get a list of all country names from pycountry
    country_names = [c.name for c in pycountry.countries]
    
    # Find the closest match to the input country name
    matched_country = process.extractOne(country, country_names, score_cutoff=80)  # Adjust the score_cutoff as needed
    
    # Return the matched country name
    return matched_country[0] if matched_country else country

# Apply the matching function to the 'Country' column
cleaned_data['Country'] = cleaned_data['Country'].apply(match_country)

In [1388]:
cleaned_data.head(3)

Unnamed: 0,Case_Number,Date,Year,Extracted_Month,Type,Country,Area,Location,Activity,Name,...,Age,Injury,Fatal_(Y/N),Time,Species_,Investigator_or_Source,pdf,href_formula,href,original_order
0,2018.06.25,25-Jun-2018,2018,6,Boating,usa,California,"Oceanside, San Diego County",Paddling,Julie Wolfe,...,57,"No injury to occupant, outrigger canoe and pad...",N,18h00,White shark,"R. Collier, GSAF",2018.06.25-Wolfe.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,6303.0
1,2018.06.18,18-Jun-2018,2018,6,Unprovoked,usa,Georgia,"St. Simon Island, Glynn County",Standing,Adyson McNeely,...,11,Minor injury to left thigh,N,14h00 -15h00,,"K.McMurray, TrackingSharks.com",2018.06.18-McNeely.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,6302.0
2,2018.06.09,09-Jun-2018,2018,6,Invalid,usa,Hawaii,"Habush, Oahu",Surfing,John Denges,...,48,Injury to left lower leg from surfboard skeg,N,07h45,,"K.McMurray, TrackingSharks.com",2018.06.09-Denges.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,6301.0


## 2.6 Cleaning Activity

In [1389]:
#ACTIVITY
nat_count_activity = cleaned_data['Activity'].isna().sum()
uniq_val_activity = cleaned_data['Activity'].unique()
num_uniq_activity = cleaned_data['Activity'].nunique()
print(nat_count_activity, uniq_val_activity, num_uniq_activity)

435 ['Paddling' 'Standing' 'Surfing' ... 'Hunting seashells' ' '
 'Standing, gathering oysters'] 1325


In [1390]:
# Step 1: Fill NaN values
cleaned_data['Activity'] = cleaned_data['Activity'].fillna('Unknown')

# Step 2: Extract string before "/"
cleaned_data['Activity'] = cleaned_data['Activity'].apply(lambda x: x.split('/')[0] if isinstance(x, str) else x)

# Step 3: Standardize text
cleaned_data['Activity'] = cleaned_data['Activity'].str.lower().str.strip()

# Step 4: Group activities
activity_mapping = {
    'surf|boarding|padd': 'surfing',
    'swimming': 'swimming',
    'fishing': 'fishing',
    'diving': 'diving',
    'boat|sail': 'sailing',
}
for key, value in activity_mapping.items():
    cleaned_data['Activity'] = np.where(cleaned_data['Activity'].str.contains(key), value, cleaned_data['Activity'])

# Step 5: Grouping for bathing-related activities
bathing_keywords = ['bathing', 'standing', 'walking', 'wading','splashing','treading water', 'floating','jump', 'dangling','playing']
cleaned_data['Activity'] = np.where(cleaned_data['Activity'].str.contains('|'.join(bathing_keywords)), 'bathing', cleaned_data['Activity'])


# Displaying a sample of the dataframe to verify the changes
cleaned_data[['Activity']].head(3)



Unnamed: 0,Activity
0,surfing
1,bathing
2,surfing


In [1391]:
# Contar la frecuencia de cada valor en la columna 'Activity'
value_counts = cleaned_data['Activity'].value_counts()

# Displaying top 10 activities with more attacks
top_10_activities = value_counts.head(10)
top_10_activities

Activity
surfing         1405
fishing         1040
swimming         946
bathing          515
diving           457
unknown          438
snorkeling        88
sailing           79
kayaking          40
sea disaster      13
Name: count, dtype: int64

## 2.7 Cleaning Sex

In [1392]:
#SEX
nat_count_sex = cleaned_data['Sex_'].isna().sum()
uniq_val_sex = cleaned_data['Sex_'].unique()
num_uniq_sex = cleaned_data['Sex_'].nunique()
print(nat_count_sex, uniq_val_sex, num_uniq_sex)

501 ['F' 'M' nan 'M ' 'lli' 'N' '.'] 6


In [1393]:
if cleaned_data['Sex_'].isin(['Female', 'Male', 'Unknown']).all():
    print("Data has already been cleaned.")
else:
    # Step 1: Fill NaN values
    cleaned_data['Sex_'] = cleaned_data['Sex_'].fillna('Unknown')

    # Step 2: Standardize text
    cleaned_data['Sex_'] = cleaned_data['Sex_'].str.lower()

    # Step 3: Group values
    cleaned_data['Sex_'] = cleaned_data['Sex_'].replace({'f': 'Female', 'm': 'Male'})
    cleaned_data['Sex_'] = cleaned_data['Sex_'].apply(lambda x: x if x in ['Female', 'Male'] else 'Unknown')

    # Displaying a sample of the dataframe to verify the changes
    print(cleaned_data[['Sex_']].head())


     Sex_
0  Female
1  Female
2    Male
3    Male
4    Male


## 2.8 Cleaning Age

In [1394]:
#AGE
nat_count_age = cleaned_data['Age'].isna().sum()
uniq_val_age = cleaned_data['Age'].unique()
num_uniq_age = cleaned_data['Age'].nunique()
print(nat_count_age, uniq_val_age, num_uniq_age)

2183 ['57' '11' '48' nan '18' '52' '15' '12' '32' '10' '21' '34' '30' '60' '33'
 '29' '54' '41' '37' '56' '19' '25' '69' '38' '55' '35' '46' '45' '14'
 '40s' '28' '20' '24' '26' '49' '22' '7' '31' '17' '40' '13' '42' '3' '8'
 '50' '16' '82' '73' '20s' '68' '51' '39' '58' 'Teen' '47' '61' '65' '36'
 '66' '43' '60s' '9' '72' '59' '6' '27' '64' '23' '71' '44' '62' '63' '70'
 '18 months' '53' '30s' '50s' 'teen' '77' '74' '28 & 26' '5' '86'
 '18 or 20' '12 or 13' '46 & 34' '28, 23 & 30' 'Teens' '36 & 26' '8 or 10'
 '84' '\xa0 ' ' ' '30 or 36' '6½' '21 & ?' '75' '33 or 37' 'mid-30s'
 '23 & 20' ' 30' '7      &    31' ' 28' '20?' "60's" '32 & 30' '16 to 18'
 '87' '67' 'Elderly' 'mid-20s' 'Ca. 33' '74 ' '45 ' '21 or 26' '20 ' '>50'
 '18 to 22' 'adult' '9 & 12' '? & 19' '9 months' '25 to 35' '23 & 26' '1'
 '(adult)' '33 & 37' '25 or 28' '37, 67, 35, 27,  ? & 27' '21, 34,24 & 35'
 '30 & 32' '50 & 30' '17 & 35' 'X' '"middle-age"' '13 or 18' '34 & 19'
 '33 & 26' '2 to 3 months' 'MAKE LINE GREEN' ' 

In [1395]:
# Remove all spaces (leading, trailing, and in the middle)
cleaned_data['Age'] = cleaned_data['Age'].str.replace(' ', '', regex=True)

# Replace values with "month" to '1'
cleaned_data['Age'] = cleaned_data['Age'].replace(r'.*month.*', '1', regex=True)

# if there is an s next to a digit then leave the digit
cleaned_data['Age'] = cleaned_data['Age'].str.replace(r'(\d)s', r'\1', regex=True)

# If a value contains "to", "&", or "or", conserve just the first two digits of that value
cleaned_data['Age'] = cleaned_data['Age'].str.replace(r'(\d{1,2})\s*(to|&|or|,)\s*\d+', r'\1', regex=True)

# Fill NaN values in 'Age' column with 'Unknown'
cleaned_data['Age'].fillna('Unknown', inplace=True)


# Replace all values that aren't one or two digit numbers with "Invalid"
cleaned_data['Age'] = cleaned_data['Age'].apply(lambda x: x if x.isdigit() and len(x) <= 2 else 'Invalid')


uniq_val_age = cleaned_data['Age'].unique()
uniq_val_age


array(['57', '11', '48', 'Invalid', '18', '52', '15', '12', '32', '10',
       '21', '34', '30', '60', '33', '29', '54', '41', '37', '56', '19',
       '25', '69', '38', '55', '35', '46', '45', '14', '40', '28', '20',
       '24', '26', '49', '22', '7', '31', '17', '13', '42', '3', '8',
       '50', '16', '82', '73', '68', '51', '39', '58', '47', '61', '65',
       '36', '66', '43', '9', '72', '59', '6', '27', '64', '23', '71',
       '44', '62', '63', '70', '1', '53', '77', '74', '5', '86', '84',
       '75', '87', '67', '81', '78'], dtype=object)

In [1396]:
# Convert 'Age' column to integers, replacing "Invalid" with pd.NA
cleaned_data['Age'] = pd.to_numeric(cleaned_data['Age'], errors='coerce')
cleaned_data['Age'].head()


0    57.0
1    11.0
2    48.0
3     NaN
4     NaN
Name: Age, dtype: float64

## 2.9 Cleaning Fatal

In [1397]:
#FATAL
nat_count_fatal = cleaned_data['Fatal_(Y/N)'].isna().sum()
uniq_val_fatal = cleaned_data['Fatal_(Y/N)'].unique()
num_uniq_fatal = cleaned_data['Fatal_(Y/N)'].nunique()
print(nat_count_fatal, uniq_val_fatal, num_uniq_fatal)

464 ['N' 'Y' nan 'M' 'UNKNOWN' '2017' ' N' 'N '] 7


In [1398]:
# 1) Remove all spaces and convert to lowercase
cleaned_data['Fatal_(Y/N)'] = cleaned_data['Fatal_(Y/N)'].str.replace(' ', '').str.lower()

# 2) Replace specific unwanted values with NaN
cleaned_data['Fatal_(Y/N)'].replace({'m': np.nan, 'unknown': np.nan, '2017': np.nan}, inplace=True)

# 3) Group all "n" under "No", all "y" under "Yes", and the rest under "Unknown"
cleaned_data['Fatal_(Y/N)'] = cleaned_data['Fatal_(Y/N)'].replace({'n': 'No', 'y': 'Yes'}).fillna('Unknown')

# Display unique values in the 'Fatal (Y/N)' column after transformation
cleaned_data['Fatal_(Y/N)'].unique()

array(['No', 'Yes', 'Unknown'], dtype=object)

## 2.10 Cleaning Injury

In [1399]:
#INJURY
nat_count_injury = cleaned_data['Injury'].isna().sum()
uniq_val_injury = cleaned_data['Injury'].unique()
num_uniq_injury = cleaned_data['Injury'].nunique()
print(nat_count_injury, uniq_val_injury, num_uniq_injury)

19 ['No injury to occupant, outrigger canoe and paddle damaged'
 'Minor injury to left thigh'
 'Injury to left lower leg from surfboard skeg' ...
 'No injury to occupants. They shot shark, then it capsized their skiff. PROVOKED INCIDENT'
 'Right posterior thigh bitten'
 'Severe abrasion when shark swam between his legs'] 3461


In [1400]:
# Step 1: Filling missing values in 'Injury' column with 'unknown'
cleaned_data['Injury'] = cleaned_data['Injury'].fillna('unknown')

# Step 2: Converting all text in 'Injury' column to lower case
cleaned_data['Injury'] = cleaned_data['Injury'].str.lower()

# Displaying unique values and number of unique values after these steps
uniq_val_injury_after = cleaned_data['Injury'].unique()
num_uniq_injury_after = cleaned_data['Injury'].nunique()

num_uniq_injury_after, uniq_val_injury_after[:10]  


(3446,
 array(['no injury to occupant, outrigger canoe and paddle damaged',
        'minor injury to left thigh',
        'injury to left lower leg from surfboard skeg',
        'minor injury to lower leg',
        'lacerations to leg & hand shark provoked incident',
        'no injury, board bitten', 'fatal',
        'minor injury to foot. provoked incident', 'lower left leg bitten',
        'minor injury to foot'], dtype=object))

In [1401]:
import string

# Step 3: Remove Punctuation
cleaned_data['Injury'] = cleaned_data['Injury'].str.translate(str.maketrans('', '', string.punctuation))

# Step 4: Remove Extra Whitespace
cleaned_data['Injury'] = cleaned_data['Injury'].str.strip()
cleaned_data['Injury'] = cleaned_data['Injury'].str.replace(r'\s+', ' ', regex=True)

# Displaying unique values and number of unique values after these steps
uniq_val_injury_after = cleaned_data['Injury'].unique()
num_uniq_injury_after = cleaned_data['Injury'].nunique()

num_uniq_injury_after, uniq_val_injury_after[:10]  

(3332,
 array(['no injury to occupant outrigger canoe and paddle damaged',
        'minor injury to left thigh',
        'injury to left lower leg from surfboard skeg',
        'minor injury to lower leg',
        'lacerations to leg hand shark provoked incident',
        'no injury board bitten', 'fatal',
        'minor injury to foot provoked incident', 'lower left leg bitten',
        'minor injury to foot'], dtype=object))

In [1402]:
# Updating the categorization function to consider the 'Fatal_(Y/N)' column for categorizing fatal injuries
def categorize_injury(row):
    if row['Fatal_(Y/N)'].lower() == 'yes':
        return 'Fatal'
    text = str(row['Injury'])
    if 'no injury' in text:
        return 'No Injury'
    elif 'minor injury' in text or 'minor damage' in text:
        return 'Minor Injury'
    elif 'laceration' in text:
        return 'Laceration'
    elif 'bitten' in text or 'bite' in text:
        return 'Bitten'
    elif 'unknown' in text:
        return 'Unknown'
    else:
        return 'Other non-fatal attacks'

# Applying the updated categorization function to the 'Injury' column
cleaned_data['Injury_Category'] = cleaned_data.apply(categorize_injury, axis=1)

# Displaying the distribution of the injury categories after refactoring
injury_category_distribution = cleaned_data['Injury_Category'].value_counts()
injury_category_distribution


Injury_Category
Other non-fatal attacks    1652
Bitten                     1199
Fatal                      1026
Laceration                  791
No Injury                   754
Minor Injury                115
Unknown                      25
Name: count, dtype: int64

## 2.11 Cleaning Species

In [1403]:
#SPECIES
nat_count_species = cleaned_data['Species_'].isna().sum()
uniq_val_species = cleaned_data['Species_'].unique()
num_uniq_species = cleaned_data['Species_'].nunique()
print(nat_count_species, uniq_val_species, num_uniq_species)

2282 ['White shark' nan '2 m shark' ...
 'Fishermen recovered partial remains from shark a week later'
 "1.8 m to 2.7 m [6' to 9'] shark" 'Tiger shark, 3.9 m'] 1504


In [1405]:
# Step 1: Filling missing values in 'Species_' column with 'unknown'
cleaned_data['Species_'] = cleaned_data['Species_'].fillna('unknown')

# Step 2: Converting all text in 'Species_' column to lower case
cleaned_data['Species_'] = cleaned_data['Species_'].str.lower()

# Displaying unique values and number of unique values after these steps
uniq_val_species_after = cleaned_data['Species_'].unique()
num_uniq_species_after = cleaned_data['Species_'].nunique()

num_uniq_species_after, uniq_val_species_after[:10]  # Displaying first 10 unique values for brevity

(1490,
 array(['white shark', 'unknown', '2 m shark', 'tiger shark, 3m',
        'tiger shark', "lemon shark, 3'", "bull shark, 6'",
        'grey reef shark', 'invalid incident', 'tawny nurse shark, 2m'],
       dtype=object))

In [1424]:
top_10_species = cleaned_data['Species_'].value_counts().head(10)
top_10_species

Species_
unknown                                               2283
white shark                                            157
shark involvement prior to death was not confirmed     105
invalid                                                 91
shark involvement not confirmed                         87
tiger shark                                             69
bull shark                                              49
4' shark                                                40
6' shark                                                38
questionable incident                                   37
Name: count, dtype: int64

In [1516]:
# Defining the groups based on specific strings
groups = {
    "4' shark": ["4'"],
    "5' shark": ["5'"],
    "6' shark": ["6'"],
    "8' shark": ["8'"],
    "10' shark": ["10'"],
    "12' shark": ["12'"]
}

# Defining a function to assign groups based on the presence of specific strings
def assign_group(text):
    text = str(text).lower()
    for group, strings in groups.items():
        for string in strings:
            if string in text:
                return group
    return text  # Return the original text if no specific string is found

# Applying the function to the 'Species_' column
cleaned_data['Species_Grouped'] = cleaned_data['Species_'].apply(assign_group)

# Displaying unique values and number of unique values after grouping
uniq_val_species_grouped = cleaned_data['Species_Grouped'].unique()
num_uniq_species_grouped = cleaned_data['Species_Grouped'].nunique()

num_uniq_species_grouped, uniq_val_species_grouped[:50]


(834,
 array(['white shark', 'unknown', '2 m shark', 'tiger shark, 3m',
        'tiger shark', "lemon shark, 3'", "6' shark", 'grey reef shark',
        'invalid incident', 'tawny nurse shark, 2m',
        'shark involvement not confirmed', 'questionable', '3 m shark',
        'white shark, 3.5 m', 'white shark, 2.5 m', 'juvenile bull shark',
        'bull shark', "12' shark", 'wobbegong shark', '3.5 m shark',
        '1.8 m shark', 'blacktip shark',
        'juvenile white shark,  2.7 to 3.2 m', 'bull shark, 2 m',
        'possibly a wobbegong',
        'injury believed caused by an eel, not a shark',
        'galapagos shark?', '2m shark', 'bull shark, 3 m ',
        'grey reef shark. 2 m', 'small shark', 'wobbegong shark?',
        'juvenile nurse shark', "5' shark", 'tiger shark, female',
        'some drowned but other may have been killed by blue sharks',
        'white shark, 4.6 m', 'cookiecutter shark', 'wobbegong shark, 1 m',
        'white shark, 4.5 m', 'spinner shark, 4 to

In [1518]:
# Step 3: Defining a function to extract shark species from the text

def extract_shark_species(text):
    known_species = [
        'white shark', 'tiger shark', 'bull shark', 'blacktip shark', 'nurse shark',
        'wobbegong', 'lemon shark', 'grey reef shark', 'hammerhead', 'whaler shark',
        'reef shark', 'mako shark', 'blue shark', 'sand tiger shark', 'carpet shark',
        'zambesi shark', 'raggedtooth shark', 'spinner shark', 'silky shark', 'dusky shark',
        'bronze whaler', 'galapagos shark', 'sevengill shark', 'angel shark', 'goblin shark',
        'sandbar shark', 'dogfish', 'gill shark', 'thresher shark',  "4' shark", "5' shark", "6' shark",
        "8' shark", "10' shark", "12' shark"
    ]
    for species in known_species:
        if species in text:
            return species.capitalize()
    if 'not confirmed'  in text:
        return 'Shark involvement not confirmed'
    elif 'shark' in text:
        return 'Unidentified Shark'
    return 'Unknown'

# Applying the extraction function to the 'Species_' column
cleaned_data['Species_Cleaned'] = cleaned_data['Species_'].apply(extract_shark_species)

# Displaying the distribution of the cleaned species
species_distribution = cleaned_data['Species_Cleaned'].value_counts()
species_distribution.head(20)  


Species_Cleaned
Unknown                            2485
Unidentified Shark                  863
White shark                         621
Tiger shark                         277
Shark involvement not confirmed     220
Bull shark                          169
Nurse shark                          94
4' shark                             70
5' shark                             66
Whaler shark                         66
Blacktip shark                       63
Mako shark                           53
6' shark                             50
Wobbegong                            49
Reef shark                           49
Hammerhead                           44
Raggedtooth shark                    43
Spinner shark                        43
Blue shark                           40
Lemon shark                          35
Name: count, dtype: int64

## 2.12 Saving into .CSV

In [1520]:
# Specify the file path where you want to save the CSV file
csv_file_path = "/Users/mairagutierrez/Documents/Ironhack/PROJECTS/project--I/data/cleaning.csv"

# Export the clean data to a CSV file
cleaned_data.to_csv(csv_file_path, index=False)
