## 3. Cleaning

In [206]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
from fuzzywuzzy import process
import pycountry


In [207]:
def get_clean_data(file_path=None):
    """Load the pre-cleaning CSV into a DataFrame.

    Parameters
    ----------
    file_path : str or path-like, optional
        Location of the CSV file to read. When omitted, the notebook's
        original hard-coded location of ``pre_cleaning.csv`` is used, so
        existing zero-argument calls keep working unchanged.

    Returns
    -------
    pandas.DataFrame
        The file contents as parsed by ``pd.read_csv`` with default options.
    """
    if file_path is None:
        # Machine-specific fallback kept for backward compatibility; pass
        # ``file_path`` explicitly to run the notebook on another machine.
        file_path = '/Users/mairagutierrez/Documents/Ironhack/PROJECTS/project--I/data/pre_cleaning.csv'

    # No explicit encoding is passed, so pandas' default encoding applies.
    return pd.read_csv(file_path)

# Load the dataset once, then preview its first three rows
data = get_clean_data()
data.head(n=3)


Unnamed: 0,Case_Number,Date,Year,Extracted_Month,Type,Country,Area,Location,Activity,Name,...,Time,Species_,Investigator_or_Source,pdf,href_formula,href,original_order,Injury_Category,Species_Grouped,Species_Cleaned
0,2018.06.25,25-Jun-2018,2018,6.0,Boating,usa,California,"Oceanside, San Diego County",surfing,Julie Wolfe,...,18h00,white shark,"R. Collier, GSAF",2018.06.25-Wolfe.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,6303.0,No Injury,white shark,White shark
1,2018.06.18,18-Jun-2018,2018,6.0,Unprovoked,usa,Georgia,"St. Simon Island, Glynn County",bathing,Adyson McNeely,...,14h00 -15h00,unknown,"K.McMurray, TrackingSharks.com",2018.06.18-McNeely.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,6302.0,Minor Injury,unknown,Unknown
2,2018.06.09,09-Jun-2018,2018,6.0,Invalid,usa,Hawaii,"Habush, Oahu",surfing,John Denges,...,07h45,unknown,"K.McMurray, TrackingSharks.com",2018.06.09-Denges.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,6301.0,Other non-fatal attacks,unknown,Unknown


### (Making sure the columns I am going to use do not have missing values)

## 1. Year

In [208]:
# Report whether any entry in the 'Year' column is missing
has_null_values = data['Year'].isna().any()
print("Has null values." if has_null_values else "Does not have null values.")


Does not have null values.


## 2. Extracted Month

In [209]:
# Report whether any entry in the 'Extracted_Month' column is missing
has_null_values = data['Extracted_Month'].isna().any()
print("Has null values." if has_null_values else "Does not have null values.")


Has null values.


In [210]:
# Manually chosen month values for the two rows being patched (row label -> month)
month_fixes = {389: 7.0, 5486: 9.0}
for row_label, month_value in month_fixes.items():
    data.at[row_label, 'Extracted_Month'] = month_value

# Show the patched rows to confirm the new values
data.loc[list(month_fixes), 'Extracted_Month']


389     7.0
5486    9.0
Name: Extracted_Month, dtype: float64

In [211]:
# Re-check 'Extracted_Month' for missing entries after the manual fixes
has_null_values = data['Extracted_Month'].isna().any()
print("Has null values." if has_null_values else "Does not have null values.")

Does not have null values.


## 2. Type

In [212]:
# Report whether any entry in the 'Type' column is missing
has_null_values = data['Type'].isna().any()
print("Has null values." if has_null_values else "Does not have null values.")

Does not have null values.


## 2. Country

In [213]:
# Report whether any entry in the 'Country' column is missing
has_null_values = data['Country'].isna().any()
print("Has null values." if has_null_values else "Does not have null values.")

Does not have null values.


## 3. Activity

In [214]:
# Report whether any entry in the 'Activity' column is missing
has_null_values = data['Activity'].isna().any()
print("Has null values." if has_null_values else "Does not have null values.")

Has null values.


In [215]:
# Replace the null values in the "Activity" column with "canoe".
# The result is assigned back to the column rather than calling
# fillna(inplace=True) on the selected column: the in-place form operates on
# an intermediate object and does not update `data` when pandas
# Copy-on-Write is enabled (and emits a warning on recent pandas 2.x).
data['Activity'] = data['Activity'].fillna('canoe')

# Verifying the changes on two specific rows
data.loc[[2510, 5558], 'Activity']


2510    canoe
5558    canoe
Name: Activity, dtype: object

In [216]:
# Re-check 'Activity' for missing entries after filling
has_null_values = data['Activity'].isna().any()
print("Has null values." if has_null_values else "Does not have null values.")

Does not have null values.


## 4. Sex

In [217]:
# Report whether any entry in the 'Sex_' column is missing
has_null_values = data['Sex_'].isna().any()
print("Has null values." if has_null_values else "Does not have null values.")

Does not have null values.


In [218]:
# Store 'Sex_' as a categorical column restricted to these three labels
sex_categories = ['Male', 'Female', 'Unknown']
data['Sex_'] = pd.Categorical(data['Sex_'], categories=sex_categories)
data


Unnamed: 0,Case_Number,Date,Year,Extracted_Month,Type,Country,Area,Location,Activity,Name,...,Time,Species_,Investigator_or_Source,pdf,href_formula,href,original_order,Injury_Category,Species_Grouped,Species_Cleaned
0,2018.06.25,25-Jun-2018,2018,6.0,Boating,usa,California,"Oceanside, San Diego County",surfing,Julie Wolfe,...,18h00,white shark,"R. Collier, GSAF",2018.06.25-Wolfe.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,6303.0,No Injury,white shark,White shark
1,2018.06.18,18-Jun-2018,2018,6.0,Unprovoked,usa,Georgia,"St. Simon Island, Glynn County",bathing,Adyson McNeely,...,14h00 -15h00,unknown,"K.McMurray, TrackingSharks.com",2018.06.18-McNeely.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,6302.0,Minor Injury,unknown,Unknown
2,2018.06.09,09-Jun-2018,2018,6.0,Invalid,usa,Hawaii,"Habush, Oahu",surfing,John Denges,...,07h45,unknown,"K.McMurray, TrackingSharks.com",2018.06.09-Denges.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,6301.0,Other non-fatal attacks,unknown,Unknown
3,2018.06.08,08-Jun-2018,2018,6.0,Unprovoked,Australia,New South Wales,Arrawarra Headland,surfing,male,...,,2 m shark,"B. Myatt, GSAF",2018.06.08-Arrawarra.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,6300.0,Minor Injury,2 m shark,Unidentified Shark
4,2018.06.04,04-Jun-2018,2018,6.0,Provoked,Mexico,Colima,La Ticla,diving,Gustavo Ramos,...,,"tiger shark, 3m",A .Kipper,2018.06.04-Ramos.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,6299.0,Laceration,"tiger shark, 3m",Tiger shark
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5557,1900.07.14,14-Jul-1900,1900,7.0,Invalid,usa,Hawaii,"Makapu'u Point, O'ahu",hunting seashells,Emil Uhlbrecht & unidentified person,...,,questionable,"Los Angeles Times, 7/28/1900",1900.07.14-Uhlbrecht.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,744.0,Other non-fatal attacks,questionable,Unknown
5558,1900.07.00,Late Jul-1900,1900,7.0,Provoked,usa,Connecticut,"Bridgeport, Fairfield County",canoe,"skiff with Dr. William T. Healey, Dr. Henry Ca...",...,,unknown,"Times, 8/1/1900",1900.07.00-Bridgeport.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,743.0,No Injury,unknown,Unknown
5559,1900.01.28,28-Jan-1900,1900,1.0,Unprovoked,Australia,New South Wales,"Lane Cove River, Sydney Harbor (Estuary)",bathing,Charles Duck,...,12h00,unknown,"Poverty Bay Herald, 2/12/1900",1900.01.28-Duck.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,742.0,Bitten,unknown,Unknown
5560,1900.00.00.,Early 1900s,1900,0.0,Unprovoked,usa,Hawaii,"Inter-Island Dry Dock at Kakaako Street, Honol...",unknown,Emil A. Berndt,...,,unknown,"G. H. Balazs; J. Borg, p.69; L. Taylor (1993),...",1900.00.00.b-Berndt.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,741.0,Other non-fatal attacks,unknown,Unknown


## 5. Age

In [219]:
# Report whether any entry in the 'Age' column is missing
has_null_values = data['Age'].isna().any()
print("Has null values." if has_null_values else "Does not have null values.")

Has null values.


In [220]:
# Step 1: Replace null values in the "Age" column with the sentinel -1.
# Step 2: Convert the column to integer.
# Both steps are chained and assigned back to the column; calling
# fillna(inplace=True) on the selected column would act on an intermediate
# object and leave `data` unchanged when pandas Copy-on-Write is enabled.
data['Age'] = data['Age'].fillna(-1).astype(int)

# Verifying the changes: the resulting dtype is shown as the cell output
age_data_type_updated = data['Age'].dtype
age_data_type_updated

dtype('int64')

In [221]:
# Second check: confirm the "Age" column no longer has null values.
# (This cell previously inspected 'Activity', so the Age fill was never verified.)
has_null_values = data['Age'].isnull().any()
if has_null_values:
    print("Has null values.")
else:
    print("Does not have null values.")

Does not have null values.


## 6. Injury Category

In [222]:
# Report whether any entry in the 'Injury_Category' column is missing
has_null_values = data['Injury_Category'].isna().any()
print("Has null values." if has_null_values else "Does not have null values.")

Does not have null values.


## 7. Fatal_(Y/N)

In [223]:
# Report whether any entry in the 'Fatal_(Y/N)' column is missing
has_null_values = data['Fatal_(Y/N)'].isna().any()
print("Has null values." if has_null_values else "Does not have null values.")

Does not have null values.


In [224]:
# Convert 'Fatal_(Y/N)' to a categorical type with 'Yes' and 'No' as the only
# categories. The copy is created here so the cell runs on a fresh kernel;
# previously `filtered_data` was never defined in the notebook (NameError).
filtered_data = data.copy()
filtered_data['Fatal_(Y/N)'] = pd.Categorical(filtered_data['Fatal_(Y/N)'], categories=['Yes', 'No'])

# NOTE: any value other than 'Yes' or 'No' becomes NaN in the categorical
# column, so this step can introduce missing values (see the output below).
data = filtered_data
data['Fatal_(Y/N)'].unique()

['No', 'Yes', NaN]
Categories (2, object): ['Yes', 'No']

## 8. Species Cleaned

In [227]:
# Report whether any entry in the 'Species_Cleaned' column is missing
has_null_values = data['Species_Cleaned'].isna().any()
print("Has null values." if has_null_values else "Does not have null values.")

Does not have null values.


## 9. Original Order

In [228]:
# Report whether any entry in the 'original_order' column is missing
has_null_values = data['original_order'].isna().any()
print("Has null values." if has_null_values else "Does not have null values.")

Does not have null values.


In [229]:
# Count the distinct values in 'original_order'; compare the result with
# len(data) to confirm the column is unique per row before relying on it
# as an identifier in graphics.
original_order_column = data['original_order']
original_order_column.nunique()

5562

In [231]:
# Destination of the cleaned dataset
csv_file_path = "/Users/mairagutierrez/Documents/Ironhack/PROJECTS/project--I/data/cleaning.csv"

# Write the cleaned DataFrame to disk without the index column
data.to_csv(path_or_buf=csv_file_path, index=False)