## SHARK ATTACK

### Importamos la base de datos y verificamos los datos

In [1]:
import pandas as pd
import numpy as np

# Cargar el archivo de Excel usando Pandas
url = 'https://www.sharkattackfile.net/spreadsheets/GSAF5.xls'
df = pd.read_excel(url)

In [2]:
# Verificar valores faltantes
print("\nValores faltantes por columna:")
print(df.isnull().sum())



Valores faltantes por columna:
Date                25
Year                27
Type                43
Country             75
State              507
Location           590
Activity           611
Name               245
Sex                604
Age               3019
Injury              60
Unnamed: 11        587
Time              3551
Species           3157
Source              44
pdf                170
href formula       150
href               173
Case Number        171
Case Number.1      172
original order     170
Unnamed: 21       6968
Unnamed: 22       6967
dtype: int64


### Eliminamos columnas sobrantes y valores unicos

In [3]:
# # Eliminar columnas: pdf, href formula, href, Case Number.1, Case Number, original order, Unnamed: 21, Unnamed: 22, Source

df.drop(['pdf', 'href formula', 'href', 'Case Number', 'Case Number.1', 'original order', 'Unnamed: 21', 'Unnamed: 22', 'Source'], axis=1, inplace=True)


In [4]:
# Eliminar columnas con más del 50% de valores faltantes
missing_values_threshold = 0.5
df = df.loc[:, df.isnull().mean() < missing_values_threshold]

In [5]:
# Renombrar columna unnamed: 11 por Died
if "Unnamed: 11" in df.columns:
    df.rename(columns={"Unnamed: 11": "Died"}, inplace=True)

In [6]:
# Eliminar filas con más de 5 valores faltantes
df = df.dropna(thresh=len(df.columns) - 5)

In [7]:
# Eliminamos los valores numericos de Sex
df = df[df['Sex'].str.isdigit() == False]

In [8]:
# Modificar Sex los valores nan, Unknown, ., por NAN
df['Sex'] = df['Sex'].replace(['nan', 'Unknown', '.'], 'NaN')

# Reemplazar 'M x 2' por 'M'
df['Sex'] = df['Sex'].replace({'M x 2': 'M'})

# Reemplazar 'lli' por 'M'
df['Sex'] = df['Sex'].replace({'lli': 'M'})

# Reemplazar ' M' por 'M'
df['Sex'] = df['Sex'].replace({' M': 'M'})

# Reemplazar 'M ' por 'M'
df['Sex'] = df['Sex'].replace({'M ': 'M'})

# Reemplazar 'N' por 'M'
df['Sex'] = df['Sex'].replace({'N': 'M'})

# Reemplazar celdas vacias por NAN
df['Sex'] = df['Sex'].replace({''}, 'NaN')






In [40]:
# Verificar valores unicos y visualizarlos
print("\nValores unicos:")
print(df['Date'].unique())
print("\nValores faltantes:")
print(df['Date'].isnull().sum())
print("\nValores faltantes por columna:")
print(df.isnull().sum())


Valores unicos:
['15 Mar 2024' '04 Mar 2024' '02 Mar-2024' ... '26-Mar-1703' '1700s'
 'Late 1600s  1728']

Valores faltantes:
0

Valores faltantes por columna:
Date           0
Year           0
Type          16
Country       35
State        356
Location     404
Activity    3556
Name          64
Sex            0
Age         4201
Injury        13
Died         458
Species     2610
dtype: int64


In [10]:
# Eliminamos los valores letras de Date
df = df[df['Date'].str.isdigit() == False]

# Eliminamos Reported de Date
df['Date'] = df['Date'].str.replace('Reported', '')

# Dejamos solo el nombre del mes en Date
df['Date'] = df['Date'].str.replace('([^\s]+\s)', '')

# Reemplazar celdas vacias por NAN
df['Date'] = df['Date'].replace({''}, 'NaN')

In [11]:
# Verificar valores unicos y visualizarlos
print("\nValores unicos:")
print(df['Sex'].unique())
print("\nValores faltantes:")
print(df['Sex'].isnull().sum())
print("\nValores faltantes por columna:")
print(df.isnull().sum())




Valores unicos:
['F' 'M' 'NaN']

Valores faltantes:
0

Valores faltantes por columna:
Date           0
Year           2
Type          18
Country       38
State        381
Location     439
Activity     381
Name          65
Sex            0
Age         2249
Injury        15
Died         460
Species     2703
dtype: int64


In [12]:
# Verificar valores unicos en la columna 'Date' y visualizarlos
print("\nValores unicos:")
print(df['Date'].unique())
print("\nValores faltantes:")
print(df['Date'].isnull().sum())
print("\nValores faltantes por columna:")
print(df.isnull().sum())


Valores unicos:
['15 Mar 2024' '04 Mar 2024' '02 Mar-2024' ... '1900-1905' '1883-1889'
 '1845-1853']

Valores faltantes:
0

Valores faltantes por columna:
Date           0
Year           2
Type          18
Country       38
State        381
Location     439
Activity     381
Name          65
Sex            0
Age         2249
Injury        15
Died         460
Species     2703
dtype: int64


In [13]:
# Verificar valores unicos y visualizarlos
print("\nValores unicos:")
print(df['Died'].unique())
print("\nValores faltantes:")
print(df['Died'].isnull().sum())
print("\nValores faltantes por columna:")
print(df.isnull().sum())


Valores unicos:
[nan 'N' 'Y' 'M' 'F' 'n' 'Nq' 'UNKNOWN' 2017 'Y x 2' ' N' 'y']

Valores faltantes:
460

Valores faltantes por columna:
Date           0
Year           2
Type          18
Country       38
State        381
Location     439
Activity     381
Name          65
Sex            0
Age         2249
Injury        15
Died         460
Species     2703
dtype: int64


In [14]:
# Reemplazar minisculas por mayusculas
df['Died'] = df['Died'].str.upper()

# Modificar Died los valores nan, UNKNOWN, M, F, Nq, 2017 , por NAN
df['Died'] = df['Died'].replace(['nan', ' nan', 'UNKNOWN', 'M', 'F', 'Nq', '2017', '', ' 2017', '2017 '], 'NaN')

# Reemplazar n, ' N' por 'N'
df['Died'] = df['Died'].replace({'n': 'N', ' N': 'N'})

# Reemplazar 'Y x 2', 'y'por 'Y'
df['Died'] = df['Died'].replace({'Y x 2': 'Y', 'y': 'Y'})

#Reemplazar 2017 por NAN
df['Died'] = df['Died'].replace({'2017': 'NaN'})

# Reemplazar celdas vacias por NAN
df['Died'] = df['Died'].replace({''}, 'NaN')

# Reemplazar celdas NAN por NAN
df['Died'] = df['Died'].replace({'NaN': 'NaN'})




In [15]:
# Verificar valores unicos y visualizarlos
print("\nValores unicos:")
print(df['Type'].unique())
print("\nValores faltantes:")
print(df['Type'].isnull().sum())
print("\nValores faltantes por columna:")
print(df.isnull().sum())


Valores unicos:
['Unprovoked' ' Provoked' 'Provoked' 'Questionable' 'Watercraft'
 'Sea Disaster' nan '?' 'Unverified' 'Invalid' 'Under investigation']

Valores faltantes:
18

Valores faltantes por columna:
Date           0
Year           2
Type          18
Country       38
State        381
Location     439
Activity     381
Name          65
Sex            0
Age         2249
Injury        15
Died         461
Species     2703
dtype: int64


In [16]:
# Convertir los valores faltantes en NaN
df['Type'] = df['Type'].replace({''}, 'NaN')

In [17]:
# Visualizar los valores faltantes
print(df[df['Type'].isnull()])

               Date    Year Type       Country           State  \
191     01-Apr-2022  2022.0  NaN  SOUTH AFRICA             KZN   
207     08-Feb-2022  2022.0  NaN    COSTA RICA      Guanacoste   
228     16-Oct-2021  2021.0  NaN     AUSTRALIA      Queensland   
233     10-Sep-2021  2021.0  NaN         EGYPT             NaN   
254     21-Jul-2021  2021.0  NaN           USA         Florida   
342     21-Oct-2020  2020.0  NaN           USA  North Carolina   
347     21-Sep-2020  2020.0  NaN           USA          Hawaii   
351     16-Sep-2020  2020.0  NaN           USA         Florida   
354     08-Sep-2020  2020.0  NaN           USA         Florida   
364     19-Aug-2020  2020.0  NaN           USA  South Carolina   
393     04-Jun-2020  2020.0  NaN           USA        Delaware   
403     29-Apr-2020  2020.0  NaN           USA      California   
697     15-Sep-2017  2017.0  NaN         SAMOA    Upolu Island   
997     27-Jul-2015  2015.0  NaN     AUSTRALIA        Victoria   
5497    11

In [18]:
# Modificar Type los valores nan, '?', Invalid por NAN
df['Type'] = df['Type'].replace(['nan', '?', 'Invalid'], 'NaN')  


#Reemplazar ' Provoked' por 'Provoked'
df['Type'] = df['Type'].replace({' Provoked': 'Provoked'})

# Reemplazar celdas vacias por NAN
df['Type'] = df['Type'].replace({''}, 'NaN')

# Reemplazar minusculas por mayusculas
df['Type'] = df['Type'].str.upper()

# Reemplazar celdas vacias por NAN
df['Type'] = df['Type'].replace({''}, 'NaN')


In [19]:
# Verificar valores unicos y visualizarlos
print("\nValores unicos:")
print(df['Country'].unique())
print("\nValores faltantes:")
print(df['Country'].isnull().sum())
print("\nValores faltantes por columna:")
print(df.isnull().sum())


Valores unicos:
['AUSTRALIA' 'USA' 'INDIA' 'TRINIDAD' 'BAHAMAS' 'SOUTH AFRICA' 'MEXICO'
 'NEW ZEALAND' 'EGYPT' 'Mexico' 'BELIZE' 'PHILIPPINES' 'Coral Sea' 'SPAIN'
 'PORTUGAL' 'SAMOA' 'COLOMBIA' 'ECUADOR' 'FRENCH POLYNESIA'
 'NEW CALEDONIA' 'TURKS and CaICOS' 'CUBA' 'BRAZIL' 'SEYCHELLES'
 'ARGENTINA' 'FIJI' 'MeXICO' 'Maldives' 'South Africa' 'ENGLAND' 'JAPAN'
 'INDONESIA' 'JAMAICA' 'MALDIVES' 'THAILAND' 'COLUMBIA' 'COSTA RICA'
 'New Zealand' 'British Overseas Territory' 'CANADA' 'JORDAN'
 'ST KITTS / NEVIS' 'ST MARTIN' 'PAPUA NEW GUINEA' 'REUNION ISLAND'
 'ISRAEL' 'CHINA' 'IRELAND' 'ITALY' 'MALAYSIA' nan 'MAURITIUS'
 'SOLOMON ISLANDS' 'ST HELENA, British overseas territory' 'REUNION'
 'UNITED KINGDOM' 'UNITED ARAB EMIRATES' 'CAPE VERDE' 'Fiji'
 'DOMINICAN REPUBLIC' 'CAYMAN ISLANDS' 'ARUBA' 'MOZAMBIQUE' 'PUERTO RICO'
 'ST. MARTIN' 'FRANCE' 'TRINIDAD & TOBAGO' 'KIRIBATI' 'DIEGO GARCIA'
 'TAIWAN' 'PALESTINIAN TERRITORIES' 'GUAM' 'NIGERIA' 'TONGA' 'SCOTLAND'
 'SAUDI ARABIA' 'CHILE' 'ANTIGU

In [20]:
# Reemplazar minisculas por mayusculas
df['Country'] = df['Country'].str.upper()

# Eliminar espacios en las celdas
df['Country'] = df['Country'].str.strip()

In [21]:
# Verificar valores unicos y visualizarlos
print("\nValores unicos:")
print(df['State'].unique())
print("\nValores faltantes:")
print(df['State'].isnull().sum())
print("\nValores faltantes por columna:")
print(df.isnull().sum())


Valores unicos:
['Queensland' 'Hawaii' 'Western Australia' 'Maharashtra' nan
 'New  South Wales' 'Paradise Island' 'South Australia'
 'Eastern Cape Province' 'Sonora' 'New South Wales' 'South Island'
 'Guerrero' 'New Providence   Isoad' 'Jalisco' 'Grand  Bahama Island'
 'Florida' 'Westerm Australia' 'North Island' 'Southern Red Sea'
 'Quintana Roo' 'Belize District' 'California' 'South Carolina'
 'South Sinai' 'Bolinao' 'Valencia' 'Norte' "Savai'i island" 'New York'
 'San Andrés Island' 'Galapagos Islands' 'Red Sea Protectorate' 'Freeport'
 'Tahiti' 'Poum' 'Providenciales Island' 'Lucayan Lucayan Archipelago'
 'New Jersey' 'Mayabeque' 'Easten Cape Province' 'Artemisa' 'Pernambuco'
 'Praslin Island' 'South Province' 'Patagonia' 'Taveuni Island'
 'Gulf of California' 'Vaavu Atoll' 'KNZ' 'New South ales' 'Samoa' 'Texas'
 'Louisiana' 'Mississippi' 'Western  Australia' 'Victoria'
 'Western Cape Province' 'Cornwall' 'Noirth Carolina' 'Okinawa' 'Bali'
 'Hurghada, Red Sea Governorate' 'Amberg

In [22]:
# Reemplazar minisculas por mayusculas
df['State'] = df['State'].str.upper()

# Eliminar espacios en las celdas
df['State'] = df['State'].str.strip()

# Reemplazar celdas vacias por NAN
df['State'] = df['State'].replace({''}, 'NaN')

In [23]:
# Verificar valores unicos y visualizarlos
print("\nValores unicos:")
print(df['Location'].unique())
print("\nValores faltantes:")
print(df['Location'].isnull().sum())
print("\nValores faltantes por columna:")
print(df.isnull().sum())


Valores unicos:
['Bargara Beach' "Old Man's, Waikiki" 'Rainbows, Oahu' ...
 'Ocracoke Inlet' 'Panama Bay 8ºN, 79ºW'
 'Below the English fort, Trincomalee']

Valores faltantes:
439

Valores faltantes por columna:
Date           0
Year           2
Type          18
Country       38
State        381
Location     439
Activity     381
Name          65
Sex            0
Age         2249
Injury        15
Died         461
Species     2703
dtype: int64


In [24]:
 # Reemplazar minisculas por mayusculas
df['Location'] = df['Location'].str.upper()

# Eliminar espacios en las celdas
df['Location'] = df['Location'].str.strip()

# Reemplazar celdas vacias por NAN
df['Location'] = df['Location'].replace({''}, 'NaN')


In [25]:
# Verificar valores unicos y visualizarlos
print("\nValores unicos:")
print(df['Activity'].unique())
print("\nValores faltantes:")
print(df['Activity'].isnull().sum())
print("\nValores faltantes por columna:")
print(df.isnull().sum())


Valores unicos:
['Swimming' 'Surfing' nan ... 'Swimming around anchored ship'
 'Crew swimming alongside their anchored ship' '4 men were bathing']

Valores faltantes:
381

Valores faltantes por columna:
Date           0
Year           2
Type          18
Country       38
State        381
Location     439
Activity     381
Name          65
Sex            0
Age         2249
Injury        15
Died         461
Species     2703
dtype: int64


In [26]:
# Reemplazar minisculas por mayusculas
df['Activity'] = df['Activity'].str.upper()

# Eliminar espacios en las celdas
df['Activity'] = df['Activity'].str.strip()

# Reemplazar celdas vacias por NAN
df['Activity'] = df['Activity'].replace({''}, 'NaN')

# Reemplazar los valores que no sean 'SURFING', 'SWIMMING' o 'FISHING' por NaN
valid_activities = ['SURFING', 'SWIMMING', 'FISHING']
df['Activity'] = df['Activity'].apply(lambda x: x if x in valid_activities else np.nan)

In [27]:
# Verificar valores unicos y visualizarlos
print("\nValores unicos:")
print(df['Age'].unique())
print("\nValores faltantes:")
print(df['Age'].isnull().sum())
print("\nValores faltantes por columna:")
print(df.isnull().sum())


Valores unicos:
['13' nan '11' '46' '32' '29' '10' '64' '62' '39' '22' '15' '16' '21' '66'
 '52' '20' '44' '26' '47' '55' '30' '59' '8' '50' '45' '34' '38' '30s'
 '37' '19' '20/30' '35' '65' '20s' '77' '60' 38 '49' '42' '!2' '24' '73'
 '25' '18' '50s' '58' '67' '17' '14' '6' '41' '53' '68' '43' '40' '51'
 '31' 39 26 58 51 14 17 10 13 33 16 40 49 41 60 28 '40s' 68 35 62 'teen'
 20 8 22 32 56 'Teen' 12 21 42 36 18 37 50 'M' 9 24 15 11 27 57 34 25 '!6'
 31 64 '!!' 47 55 19 7 71 48 59 53 54 75 '45 and 15' 46 61 73 52 29 30 70
 23 4 63 45 44 '28 & 22' '60s' "20's" 43 65 67 74 '9 & 60' 'a minor' 6 69
 3 82 66 72 '23' '12' '9' '36' '63' '71' '48' '70' '18 months' '57' '7'
 '28' '33' '61' '74' '27' '3' '56' '28 & 26' '5' '54' '86' '18 or 20'
 '12 or 13' '46 & 34' '28, 23 & 30' 'Teens' 77 '8 or 10' 84 '\xa0 ' ' '
 '30 or 36' '6½' '21 & ?' '33 or 37' 'mid-30s' '23 & 20' 5 ' 30'
 '7      &    31' ' 28' '20?' "60's" '69' '32 & 30' '16 to 18' '87'
 'Elderly' 'mid-20s' 'Ca. 33' '74 ' '45 ' '21 or 2

In [28]:
# Eliminar espacios en las celdas
df['Age'] = df['Age'].str.strip()

# Reemplazar celdas vacias por NAN
df['Age'] = df['Age'].replace({''}, 'NaN')

# Convertir la columna 'Age' a numérica, con valores no numéricos convertidos a NaN
df['Age'] = pd.to_numeric(df['Age'], errors='coerce')



In [29]:
# Verificar valores unicos y visualizarlos
print("\nValores unicos:")
print(df['Injury'].unique())
print("\nValores faltantes:")
print(df['Injury'].isnull().sum())
print("\nValores faltantes por columna:")
print(df.isnull().sum())


Valores unicos:
['Minor injuries to back, abdomen and legs'
 'No injury, shark bit surfboard' 'Lacerations to left foot' ...
 'FATAL, leg stripped of flesh  '
 'FATAL, knocked overboard by tail of shark & carried off by shark '
 'FATAL. "Shark bit him in half, carrying away the lower extremities" ']

Valores faltantes:
15

Valores faltantes por columna:
Date           0
Year           2
Type          18
Country       38
State        381
Location     439
Activity    3666
Name          65
Sex            0
Age         4325
Injury        15
Died         461
Species     2703
dtype: int64


In [30]:
# Eliminar espacios en las celdas
df['Injury'] = df['Injury'].str.strip()

# Reemplazar celdas vacias por NAN
df['Injury'] = df['Injury'].replace({''}, 'NaN')

# Reemplazar minusculas por mayusculas
df['Injury'] = df['Injury'].str.upper()

In [31]:
# Verificar valores unicos y visualizarlos
print("\nValores unicos:")
print(df['Species '].unique())
print("\nValores faltantes:")
print(df['Species '].isnull().sum())
print("\nValores faltantes por columna:")
print(df.isnull().sum())


Valores unicos:
['Tiger shark' "Tiger shark 8'" "3' to 4' shark" ... 'Blue pointer'
 'Blue pointers'
 'Said to involve a grey nurse shark that leapt out of the water and  seized the boy but species identification is questionable']

Valores faltantes:
2703

Valores faltantes por columna:
Date           0
Year           2
Type          18
Country       38
State        381
Location     439
Activity    3666
Name          65
Sex            0
Age         4325
Injury        15
Died         461
Species     2703
dtype: int64


In [32]:
# Reemplazar minisculas por mayusculas
df['Species '] = df['Species '].str.upper()

# Reemplazar celdas vacias por NAN
df['Species '] = df['Species '].replace({''}, 'NaN')

In [33]:
# Valores unicos
print("\nValores unicos:")
print(df['Species '].unique())
print("\nValores faltantes:")
print(df['Species '].isnull().sum())
print("\nValores faltantes por columna:")
print(df.isnull().sum())


Valores unicos:
['TIGER SHARK' "TIGER SHARK 8'" "3' TO 4' SHARK" ... 'BLUE POINTER'
 'BLUE POINTERS'
 'SAID TO INVOLVE A GREY NURSE SHARK THAT LEAPT OUT OF THE WATER AND  SEIZED THE BOY BUT SPECIES IDENTIFICATION IS QUESTIONABLE']

Valores faltantes:
2703

Valores faltantes por columna:
Date           0
Year           2
Type          18
Country       38
State        381
Location     439
Activity    3666
Name          65
Sex            0
Age         4325
Injury        15
Died         461
Species     2703
dtype: int64


### ELIMINAMOS DUPLICADOS

In [34]:
df.duplicated().sum()

2

In [35]:
df.duplicated().any()

True

In [36]:
df.drop_duplicates(inplace=True)

In [37]:
# Valores nulos
df.isnull().sum()

Date           0
Year           2
Type          18
Country       38
State        381
Location     439
Activity    3664
Name          65
Sex            0
Age         4323
Injury        15
Died         461
Species     2702
dtype: int64

### Eliminamos las filas de 1642 para atras

In [41]:
# Eliminar las filas de menos del year 1700
df = df[df['Year'] >= 1700]

In [39]:
# Verificar valores unicos y visualizarlos
print("\nValores unicos:")
print(df['Died'].unique())
print("\nValores faltantes:")
print(df['Died'].isnull().sum())
print("\nValores faltantes por columna:")
print(df.isnull().sum())


Valores unicos:
[nan 'N' 'Y' 'NaN' 'NQ' 'Y X 2']

Valores faltantes:
458

Valores faltantes por columna:
Date           0
Year           0
Type          16
Country       35
State        356
Location     404
Activity    3556
Name          64
Sex            0
Age         4201
Injury        13
Died         458
Species     2610
dtype: int64


In [42]:
# Creamos una nueva columna llamada Month
df['Month'] = pd.DatetimeIndex(df['Date']).month

DateParseError: Unknown datetime string format, unable to parse: 09 Sep- 2023, at position 42