## SHARK ATTACK

### Importamos la base de datos y verificamos los datos

In [1]:
import pandas as pd
import numpy as np

# Cargar el archivo de Excel usando Pandas
url = 'https://www.sharkattackfile.net/spreadsheets/GSAF5.xls'
df = pd.read_excel(url)

In [2]:
# Verificar valores faltantes
print("\nValores faltantes por columna:")
print(df.isnull().sum())



Valores faltantes por columna:
Date                25
Year                27
Type                43
Country             75
State              507
Location           590
Activity           611
Name               245
Sex                604
Age               3019
Injury              60
Unnamed: 11        587
Time              3551
Species           3157
Source              44
pdf                170
href formula       150
href               173
Case Number        171
Case Number.1      172
original order     170
Unnamed: 21       6968
Unnamed: 22       6967
dtype: int64


### Eliminamos columnas sobrantes y valores unicos

In [3]:
# # Eliminar columnas: pdf, href formula, href, Case Number.1, Case Number, original order, Unnamed: 21, Unnamed: 22, Source

df.drop(['pdf', 'href formula', 'href', 'Case Number', 'Case Number.1', 'original order', 'Unnamed: 21', 'Unnamed: 22', 'Source'], axis=1, inplace=True)


In [4]:
# Eliminar columnas con más del 50% de valores faltantes
missing_values_threshold = 0.5
df = df.loc[:, df.isnull().mean() < missing_values_threshold]

In [5]:
# Renombrar columna unnamed: 11 por Died
if "Unnamed: 11" in df.columns:
    df.rename(columns={"Unnamed: 11": "Died"}, inplace=True)

In [6]:
# Eliminar filas con más de 5 valores faltantes
df = df.dropna(thresh=len(df.columns) - 5)

In [7]:
# Eliminamos los valores numericos de Sex
df = df[df['Sex'].str.isdigit() == False]

In [8]:
# Modificar Sex los valores nan, Unknown, ., por NAN
df['Sex'] = df['Sex'].replace(['nan', 'Unknown', '.'], 'NaN')

# Reemplazar 'M x 2' por 'M'
df['Sex'] = df['Sex'].replace({'M x 2': 'M'})

# Reemplazar 'lli' por 'M'
df['Sex'] = df['Sex'].replace({'lli': 'M'})

# Reemplazar ' M' por 'M'
df['Sex'] = df['Sex'].replace({' M': 'M'})

# Reemplazar 'M ' por 'M'
df['Sex'] = df['Sex'].replace({'M ': 'M'})

# Reemplazar 'N' por 'M'
df['Sex'] = df['Sex'].replace({'N': 'M'})

# Reemplazar celdas vacias por NAN
df['Sex'] = df['Sex'].replace({''}, 'NaN')






In [9]:
# Verificar valores unicos y visualizarlos
print("\nValores unicos:")
print(df['Sex'].unique())
print("\nValores faltantes:")
print(df['Sex'].isnull().sum())
print("\nValores faltantes por columna:")
print(df.isnull().sum())




Valores unicos:
['F' 'M' 'NaN']

Valores faltantes:
0

Valores faltantes por columna:
Date           0
Year           2
Type          18
Country       41
State        411
Location     488
Activity     423
Name          67
Sex            0
Age         2445
Injury        17
Died         478
Species     2870
dtype: int64


In [10]:
# Verificar valores unicos en la columna 'Date' y visualizarlos
print("\nValores unicos:")
print(df['Date'].unique())
print("\nValores faltantes:")
print(df['Date'].isnull().sum())
print("\nValores faltantes por columna:")
print(df.isnull().sum())


Valores unicos:
['15 Mar 2024' '04 Mar 2024' '02 Mar-2024' ... '1900-1905' '1883-1889'
 '1845-1853']

Valores faltantes:
0

Valores faltantes por columna:
Date           0
Year           2
Type          18
Country       41
State        411
Location     488
Activity     423
Name          67
Sex            0
Age         2445
Injury        17
Died         478
Species     2870
dtype: int64


In [11]:
# Verificar valores unicos y visualizarlos
print("\nValores unicos:")
print(df['Died'].unique())
print("\nValores faltantes:")
print(df['Died'].isnull().sum())
print("\nValores faltantes por columna:")
print(df.isnull().sum())


Valores unicos:
[nan 'N' 'Y' 'M' 'F' 'n' 'Nq' 'UNKNOWN' 2017 'Y x 2' ' N' 'y']

Valores faltantes:
478

Valores faltantes por columna:
Date           0
Year           2
Type          18
Country       41
State        411
Location     488
Activity     423
Name          67
Sex            0
Age         2445
Injury        17
Died         478
Species     2870
dtype: int64


In [12]:
# Modificar Died los valores nan, UNKNOWN, M, F, Nq, 2017 , por NAN
df['Died'] = df['Died'].replace(['nan', ' nan', 'UNKNOWN', 'M', 'F', 'Nq', '2017', '', ' 2017', '2017 '], 'NaN')

# Reemplazar n, ' N' por 'N'
df['Died'] = df['Died'].replace({'n': 'N', ' N': 'N'})

# Reemplazar 'Y x 2', 'y'por 'Y'
df['Died'] = df['Died'].replace({'Y x 2': 'Y', 'y': 'Y'})

#Reemplazar 2017 por NAN
df['Died'] = df['Died'].replace({'2017': 'NaN'})

# Reemplazar celdas vacias por NAN
df['Died'] = df['Died'].replace({''}, 'NaN')

# Reemplazar celdas NAN por NAN
df['Died'] = df['Died'].replace({'NaN': 'NaN'})




In [13]:
# Verificar valores unicos y visualizarlos
print("\nValores unicos:")
print(df['Type'].unique())
print("\nValores faltantes:")
print(df['Type'].isnull().sum())
print("\nValores faltantes por columna:")
print(df.isnull().sum())


Valores unicos:
['Unprovoked' ' Provoked' 'Provoked' 'Questionable' 'Watercraft'
 'Sea Disaster' nan '?' 'Unverified' 'Invalid' 'Under investigation']

Valores faltantes:
18

Valores faltantes por columna:
Date           0
Year           2
Type          18
Country       41
State        411
Location     488
Activity     423
Name          67
Sex            0
Age         2445
Injury        17
Died         478
Species     2870
dtype: int64


In [14]:
# Modificar Type los valores nan, '?', Invalid por NAN
df['Type'] = df['Type'].replace(['nan', '?', 'Invalid'], 'NaN')  


#Reemplazar ' Provoked' por 'Provoked'
df['Type'] = df['Type'].replace({' Provoked': 'Provoked'})

# Reemplazar celdas vacias por NAN
df['Type'] = df['Type'].replace({''}, 'NaN')

# Reemplazar minusculas por mayusculas
df['Type'] = df['Type'].str.upper()

# Reemplazar celdas vacias por NAN
df['Type'] = df['Type'].replace({''}, 'NaN')


In [15]:
# Verificar valores unicos y visualizarlos
print("\nValores unicos:")
print(df['Country'].unique())
print("\nValores faltantes:")
print(df['Country'].isnull().sum())
print("\nValores faltantes por columna:")
print(df.isnull().sum())


Valores unicos:
['AUSTRALIA' 'USA' 'INDIA' 'TRINIDAD' 'BAHAMAS' 'SOUTH AFRICA' 'MEXICO'
 'NEW ZEALAND' 'EGYPT' 'Mexico' 'BELIZE' 'PHILIPPINES' 'Coral Sea' 'SPAIN'
 'PORTUGAL' 'SAMOA' 'COLOMBIA' 'ECUADOR' 'FRENCH POLYNESIA'
 'NEW CALEDONIA' 'TURKS and CaICOS' 'CUBA' 'BRAZIL' 'SEYCHELLES'
 'ARGENTINA' 'FIJI' 'MeXICO' 'Maldives' 'South Africa' 'ENGLAND' 'JAPAN'
 'INDONESIA' 'JAMAICA' 'MALDIVES' 'THAILAND' 'COLUMBIA' 'COSTA RICA'
 'New Zealand' 'British Overseas Territory' 'CANADA' 'JORDAN'
 'ST KITTS / NEVIS' 'ST MARTIN' 'PAPUA NEW GUINEA' 'REUNION ISLAND'
 'ISRAEL' 'CHINA' 'IRELAND' 'ITALY' 'MALAYSIA' nan 'MAURITIUS'
 'SOLOMON ISLANDS' 'ST HELENA, British overseas territory' 'REUNION'
 'UNITED KINGDOM' 'UNITED ARAB EMIRATES' 'CAPE VERDE' 'Fiji'
 'DOMINICAN REPUBLIC' 'CAYMAN ISLANDS' 'ARUBA' 'MOZAMBIQUE' 'PUERTO RICO'
 'ST. MARTIN' 'FRANCE' 'TRINIDAD & TOBAGO' 'KIRIBATI' 'DIEGO GARCIA'
 'TAIWAN' 'PALESTINIAN TERRITORIES' 'GUAM' 'NIGERIA' 'TONGA' 'SCOTLAND'
 'SAUDI ARABIA' 'CHILE' 'ANTIGU

In [16]:
# Reemplazar minisculas por mayusculas
df['Country'] = df['Country'].str.upper()

# Eliminar espacios en las celdas
df['Country'] = df['Country'].str.strip()

In [17]:
# Verificar valores unicos y visualizarlos
print("\nValores unicos:")
print(df['State'].unique())
print("\nValores faltantes:")
print(df['State'].isnull().sum())
print("\nValores faltantes por columna:")
print(df.isnull().sum())


Valores unicos:
['Queensland' 'Hawaii' 'Western Australia' 'Maharashtra' nan
 'New  South Wales' 'Paradise Island' 'South Australia'
 'Eastern Cape Province' 'Sonora' 'New South Wales' 'South Island'
 'Guerrero' 'New Providence   Isoad' 'Jalisco' 'Grand  Bahama Island'
 'Florida' 'Westerm Australia' 'North Island' 'Southern Red Sea'
 'Quintana Roo' 'Belize District' 'California' 'South Carolina'
 'South Sinai' 'Bolinao' 'Valencia' 'Norte' "Savai'i island" 'New York'
 'San Andrés Island' 'Galapagos Islands' 'Red Sea Protectorate' 'Freeport'
 'Tahiti' 'Poum' 'Providenciales Island' 'Lucayan Lucayan Archipelago'
 'New Jersey' 'Mayabeque' 'Easten Cape Province' 'Artemisa' 'Pernambuco'
 'Praslin Island' 'South Province' 'Patagonia' 'Taveuni Island'
 'Gulf of California' 'Vaavu Atoll' 'KNZ' 'New South ales' 'Samoa' 'Texas'
 'Louisiana' 'Mississippi' 'Western  Australia' 'Victoria'
 'Western Cape Province' 'Cornwall' 'Noirth Carolina' 'Okinawa' 'Bali'
 'Hurghada, Red Sea Governorate' 'Amberg

In [18]:
# Reemplazar minisculas por mayusculas
df['State'] = df['State'].str.upper()

# Eliminar espacios en las celdas
df['State'] = df['State'].str.strip()

# Reemplazar celdas vacias por NAN
df['State'] = df['State'].replace({''}, 'NaN')

In [19]:
# Verificar valores unicos y visualizarlos
print("\nValores unicos:")
print(df['Location'].unique())
print("\nValores faltantes:")
print(df['Location'].isnull().sum())
print("\nValores faltantes por columna:")
print(df.isnull().sum())


Valores unicos:
['Bargara Beach' "Old Man's, Waikiki" 'Rainbows, Oahu' ...
 'Ocracoke Inlet' 'Panama Bay 8ºN, 79ºW'
 'Below the English fort, Trincomalee']

Valores faltantes:
488

Valores faltantes por columna:
Date           0
Year           2
Type          18
Country       41
State        411
Location     488
Activity     423
Name          67
Sex            0
Age         2445
Injury        17
Died         478
Species     2870
dtype: int64


In [20]:
 # Reemplazar minisculas por mayusculas
df['Location'] = df['Location'].str.upper()

# Eliminar espacios en las celdas
df['Location'] = df['Location'].str.strip()

# Reemplazar celdas vacias por NAN
df['Location'] = df['Location'].replace({''}, 'NaN')


In [21]:
# Verificar valores unicos y visualizarlos
print("\nValores unicos:")
print(df['Activity'].unique())
print("\nValores faltantes:")
print(df['Activity'].isnull().sum())
print("\nValores faltantes por columna:")
print(df.isnull().sum())


Valores unicos:
['Swimming' 'Surfing' nan ... 'Swimming around anchored ship'
 'Crew swimming alongside their anchored ship' '4 men were bathing']

Valores faltantes:
423

Valores faltantes por columna:
Date           0
Year           2
Type          18
Country       41
State        411
Location     488
Activity     423
Name          67
Sex            0
Age         2445
Injury        17
Died         478
Species     2870
dtype: int64


In [22]:
# Reemplazar minisculas por mayusculas
df['Activity'] = df['Activity'].str.upper()

# Eliminar espacios en las celdas
df['Activity'] = df['Activity'].str.strip()

# Reemplazar celdas vacias por NAN
df['Activity'] = df['Activity'].replace({''}, 'NaN')

# Reemplazar los valores que no sean 'SURFING', 'SWIMMING' o 'FISHING' por NaN
valid_activities = ['SURFING', 'SWIMMING', 'FISHING']
df['Activity'] = df['Activity'].apply(lambda x: x if x in valid_activities else np.nan)

In [33]:
# Verificar valores unicos y visualizarlos
print("\nValores unicos:")
print(df['Age'].unique())
print("\nValores faltantes:")
print(df['Age'].isnull().sum())
print("\nValores faltantes por columna:")
print(df.isnull().sum())


Valores unicos:
[13. nan 11. 46. 32. 29. 10. 64. 62. 39. 22. 15. 16. 21. 66. 52. 20. 44.
 26. 47. 55. 30. 59.  8. 50. 45. 34. 38. 37. 19. 35. 65. 77. 60. 49. 42.
 24. 73. 25. 18. 58. 67. 17. 14.  6. 41. 53. 68. 43. 40. 51. 31. 23. 12.
  9. 36. 63. 71. 48. 70. 57.  7. 28. 33. 61. 74. 27.  3. 56.  5. 54. 86.
 69. 87.  4. 81. 78.  1.]

Valores faltantes:
4424

Valores faltantes por columna:
Date           0
Year           0
Type          16
Country       38
State        386
Location     453
Activity    3742
Name          66
Sex            0
Age         4424
Injury        15
Died         475
Species     2775
dtype: int64


In [32]:
# Eliminar espacios en las celdas
df['Age'] = df['Age'].str.strip()

# Reemplazar celdas vacias por NAN
df['Age'] = df['Age'].replace({''}, 'NaN')

# Convertir la columna 'Age' a numérica, con valores no numéricos convertidos a NaN
df['Age'] = pd.to_numeric(df['Age'], errors='coerce')



In [40]:
# Verificar valores unicos y visualizarlos
print("\nValores unicos:")
print(df['Injury'].unique())
print("\nValores faltantes:")
print(df['Injury'].isnull().sum())
print("\nValores faltantes por columna:")
print(df.isnull().sum())


Valores unicos:
['MINOR INJURIES TO BACK, ABDOMEN AND LEGS'
 'NO INJURY, SHARK BIT SURFBOARD' 'LACERATIONS TO LEFT FOOT' ...
 'RIGHT LEG SEVERED AT KNEE.  IN 1796 HE BECAME LORD MAYOR OF LONDON. IN 1778 HE COMMISSIONED  AMERICAN ARTIST, JOHN SINGLETON COPLEY, TO PAINT THE INCIDENT: WATSON AND THE SHARK'
 'FATAL,  PARTIAL REMAINS RECOVERED FROM SHARK’S GUT'
 'HAND AND FOOT SEVERELY BITTEN, SURGICALLY AMPUTATED']

Valores faltantes:
15

Valores faltantes por columna:
Date           0
Year           0
Type          16
Country       38
State        386
Location     453
Activity    3742
Name          66
Sex            0
Age         4424
Injury        15
Died         475
Species     2775
dtype: int64


In [39]:
# Eliminar espacios en las celdas
df['Injury'] = df['Injury'].str.strip()

# Reemplazar celdas vacias por NAN
df['Injury'] = df['Injury'].replace({''}, 'NaN')

# Reemplazar minusculas por mayusculas
df['Injury'] = df['Injury'].str.upper()

### ELIMINAMOS DUPLICADOS

In [24]:
df.duplicated().sum()

3

In [25]:
df.duplicated().any()

True

In [26]:
df.drop_duplicates(inplace=True)

In [27]:
# Valores nulos
df.isnull().sum()

Date           0
Year           2
Type          18
Country       41
State        411
Location     488
Activity    3851
Name          67
Sex            0
Age         2442
Injury        17
Died         478
Species     2868
dtype: int64

### Eliminamos las filas de 1642 para atras

In [28]:
# Eliminar las filas de menos del year 1642
df = df[df['Year'] >= 1642]

In [29]:
# Verificar valores unicos y visualizarlos
print("\nValores unicos:")
print(df['Died'].unique())
print("\nValores faltantes:")
print(df['Died'].isnull().sum())
print("\nValores faltantes por columna:")
print(df.isnull().sum())


Valores unicos:
[nan 'N' 'Y' 'NaN' 2017]

Valores faltantes:
475

Valores faltantes por columna:
Date           0
Year           0
Type          16
Country       38
State        386
Location     453
Activity    3742
Name          66
Sex            0
Age         2328
Injury        15
Died         475
Species     2775
dtype: int64
