In [None]:
import pandas as pd

df.head()

In [None]:
df = pd.read_excel('GSAF5.xls')

In [None]:
# Summarise the columns: dtypes and non-null counts.
df.info()

In [None]:
# Peek at the last rows of the "Type" column.
df["Type"].tail()

In [None]:
# Count missing values in the "Injury" column.
df["Injury"].isnull().sum()

In [None]:
# Count missing values in the "Time" column.
df["Time"].isnull().sum()

In [None]:
# Count missing values in the "Date" column. The call parentheses were
# missing, so the cell only displayed the bound method, not a result.
df["Date"].isnull().sum()

In [None]:
import pandas as pd
import re

def clean_time(value):
    """Normalise a raw ``Time`` entry to ``"HH:MM"`` or a titled phrase.

    Parameters
    ----------
    value : object
        Raw cell value; it is converted with ``str`` before parsing.

    Returns
    -------
    str or None
        * ``"HH:MM"`` (zero-padded, 24-hour) for the first clock time found,
          e.g. "after 1200hr" -> "12:00", "11.30hr" -> "11:30",
          "830" -> "08:30", "15.5" -> "15:00".
        * The title-cased text when the entry contains no digits at all
          (e.g. "morning" -> "Morning").
        * ``None`` for missing or placeholder entries, and for digit groups
          that do not form a valid clock time (hour > 23 or minute > 59).
    """
    if pd.isna(value):
        return None

    value = str(value).strip().lower()

    # Remove useless or unclear values
    if value in {"?", "am", "pm", "unknown", "not stated", "n/a", "na"}:
        return None

    # First clock-like token, scanning left to right. Alternatives, in order:
    #   1. hour + separator + two-digit minute   ("11.30", "09h00", "14:30")
    #   2. compact three/four digit time         ("1200", "830")
    #   3. bare hour                             ("15" in "15.5", "2" in "2 hours")
    # The digit look-arounds stop a longer digit run from being cut in the
    # middle (the old pattern read "830" as hour 83).
    match = re.search(
        r'(?<!\d)(?:(\d{1,2})[h:.](\d{2})(?!\d)'
        r'|(\d{1,2})(\d{2})(?!\d)'
        r'|(\d{1,2})(?!\d))',
        value,
    )

    if match is None:
        # No digits at all: keep known phrases like "Morning", "Afternoon".
        # Digits that do not look like a time are treated as unusable.
        if any(ch.isdigit() for ch in value):
            return None
        return value.title()

    hour = match.group(1) or match.group(3) or match.group(5)
    minute = match.group(2) or match.group(4) or "00"

    # Reject impossible clock values instead of emitting e.g. "25:00".
    if int(hour) > 23 or int(minute) > 59:
        return None

    return f"{hour.zfill(2)}:{minute.zfill(2)}"
from datetime import datetime



In [None]:
from datetime import datetime

def precise_time_to_day_part(value):
    """Map a cleaned time value to a coarse part-of-day label.

    Parameters
    ----------
    value : str or None
        Either an ``"HH:MM"`` string or a descriptive phrase such as
        "Morning" (matched case-insensitively via ``str.title``).

    Returns
    -------
    str or None
        One of "Early Morning" (05-08h), "Morning" (08-12h), "Midday"
        (exactly 12:00), "Early Afternoon" (12-15h), "Late Afternoon"
        (15-17h), "Evening" (17-19h), "Dusk" (19-20h), "Night" (20-24h),
        "Late Night" (00-05h); a recognised descriptive phrase is returned
        title-cased; anything else yields ``None``.
    """
    if value is None:
        return None  # Keep missing as None

    # Known descriptive phrases to keep untouched
    descriptive_parts = {
        "Early Morning", "Morning", "Midday", "Early Afternoon",
        "Late Afternoon", "Afternoon", "Evening", "Dusk",
        "Night", "Late Night",
    }

    if isinstance(value, str) and value.title() in descriptive_parts:
        return value.title()

    try:
        # Try parsing standard time like "14:30"
        parsed = datetime.strptime(value, "%H:%M").time()
    except (TypeError, ValueError):
        # TypeError: non-string input such as NaN; ValueError: text that is
        # not "HH:MM". Only these expected failures map to None, so
        # unrelated errors are no longer silently swallowed.
        return None

    hour, minute = parsed.hour, parsed.minute

    if 5 <= hour < 8:
        return "Early Morning"
    if 8 <= hour < 12:
        return "Morning"
    if hour == 12 and minute == 0:
        return "Midday"
    if 12 <= hour < 15:
        return "Early Afternoon"
    if 15 <= hour < 17:
        return "Late Afternoon"
    if 17 <= hour < 19:
        return "Evening"
    if 19 <= hour < 20:
        return "Dusk"
    if 20 <= hour < 24:
        return "Night"
    return "Late Night"  # 00:00 to before 05:00


In [None]:
# Normalise the raw "Time" values, then bucket them into parts of the day.
df['Cleaned_Time'] = df['Time'].apply(clean_time)
df['Day_Part'] = df['Cleaned_Time'].apply(precise_time_to_day_part)
# Frequency of each day part (value_counts drops missing values by default).
print(df['Day_Part'].value_counts())


In [None]:
# Inspect the first 60 cleaned time values.
df['Cleaned_Time'].head(60)

In [None]:
# Frequency of each day part, including missing values.
print(df['Day_Part'].value_counts(dropna=False))


In [None]:
# Show the 60 least frequent raw "Date" values (value_counts sorts descending).
df["Date"].value_counts().tail(60)

In [None]:
import pandas as pd
import re


def clean_date(date):
    """Parse a raw ``Date`` entry into a pandas ``Timestamp``.

    Qualifier words ("Reported", "Early", "Before", "No date") and any
    character other than letters, digits, ``-``, ``/``, ``:`` and spaces are
    removed first. A handful of explicit formats is then tried before falling
    back to pandas' automatic parser.

    Parameters
    ----------
    date : object
        Raw cell value; it is converted with ``str`` before cleaning.

    Returns
    -------
    pandas.Timestamp or pandas.NaT
        The parsed date, or ``NaT`` when the entry is missing, empty after
        cleaning, or cannot be parsed.
    """
    if pd.isna(date):
        return pd.NaT

    # Remove known unwanted words (IGNORECASE already covers "No Date")
    cleaned_date = re.sub(
        r'\b(Reported|Early|Before|No date)\b', '', str(date), flags=re.IGNORECASE
    )
    cleaned_date = re.sub(r'[^0-9a-zA-Z\-/ :]', '', cleaned_date).strip()

    if not cleaned_date:
        return pd.NaT

    # Try known formats first
    known_formats = ("%d-%b-%Y", "%d %b-%Y", "%Y-%m-%d %H:%M:%S", "%d-%m-%Y", "%Y-%m-%d")
    for fmt in known_formats:
        try:
            return pd.to_datetime(cleaned_date, format=fmt, errors='raise')
        except (ValueError, TypeError, OverflowError):
            # Format mismatch or out-of-range date: try the next format.
            # Only parsing failures are caught, not every exception.
            continue

    # Fall back to automatic parsing (dayfirst off for ISO formats)
    return pd.to_datetime(cleaned_date, errors='coerce', dayfirst=False)
# Parse every raw "Date" entry with clean_date and store the result.
df['Cleaned_Date'] = df['Date'].apply(clean_date)


In [None]:
# Inspect the last 60 parsed dates.
df['Cleaned_Date'].tail(60)

In [None]:
def get_season(date):
    """Return the season label for ``date`` based on its calendar month.

    March-May -> "Spring", June-August -> "Summer",
    September-November -> "Autumn", December-February -> "Winter".
    A missing date (``pd.isna``) yields "No Date".

    NOTE(review): the month-to-season table is applied to every record
    regardless of location - confirm this is intended for records from
    both hemispheres.
    """
    if pd.isna(date):
        return "No Date"

    season_by_month = {
        3: "Spring", 4: "Spring", 5: "Spring",
        6: "Summer", 7: "Summer", 8: "Summer",
        9: "Autumn", 10: "Autumn", 11: "Autumn",
        12: "Winter", 1: "Winter", 2: "Winter",
    }
    # Any month outside 1-12 falls back to the "No Date" label.
    return season_by_month.get(date.month, "No Date")

# Derive a season label from each parsed date
df['Season'] = df['Cleaned_Date'].apply(get_season)

# Show raw date, parsed date and season side by side for a visual check
print(df[['Date', 'Cleaned_Date', 'Season']])

In [None]:
# Number of records per season label.
df['Season'].value_counts()

In [None]:
# print(df["Injury"].value_counts().to_string())


In [None]:
# Frequency of every distinct raw injury description
counts = df["Injury"].value_counts()

# Keep only the descriptions that were seen a single time
single_occurrences = counts.loc[counts.eq(1)]

# Report how many one-off descriptions there are
print(f"Number of unique injuries that appear only once: {len(single_occurrences)}")

In [None]:
# Build a normalised copy of "Injury": lower-cased, stripped, and reduced to
# letters and whitespace so near-identical descriptions group together.
df["Injury_clean"] = (
    df["Injury"]
    .str.lower()
    .str.strip()
    .str.replace(r'[^a-z\s]', '', regex=True)  # Remove non-letter characters
)

# Quick grouping preview
print(df["Injury_clean"].value_counts())

In [None]:
def simplify_injury(text):
    """Collapse a free-text injury description into a coarse category.

    Parameters
    ----------
    text : object
        Raw injury description; missing values are accepted.

    Returns
    -------
    str
        One of "unknown" (missing input), "fatal", "no injury",
        "foot injury", "leg injury", "hand injury" or "other".

    Notes
    -----
    Keywords are matched at the start of a word, so "allegedly" no longer
    counts as a leg injury. Negated forms such as "non-fatal" or "not fatal"
    are not treated as fatal. An explicit "no injury" takes precedence over
    body-part keywords that appear later in the same description.
    """
    if pd.isna(text):
        return "unknown"
    text = str(text).lower()  # str() guards against non-string cells

    # Drop negated forms first so they cannot trigger the "fatal" category.
    without_negation = re.sub(r'\b(?:non|not)[\s-]*fatal', '', text)
    if re.search(r'\bfatal', without_negation):
        return "fatal"
    if "no injury" in text:
        return "no injury"
    if re.search(r'\bfoot', text):
        return "foot injury"
    if re.search(r'\bleg', text):
        return "leg injury"
    if re.search(r'\bhand', text):
        return "hand injury"
    return "other"

# Assign a coarse injury category to every record and show the distribution.
df["Injury_grouped"] = df["Injury"].apply(simplify_injury)
print(df["Injury_grouped"].value_counts())

In [None]:
# Print the full category distribution as plain text (no truncation).
print(df["Injury_grouped"].value_counts().to_string())


In [None]:
                                                 # HIPOLITO PART

# Clean the "Fatal Y/N" column.


# Show the distinct non-null raw values through repr() so stray whitespace is visible.
print(df['Fatal Y/N'].dropna().apply(lambda x: repr(x)).unique())

In [None]:
# Convert to string, strip surrounding whitespace and upper-case
df['Fatal Y/N'] = df['Fatal Y/N'].astype(str).str.strip().str.upper()
# Whitelist of accepted values
valid_values = {'Y': 'Y', 'N': 'N'}
# Keep only whitelisted values; everything else becomes NaN
df['Fatal Y/N'] = df['Fatal Y/N'].map(valid_values)
# Show the resulting distinct values
print(df['Fatal Y/N'].unique())

In [None]:
# Count of each value (Y, N), including missing entries
print(df['Fatal Y/N'].value_counts(dropna=False))

In [None]:
# Inspect the distinct raw values of the 'Country' column
print(df['Country'].unique())

In [None]:
# Strip surrounding whitespace and upper-case every country name
df['Country'] = df['Country'].str.strip().str.upper()
print(df['Country'].unique())


In [None]:
# CLEANING DATA OF COUNTRIES

import numpy as np

# Mapping from upper-cased raw country labels to their standardised label.
country_corrections = {
    # Spelling corrections
    'COLUMBIA': 'COLOMBIA',
    'TRINIDAD & TOBAGO': 'TRINIDAD AND TOBAGO',
    'MALDIVE ISLANDS': 'MALDIVES',
    'UNITED ARAB EMIRATES (UAE)': 'UNITED ARAB EMIRATES',
    'ST. MARTIN': 'ST MARTIN',
    'ST. MAARTIN': 'ST MARTIN',
    'TRINIDAD': 'TRINIDAD AND TOBAGO',

    # Groupings under a single label
    'ENGLAND': 'UK',
    'SCOTLAND': 'UK',
    'UNITED KINGDOM': 'UK',
    'BRITISH ISLES': 'UK',
    'BRITISH WEST INDIES': 'UK',
    'BRITISH VIRGIN ISLANDS': 'UK',

    # Oceans and regions that are not countries
    'PACIFIC OCEAN': 'OTHER',
    'ATLANTIC OCEAN': 'OTHER',
    'INDIAN OCEAN': 'OTHER',
    'SOUTH PACIFIC OCEAN': 'OTHER',
    'CARIBBEAN SEA': 'OTHER',
    'OCEAN': 'OTHER',
    'GULF OF ADEN': 'OTHER',
    'MID-PACIFC OCEAN': 'OTHER',
    'NORTH ATLANTIC OCEAN': 'OTHER',
    'RED SEA': 'OTHER',
    'RED SEA / INDIAN OCEAN': 'OTHER',
    'NORTH PACIFIC OCEAN': 'OTHER',
    'CENTRAL PACIFIC': 'OTHER',

    # Other problematic labels, grouped under 'OTHER' (or de-questioned)
    'DIEGO GARCIA': 'OTHER',
    'JOHNSTON ISLAND': 'OTHER',
    'ADMIRALTY ISLANDS': 'OTHER',
    'MID ATLANTIC OCEAN': 'OTHER',
    'UNKNOWN': 'OTHER',
    'AFRICA': 'OTHER',
    'ASIA?': 'OTHER',
    'SUDAN?': 'SUDAN',
}

In [None]:
# Import the country correction on or columns

def clean_column_country(df, column='Country'):
    """Normalise a country column in place and return the frame.

    Values are stripped of surrounding whitespace and upper-cased, then
    remapped through the module-level ``country_corrections`` dictionary.
    The column of ``df`` is overwritten; the same ``df`` object is returned.
    """
    normalised = df[column].str.strip().str.upper()
    # Exact-match replacement; labels absent from the mapping are kept as is.
    df[column] = normalised.replace(country_corrections)
    return df

# Standardise the 'Country' column with the corrections mapping.
df = clean_column_country(df, column='Country')

# List the distinct non-null country labels after cleaning.
print(sorted(df['Country'].dropna().unique()))

In [None]:
# Top 10 countries by number of recorded shark attacks:
top_10_paises = df['Country'].value_counts().head(10)
print(top_10_paises)

In [None]:
#

In [None]:
# Show the frame's row index.
df.index

In [None]:
# Materialise the row index as a plain Python list (displays every label).
list(df.index)

In [None]:
# The bare expression is not the cell's last statement, so only the
# print() call below produces output.
df.columns
print(df.columns)

In [None]:
# Column names as a plain Python list; only the print() call produces output.
list(df.columns)
print(list(df.columns))

In [None]:
# Select the "Location" column as a Series (bracket access).
df["Location"]

In [None]:
# Select the "Location" column as a Series (attribute access).
df.Location

In [None]:
# Double brackets select a one-column DataFrame instead of a Series.
df[["Location"]]

In [None]:
# Select the "State" column as a Series (bracket access).
df["State"]

In [None]:
# Select the "State" column as a Series (attribute access).
df.State

In [None]:
# NOTE(review): `.all` is referenced without call parentheses, so this cell
# only displays the bound method. Confirm whether `.all()` or a plain
# display of the column was intended.
df[["State"]].all

In [None]:
# Distinct values of the "Location" column.
df.Location.unique()

In [None]:
import numpy as np 

# Drop the coordinate suffix from this one raw location value.
df['Location'] = df['Location'].replace({
    'Panama Bay 8ºN, 79ºW': 'Panama Bay'
}) 
print(df['Location'])

In [None]:
def clean_column_location(df, column='Location'):
    """Normalise a location column in place and return the frame.

    Values are stripped of surrounding whitespace and upper-cased; missing
    values stay missing. The column of ``df`` is overwritten and the same
    ``df`` object is returned.

    The previous version then assigned the bound method ``Series.unique``
    to the column (after using the column as its own replacement mapping),
    which overwrote the data with method objects. That step is removed.
    """
    df[column] = df[column].str.strip().str.upper()
    return df

# Normalise the 'Location' column, then list its distinct non-null values.
df = clean_column_location(df,column='Location')
print(sorted(df['Location'].dropna().unique()))

In [None]:
# Distinct values of the "State" column.
df.State.unique()

In [None]:
# Preview a forward-filled "State" column (result is not stored).
# Series.ffill() replaces the deprecated fillna(method='ffill') spelling.
df['State'].ffill().tail()

In [None]:
import numpy as np



# Mapping from raw "State" labels to a standardised label.
# Each key appears exactly once. The original literal repeated five keys, and
# for one of them ('KwaZulu-Natal between Port Edward and Port St Johns') the
# later duplicate silently overrode 'KwaZulu-Natal' with 'Unknown'. The
# informative value is kept here.
# NOTE: some keys carry leading/trailing spaces exactly as found in the raw
# data; they only match values that have not been stripped.
State = {
    # Spelling / whitespace variants
    'Floria': 'Florida',
    'South Carolina ': 'South Carolina',
    'North Carolina ': 'North Carolina',
    'Noirth Carolina': 'North Carolina',
    'New  South Wales': 'New South Wales',
    'New South ales': 'New South Wales',
    'New South Wales ': 'New South Wales',
    'Baja ': 'Baja California',
    'Westerm Australia': 'Western Australia',
    'Western  Australia': 'Western Australia',
    'Western Cape Province': 'Western Cape',
    'Guerro': 'Guerrero',
    'Guerrrero': 'Guerrero',
    'Guantanamo Province': 'Guantánamo Province',
    'Isla De San Andres': 'San Andrés Island',
    'Maahvah Laamu Atoll': 'Laamu Atoll',
    'Lucayan Lucayan Archipelago': 'Lucayan Archipelago',
    'New Providence   Isoad': 'New Providence Island',
    'Hurghada, Red Sea Governorate': 'Red Sea Governorate',
    'KwaZulu-Natal between Port Edward and Port St Johns': 'KwaZulu-Natal',
    'KNZ': 'KwaZulu-Natal',
    'KZN': 'KwaZulu-Natal',

    # Bahamas variants
    'Bahamas': 'The Bahamas',
    'BAHAMAS': 'The Bahamas',
    'Exumas': 'Exuma Islands',
    'Grand  Bahama Island': 'Grand Bahama Island',
    'Grand Bahama Island': 'Grand Bahama Island',
    ' Grand Bahama Island': 'Grand Bahama Island',

    # Localities mapped to a broader area
    'Namonuito Atoll': 'Micronesia',
    'Grand Baie': 'Mauritius',
    'South Santo': 'Espírito Santo',
    'Montego Bay': 'Jamaica',
    'Grande Terre': 'New Caledonia',

    # Placeholders and ambiguous administrative names
    '?': 'Unknown',
    'nan': 'Unknown',
    'Unknown, treated at Wick, SCOTLAND': 'Unknown',
    'South Province': 'Unknown',
    'Central Province': 'Unknown',
    'New Territories': 'Unknown',
    'Mindanao': 'Unknown',
    "Ha'api": 'Unknown',
    'Madang (WO)': 'Unknown',

    # Open-sea positions, coordinates and free-text descriptions
    '40 miles off Grand Bahama Island': 'Unknown',
    '740 miles SE of Tarawa Atoll': 'Unknown',
    '300 miles from Antigua': 'Unknown',
    '800 miles from land': 'Unknown',
    '600 nm west of the Canary Islands': 'Unknown',
    '12 miles off the north coast': 'Unknown',
    'On the Kowloon penisula, south of Sai Kung': 'Unknown',
    'Between DR and Puerto Rico': 'Unknown',
    'Between Honiara & Isabel Island': 'Unknown',
    'South China Sea 200 miles from Hong Kong': 'Unknown',
    '200 nm southeast of Manila': 'Unknown',
    "250 miles southwest of O'ahu, Hawaii": 'Unknown',
    'Near Bougainville (North Solomons)': 'Unknown',
    'Off the Coromandel Peninsula, North Island': 'Unknown',
    '10ºS, 142ºE': 'Unknown',
    '165  miles from Bermuda': 'Unknown',
    '25 km off the coast of Iran & 483km from mouth of Persian Gulf': 'Unknown',
    '19S, 178?E': 'Unknown',
    '9.35N 79.35W': 'Unknown',
    'Enroute from Suez to Aden (Yemen)': 'Unknown',
    '180 miles southeast of Okinawa': 'Unknown',
    'In the English Channel ': 'Unknown',
    '33N, 68W': 'Unknown',
    'Between Timor & Darwin, Australia': 'Unknown',
    '400 miles southeast of Sri Lanka': 'Unknown',
    'In the Gulf Stream ': 'Unknown',
    'Between England & South Africa': 'Unknown',
    'Between Hawaii & Wake Island': 'Unknown',
    '1,000 miles east of Hawaii': 'Unknown',
    '1000 miles west of Hawaii': 'Unknown',
    '18S / 50E': 'Unknown',
    '330 to 350 miles east of Wake Island': 'Unknown',
    'Between Kwajalein Atoll & Johnston Island': 'Unknown',
    'In transit between Tinian and Leyte': 'Unknown',
    '300 miles east of Luzon': 'Unknown',
    'Bernardino Strait near Gulf of Leyte': 'Unknown',
    'Off Samar Island in the Gulf of Leyte': 'Unknown',
    'Lake Nicaragua (fresh water)': 'Unknown',
    'Near the Fiji Islands': 'Unknown',
    '40 miles south of Naples ': 'Unknown',
    'Northwest of Papua New Guinea': 'Unknown',
    'Between Hawaii and U.S.A.': 'Unknown',
    'Off South American coast': 'Unknown',
    '04.05N-13.23W': 'Unknown',
    '300 miles east of St. Thomas (Virgin Islands)': 'Unknown',
    'West of Ceylon (Sri  Lanka)': 'Unknown',
    'Off Libya': 'Unknown',
    'North of Pernambuco, Brazil': 'Unknown',
    'In Convoy OB 274': 'Unknown',
    '2 to 3 miles off Taboguilla Island, Pacific Ocean': 'Unknown',
    '150 miles offshore': 'Unknown',
    '60 miles north of San Domingo in the West Indies': 'Unknown',
    '30 nm from Singapore': 'Unknown',
    'Somewhere between Philadelphia and Hiogo, Japan': 'Unknown',
    'Between Hastings & Fairlight, Sussex': 'Unknown',
    'Off the coast of South America': 'Unknown',
    '22ºN, 88ºE': 'Unknown',
    '300 miles east of Mauritius': 'Unknown',
    'Between Australia & USA': 'Unknown',
    "35º39 : 165º8'": 'Unknown',
    'Between New Ireland & New Britain': 'Unknown',
}


# Strip surrounding whitespace so lookups are not defeated by stray spaces.
df['state_clean'] = df['State'].str.strip()

# Apply mapping. The original passed the string literal 'State' instead of
# the State dictionary, so no correction was ever applied. Keys are stripped
# as well, because the column has just been stripped and a key with a
# trailing space could never match it.
state_lookup = {raw.strip(): fixed for raw, fixed in State.items()}
df['state_standardized'] = df['state_clean'].replace(state_lookup)

# Optional: Replace all nulls or ambiguous with 'Unknown'
df['state_standardized'] = df['state_standardized'].fillna('Unknown')

# Check unique cleaned states
cleaned_unique_state = df['state_standardized'].unique()


In [None]:
def clean_column_state(df, column='State', mapping=None):
    """Normalise a state column in place and return the frame.

    Values are stripped of surrounding whitespace, upper-cased and then
    remapped through ``mapping``. Because the column is upper-cased, the
    mapping's keys and values are stripped and upper-cased too before the
    lookup, so mixed-case entries such as ``'Floria': 'Florida'`` apply.

    The previous version called ``.replace('state')`` with a string literal,
    so no correction was ever applied.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose ``column`` is overwritten.
    column : str, default 'State'
        Name of the column to clean.
    mapping : dict, optional
        Raw label -> standardised label. Defaults to the module-level
        ``State`` dictionary.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for call chaining.
    """
    if mapping is None:
        mapping = State

    lookup = {
        str(raw).strip().upper(): str(fixed).strip().upper()
        for raw, fixed in mapping.items()
    }
    df[column] = df[column].str.strip().str.upper().replace(lookup)
    return df

# Normalise the 'State' column, then list its distinct non-null values.
df = clean_column_state(df,column='State')
print(sorted(df['State'].dropna().unique()))

In [None]:
# Top 10 states by number of recorded shark attacks
top_10_state = df['State'].value_counts().head(10)
print(top_10_state)