In [2]:
import pandas as pd
!pip install xlrd



In [3]:
# Hypothesis 1
# Unprovoked shark attack incidents have a lower fatality rate than provoked incidents.

# Loading Data
shark_data = pd.read_excel('https://www.sharkattackfile.net/spreadsheets/GSAF5.xls')
shark_data.head()
s_d = shark_data.iloc[:,:14]

# cleaning

s_d = s_d.dropna(how='all')
s_d = s_d.drop_duplicates()

s_d["Injury"].value_counts()
shark_data["Injury"].nunique()

s_d["Type"].replace(" Provoked","Provoked",inplace=True)
s_d["Injury"] = s_d["Injury"].apply(lambda x: x.lower() if type(x) == str else x) # making sure spelling is the same

## Creating filters to filter for "fatal" and "non-fatal"

condition1 = s_d["Injury"].str.contains("fatal") == True
condition11 = s_d["Injury"].str.contains("non-fatal") != True
condition12 = s_d["Injury"].str.contains("not confirmed") != True
condition13 = s_d["Injury"].str.contains("unconfirmed") != True

condition2 =  s_d["Injury"].str.contains("fatal") != True
condition21 =  s_d["Injury"].str.contains("unknown") != True

condition3 = s_d["Injury"].str.contains("non-fatal") == True

### Creating filtered data frames

filter_fatal = s_d[condition1 & condition11 & condition12 & condition13]
filter_non_fatal = s_d[condition2 & condition21]
filter_non_fatal2 = s_d[condition3]  

s_d_unprovoked = s_d[s_d["Type"] == "Unprovoked"]
s_d_provoked = s_d[s_d["Type"] == "Provoked"]

#### Replace different values with coherent single values

filter_fatal["Injury"] = filter_fatal["Injury"].apply(lambda x: "fatal")
filter_non_fatal["Injury"] = filter_non_fatal["Injury"].apply(lambda x: "non-fatal")
filter_non_fatal2["Injury"] = filter_non_fatal2["Injury"].apply(lambda x: "non-fatal")
filter_non_fatal_concat = pd.concat([filter_non_fatal2, filter_non_fatal])

##### Creating filtered data frames

s_d_unprovoked_fatal = filter_fatal[filter_fatal["Type"] == "Unprovoked"]
s_d_unprovoked_non_fatal = filter_non_fatal_concat[filter_non_fatal_concat["Type"] == "Unprovoked"]

s_d_provoked_fatal = filter_fatal[filter_fatal["Type"] == "Provoked"]
s_d_provoked_non_fatal = filter_non_fatal_concat[filter_non_fatal_concat["Type"] == "Provoked"]

# Testing
## Creating a final summarized data frame from the separate filterd dfs

df_h1 = pd.concat([s_d_unprovoked_fatal, s_d_unprovoked_non_fatal, s_d_provoked_non_fatal, s_d_provoked_fatal])
df_h1.groupby(["Injury","Type"])["Injury"].count()
df_h1["count"] = 1
df_h1["count"].value_counts()

# Result

df_h1.pivot_table(index = "Injury", columns = "Type", values = "count", aggfunc = "sum" )

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  s_d["Type"].replace(" Provoked","Provoked",inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filter_fatal["Injury"] = filter_fatal["Injury"].apply(lambda x: "fatal")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/ind

Type,Provoked,Unprovoked
Injury,Unnamed: 1_level_1,Unnamed: 2_level_1
fatal,17,1227
non-fatal,620,3914


In [4]:
# Hypothesis 2 
# Great White Sharks are most likely to attack in the USA. 

# Loading Data
import pandas as pd
shark_data =pd.read_excel('https://www.sharkattackfile.net/spreadsheets/GSAF5.xls')
shark_data.head()
# Cleaning: Species
shark_data.rename(columns={"Species ": "Species"}, inplace=True) #rename species column
#shark_data["Species"].drop(shark_data[(shark_data == 0).any(axis=1)].index, inplace=True) # removes 0 value
#shark_data.dropna(subset=['Species'], inplace=True) #removes NaN
shark_data["Species"] = shark_data["Species"].apply(lambda x: "Tiger Shark" if "tiger" in str(x).lower() else x)
shark_data["Species"] = shark_data["Species"].apply(lambda x: "Bull Shark" if "bull" in str(x).lower() else x)
shark_data["Species"] = shark_data["Species"].apply(lambda x: "Blue Pointer" if "blue" in str(x).lower() else x)
shark_data["Species"] = shark_data["Species"].apply(lambda x: "Great White Shark" if "white" in str(x).lower() else x)
shark_data["Species"] = shark_data["Species"].apply(lambda x: "Hammerhead Shark" if "hammer" in str(x).lower() else x)
shark_data["Species"] = shark_data["Species"].apply(lambda x: "Catshark" if "cat" in str(x).lower() else x)
shark_data["Species"] = shark_data["Species"].apply(lambda x: "Hammerhead Shark" if "hammer" in str(x).lower() else x)
shark_data["Species"] = shark_data["Species"].apply(lambda x: "Brown Shark" if "brown" in str(x).lower() else x)
shark_data["Species"] = shark_data["Species"].apply(lambda x: "Blacktip" if "black" in str(x).lower() else x)
shark_data["Species"] = shark_data["Species"].apply(lambda x: "Invalid" if "NaN" in str(x).lower() else x)
shark_data["Species"] = shark_data["Species"].apply(lambda x: "Invalid" if "questionable" in str(x).lower() else x)
shark_data["Species"] = shark_data["Species"].apply(lambda x: "Invalid" if "involvement" in str(x).lower() else x)
shark_data["Species"] = shark_data["Species"].apply(lambda x: "Invalid" if "'" in str(x).lower() else x)
shark_data["Species"] = shark_data["Species"].apply(lambda x: "Invalid" if "''" in str(x).lower() else x)
shark_data["Species"] = shark_data["Species"].apply(lambda x: "Invalid" if "[]" in str(x).lower() else x)
# Cleaning: Country
def clean_country(country):
    # Handle NaN values
    if pd.isna(country):
        return 'UNSPECIFIED COUNTRY'
    # Remove question marks and strip whitespace
    country = country.replace('?', '').strip()
    # Handle cases where multiple countries are listed
    country = country.replace('IRAN / IRAQ', 'IRAN') \
                     .replace('SOLOMON ISLANDS / VANUATU', 'SOLOMON ISLANDS') \
                     .replace('EQUATORIAL GUINEA / CAMEROON', 'CAMEROON') \
                     .replace('CEYLON (SRI LANKA)', 'SRI LANKA') \
                     .replace('EGYPT / ISRAEL', 'EGYPT') \
                     .replace('ITALY / CROATIA', 'ITALY') \
                     .replace('BETWEEN PORTUGAL & INDIA', 'UNSPECIFIED COUNTRY') \
                     .replace('DIEGO GARCIA', 'UNSPECIFIED COUNTRY')
    # Replace "/" with "and" in specific countries
    country = country.replace('ANDAMAN / NICOBAR ISLANDS', 'ANDAMAN AND NICOBAR ISLANDS') \
                     .replace('ST KITTS / NEVIS', 'ST KITTS AND NEVIS')
    # Mapping replacements for specific island entries
    replacements = {
        'UNITED ARAB EMIRATES (UAE)': 'UNITED ARAB EMIRATES',
        'NEW GUINEA / PAPUA NEW GUINEA': 'PAPUA NEW GUINEA',
        'SOLOMON ISLANDS / VANUATU': 'SOLOMON ISLANDS',
        'MALDIVE ISLANDS': 'MALDIVES',
        'ST. MAARTIN': 'ST. MARTIN',  # Correct spelling
        'KOREA': 'SOUTH KOREA'        # Replace KOREA with SOUTH KOREA
    }
    # Apply replacements
    for key, value in replacements.items():
        if country == key:
            country = value
    # Assign 'UNSPECIFIED COUNTRY' to values containing sea/ocean-related terms
    sea_terms = ['sea', 'SEA', 'OCEAN', 'ocean', 'Ocean', 'Sea', 'BAY OF BENGAL', 'AFRICA']
    for term in sea_terms:
        if term in country:
            return 'UNSPECIFIED COUNTRY'
    # Remove invalid countries or regions
    invalid_countries = ['Diego Garcia', 'GULF OF ADEN', 'THE BALKANS', 'BRITISH ISLES', 'PERSIAN GULF', 'JOHNSTON ISLAND',
                         'JAVA', 'ROTAN', 'SAN DOMINGO', 'ST. MARTIN', 'NEVIS', 'GRAND CAYMAN', 'NETHERLANDS ANTILLES',
                         'NORTHERN MARIANA ISLANDS', 'ASIA']
    if country in invalid_countries:
        return 'UNSPECIFIED COUNTRY'
    # Convert all to uppercase
    country = country.upper()
# Group by 'Species' and 'Country', and count the occurrences
shark_attacks_count = shark_data.groupby(['Species', 'Country']).size().reset_index(name='Count')
# Sort the DataFrame by the Count in descending order
shark_attacks_count = shark_attacks_count.sort_values(by='Count', ascending=False)
shark_attacks_count

Unnamed: 0,Species,Country,Count
536,Invalid,USA,685
451,Invalid,AUSTRALIA,335
416,Great White Shark,USA,229
369,Great White Shark,AUSTRALIA,184
410,Great White Shark,SOUTH AFRICA,171
...,...,...,...
280,"Broadnose sevengill shark, 1.5 m",AUSTRALIA,1
282,Bronze whaler 1.5m,AUSTRALIA,1
283,Bronze whaler 2.5m,NEW ZEALAND,1
286,"Bronze whaler shark, 1.5 m",AUSTRALIA,1


In [5]:
## Hypothesis 4 
## Males have higher chances than females of being attacked by a shark.  

# Loading Data
import pandas as pd
shark_data =pd.read_excel('https://www.sharkattackfile.net/spreadsheets/GSAF5.xls')
shark_data.head()

# Cleaning: Sex
shark_data["Sex"] = shark_data["Sex"].apply(lambda x: "Male" if "m" in str(x).lower() else x)
shark_data["Sex"] = shark_data["Sex"].apply(lambda x: "Female" if "f" in str(x).lower() else x)
shark_data["Sex"] = shark_data["Sex"].apply(lambda x: "Invalid" if "nan" in str(x).lower() else x)
shark_data["Sex"] = shark_data["Sex"].apply(lambda x: "Invalid" if "n" in str(x).lower() else x)
shark_data["Sex"] = shark_data["Sex"].apply(lambda x: "Invalid" if "lli" in str(x).lower() else x)
shark_data["Sex"] = shark_data["Sex"].apply(lambda x: "Invalid" if "M x 2" in str(x).lower() else x)
shark_data["Sex"] = shark_data["Sex"].apply(lambda x: "Invalid" if "." in str(x).lower() else x)

# Create a pivot table with 'Sex ' as the index and count the number of shark attacks for each category
pivot_table = pd.pivot_table(shark_data,index='Sex',values='Case Number', aggfunc='count')
pivot_table.columns = ['Number of Attacks']
print(pivot_table)

         Number of Attacks
Sex                       
Female                 744
Invalid                580
Male                  5474
