In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import xgboost as xgb


In [4]:
#Generating background data to use a controls (non-encounter data)
attack_df = pd.read_excel('bear_attack_processed_data_final_ver..xlsx')

latitude_min, latitude_max = attack_df['Latitude'].min(), attack_df['Latitude'].max()
longitude_min, longitude_max = attack_df['Longitude'].min(), attack_df['Longitude'].max()

print("latitude bounds:",latitude_min, latitude_max)
print("Longitude bounds",longitude_min, longitude_max)

#Number of non-encounter data points 
n_background = 43654

#Creating random coordinates 
background_lat = np.random.uniform(latitude_min, latitude_max, n_background)
background_lon = np.random.uniform (longitude_min, longitude_max, n_background)

#Creating random dates
start_date = pd.to_datetime('2000-01-01')
end_date = pd.to_datetime ('2025-01-01')
all_dates =pd.date_range(start = start_date,end=end_date)
random_dates = np.random.choice (all_dates, size= n_background, replace= True)
random_dates =pd.to_datetime(random_dates).normalize()

#Random species
species =np.random.choice(['Black Bear', 'Grizzly Bear'], size=n_background, p= [0.93, 0.07])


#Generating background non-encounter dataframe
background_data = pd.DataFrame({
    'Species Common Name': species,
    'Date': random_dates,
    'Latitude':background_lat,
    'Longitude': background_lon,
    'Attack': 0
    
})

background_data.head()


latitude bounds: 43.93289 69.083333
Longitude bounds -139.5 -53.9280555


Unnamed: 0,Species Common Name,Date,Latitude,Longitude,Attack
0,Black Bear,2015-07-30,58.923882,-123.204773,0
1,Black Bear,2016-03-13,63.65992,-95.043864,0
2,Black Bear,2011-01-21,68.336435,-72.147707,0
3,Black Bear,2005-04-23,44.830905,-53.991614,0
4,Black Bear,2023-02-06,55.830796,-84.076517,0


In [42]:
#Combining non-encounter and encounter data 
#Attack = 0 means non-encounter, 1 means encounter 
full_df =pd.concat([attack_df, background_data], ignore_index=True)
full_df

Unnamed: 0,Species Common Name,Date,Latitude,Longitude,Attack
0,Grizzly Bear,2010-01-15,50.620000,-116.070000,1
1,Grizzly Bear,2010-03-26,51.496800,-115.928100,1
2,Grizzly Bear,2010-03-27,51.496800,-115.928100,1
3,Grizzly Bear,2010-03-27,51.496800,-115.928100,1
4,Grizzly Bear,2010-04-02,51.496800,-115.928100,1
...,...,...,...,...,...
87303,Black Bear,2011-09-18,45.456470,-139.112535,0
87304,Black Bear,2018-01-01,62.182221,-106.855233,0
87305,Black Bear,2006-03-07,46.568685,-71.092022,0
87306,Black Bear,2021-04-25,55.565675,-68.391343,0


In [43]:
print(full_df.columns)


Index(['Species Common Name', 'Date', 'Latitude', 'Longitude', 'Attack'], dtype='object')


In [56]:
#Encoding species name as number (Black =0, Grizzly =1)
full_df['Species Code'] =full_df['Species Common Name'].map({'Black Bear':0, 'Grizzly Bear':1})

#Coverting date to int
full_df['Day of Year']= full_df['Date'].dt.dayofyear
#full_df=full_df.drop (columns ='Date')


#Defining Features as X and y
X= full_df[['Species Code', 'Day of Year', 'Latitude', 'Longitude']]
y= full_df['Attack']

In [57]:
full_df

Unnamed: 0,Species Common Name,Date,Latitude,Longitude,Attack,Species Code,Day of Year
0,Grizzly Bear,2010-01-15,50.620000,-116.070000,1,1.0,15
1,Grizzly Bear,2010-03-26,51.496800,-115.928100,1,1.0,85
2,Grizzly Bear,2010-03-27,51.496800,-115.928100,1,1.0,86
3,Grizzly Bear,2010-03-27,51.496800,-115.928100,1,1.0,86
4,Grizzly Bear,2010-04-02,51.496800,-115.928100,1,1.0,92
...,...,...,...,...,...,...,...
87303,Black Bear,2011-09-18,45.456470,-139.112535,0,0.0,261
87304,Black Bear,2018-01-01,62.182221,-106.855233,0,0.0,1
87305,Black Bear,2006-03-07,46.568685,-71.092022,0,0.0,66
87306,Black Bear,2021-04-25,55.565675,-68.391343,0,0.0,115


In [58]:
#Splitting into training and testing data 
X_train, X_test, y_train, y_test= train_test_split(X, y, test_size=0.3, random_state=42)


In [59]:
#Create and train XGBoost Model
model = xgb.XGBClassifier(
    n_estimators = 300,
    learning_rate =0.05,
    max_depth =4,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42
)
model.fit(X_train, y_train)

0,1,2
,objective,'binary:logistic'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,0.8
,device,
,early_stopping_rounds,
,enable_categorical,False


In [None]:
#Make Predictions
y_pred = model.predict(X_test)

#Converting to probabilities
y_prob=model.predict_proba(X_test)[:,1]

In [75]:
#Viewing machine learning results 
X_test_reset= X_test.reset_index(drop=True)
y_test_reset= y_test.reset_index(drop=True)

results_df = pd.concat([
    X_test_reset,
    y_test_reset.rename("Actual Observations"),
    pd.Series (y_pred, name='Predictions'),
    pd.Series (y_prob, name='Probabilities')
    
], axis=1)
results_df.head()

Unnamed: 0,Species Code,Day of Year,Latitude,Longitude,Actual Observations,Predictions,Probabilities
0,0.0,337,69.082802,-110.320475,0,0,0.000633
1,0.0,304,51.532237,-125.945363,0,0,0.014964
2,0.0,24,54.597505,-129.757211,0,0,0.000753
3,0.0,147,51.4968,-115.9281,1,1,0.996604
4,0.0,294,62.911479,-58.256979,0,0,6.2e-05


In [None]:
#Creating Folium Model for Visualization
#Importing libraries for mapping
import folium 
from folium.plugins import HeatMap, MarkerCluster
import math 



#Mapping incidents 
map = folium.Map(location=[full_df['Latitude'].mean(), full_df['Longitude'].mean()], zoom_start=11,tiles='cartodbpositron')



#Add heatmap layer onto map
HeatMap(heat_data).add_to(map)
map



In [None]:
#Evaluate Model
accuracy =accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")



Accuracy: 0.9945
