In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from geopy.distance import geodesic


In [6]:
# Load dataset
data = pd.read_csv('DataS1/jaguar_movement_data.csv')

data.head()

  data = pd.read_csv('DataS1/jaguar_movement_data.csv')


Unnamed: 0,Event_ID,timestamp,location.long,location.lat,individual.taxon.canonical.name,tag.local.identifier,individual.local.identifier (ID),study.name,country
0,1.0,6/15/10 22:43,-58.030128,-23.326947,Panthera onca,0-333005,1,Humid Chaco,Paraguay
1,2.0,6/16/10 2:52,-58.030643,-23.328427,Panthera onca,0-333005,1,Humid Chaco,Paraguay
2,3.0,6/16/10 22:36,-58.030472,-23.327311,Panthera onca,0-333005,1,Humid Chaco,Paraguay
3,4.0,6/17/10 16:42,-58.027983,-23.309952,Panthera onca,0-333005,1,Humid Chaco,Paraguay
4,5.0,6/17/10 20:37,-58.027747,-23.310006,Panthera onca,0-333005,1,Humid Chaco,Paraguay


In [7]:
data_cp = data.copy()
data_cp.isnull().sum()

Event_ID                            0
timestamp                           0
location.long                       0
location.lat                        0
individual.taxon.canonical.name     0
tag.local.identifier                0
individual.local.identifier (ID)    0
study.name                          0
country                             0
dtype: int64

## Fixing features names

In [8]:
# Fix features names Event_ID,timestamp,location.long,location.lat,individual.taxon.canonical.name,tag.local.identifier,individual.local.identifier (ID),study.name,country
data_cp = data_cp.rename(columns={'Event_ID':'event_id','timestamp':'timestamp','location.long':'location_long','location.lat':'location_lat','individual.taxon.canonical.name':'individual_taxon_canonical_name','tag.local.identifier':'tag_local_identifier','individual.local.identifier (ID)':'individual_local_identifier_ID','study.name':'study_name','country':'country'})

# Print features names
data_cp.columns


Index(['event_id', 'timestamp', 'location_long', 'location_lat',
       'individual_taxon_canonical_name', 'tag_local_identifier',
       'individual_local_identifier_ID', 'study_name', 'country'],
      dtype='object')

# Preprocess timestamp

In [9]:
# Parsing the timestamp column into a datetime format
data_cp['timestamp'] = pd.to_datetime(data_cp['timestamp'])

# Extract time of day, day of the week, month, etc., for temporal features.
data_cp['hour'] = data_cp['timestamp'].dt.hour
data_cp['day'] = data_cp['timestamp'].dt.day
data_cp['month'] = data_cp['timestamp'].dt.month
data_cp['year'] = data_cp['timestamp'].dt.year
data_cp['dayofweek'] = data_cp['timestamp'].dt.dayofweek
data_cp['date'] = data_cp['timestamp'].dt.date

# Show unique values for day, month and year
# print(data_cp['day'].unique())
# print(data_cp['month'].unique())
# print(data_cp['year'].unique())
# print(data_cp['country'].unique())

  data_cp['timestamp'] = pd.to_datetime(data_cp['timestamp'])


## Identify Outliers

In [10]:
# data_cp = data_cp.drop(['event_id', 'individual_local_identifier_ID', 'tag_local_identifier'], axis=1)

data_cp.columns

Index(['event_id', 'timestamp', 'location_long', 'location_lat',
       'individual_taxon_canonical_name', 'tag_local_identifier',
       'individual_local_identifier_ID', 'study_name', 'country', 'hour',
       'day', 'month', 'year', 'dayofweek', 'date'],
      dtype='object')

# Geospatial Data Processing

## Calculating distances between consecutive events.
### Calculating the distance between points using the Haversine Formula and group the data by tag_local_identifier

In [11]:
data_fe = data_cp.copy()

data_fe['timestamp'] = pd.to_datetime(data_fe['timestamp'])

# Define Haversine formula for distance calculation
def haversine(lat1, lon1, lat2, lon2):
    R = 6371000
    phi1 = np.radians(lat1)
    phi2 = np.radians(lat2)
    delta_phi = np.radians(lat2 - lat1)
    delta_lambda = np.radians(lon2 - lon1)

    a = np.sin(delta_phi / 2)**2 + np.cos(phi1) * np.cos(phi2) * np.sin(delta_lambda / 2)**2
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))

    return R * c

# Function to calculate distance only for the same animal
def calculate_distances_for_animal(group):
    group = group.sort_values('timestamp')
    group['distance'] = haversine(
        group['location_lat'].shift(),
        group['location_long'].shift(),
        group['location_lat'],
        group['location_long']
    )
    return group

data_fe = data_fe.groupby('tag_local_identifier').apply(calculate_distances_for_animal)

# Convert distance to kilometers
data_fe['distance'] = data_fe['distance'] / 1000
data_fe['distance'] = data_fe['distance'].round(3)

data_fe.head()
# data_fe.to_csv('DataS1/jaguar_movement_data_cleaned.csv', index=False)

Unnamed: 0_level_0,Unnamed: 1_level_0,event_id,timestamp,location_long,location_lat,individual_taxon_canonical_name,tag_local_identifier,individual_local_identifier_ID,study_name,country,hour,day,month,year,dayofweek,date,distance
tag_local_identifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
35957,131870,131871.0,2015-10-12 00:00:00,-57.5034,-16.881275,Panthera onca,35957,117,Jaguar_Taiama,Brazil,0,12,10,2015,0,2015-10-12,
35957,131871,131872.0,2015-10-12 01:00:00,-57.503355,-16.881304,Panthera onca,35957,117,Jaguar_Taiama,Brazil,1,12,10,2015,0,2015-10-12,0.006
35957,131872,131873.0,2015-10-12 02:01:00,-57.503327,-16.881216,Panthera onca,35957,117,Jaguar_Taiama,Brazil,2,12,10,2015,0,2015-10-12,0.01
35957,131873,131874.0,2015-10-12 03:00:00,-57.503302,-16.88122,Panthera onca,35957,117,Jaguar_Taiama,Brazil,3,12,10,2015,0,2015-10-12,0.003
35957,131874,131875.0,2015-10-12 04:00:00,-57.503297,-16.881144,Panthera onca,35957,117,Jaguar_Taiama,Brazil,4,12,10,2015,0,2015-10-12,0.008


# Creating velocity, direction, and movement features based on location data (location.long, location.lat).

In [12]:
# Creating velocity, direction, and movement features based on location data (location.long, location.lat).
# time_diff = (data_cp['timestamp'] - data_cp['timestamp'].shift()).dt.total_seconds()

# Ensuring the distance is in meters and the time differece is in seconds
time_diff = data_fe['time_diff'] = data_fe['timestamp'].diff().dt.total_seconds() / 3600

data_fe['distance'] = data_fe['distance'] * 1000
time_diff_seconds = data_fe['time_diff'] * 3600

print(data_fe['time_diff'].head())

data_fe['velocity'] = data_fe['distance'] / time_diff_seconds
data_fe['velocity'] = data_fe['velocity'].round(4)
data_fe = data_fe.dropna(subset=[ 'distance', 'velocity'])

data_fe['direction'] = np.arctan2(data_fe['location_long'] - data_fe['location_long'].shift(), data_fe['location_lat'] - data_fe['location_lat'].shift())
data_fe['movement'] = np.where(data_fe['velocity'] > 0, 'moving', 'not_moving')

# data_cp = data_cp.dropna()
print(data_fe[['distance', 'time_diff', 'velocity']].head())

tag_local_identifier        
35957                 131870         NaN
                      131871    1.000000
                      131872    1.016667
                      131873    0.983333
                      131874    1.000000
Name: time_diff, dtype: float64
                             distance  time_diff  velocity
tag_local_identifier                                      
35957                131871       6.0   1.000000    0.0017
                     131872      10.0   1.016667    0.0027
                     131873       3.0   0.983333    0.0008
                     131874       8.0   1.000000    0.0022
                     131875       5.0   1.000000    0.0014


## Derive acceleration or changes in movement direction.

In [13]:
# Calculate acceleration as the change in velocity over time
data_fe['acceleration'] = data_fe['velocity'].diff() / data_fe['time_diff']

# Calculate change in direction
data_fe['change_in_direction'] = data_fe['direction'].diff()

# Drop rows with NaN values in acceleration and change_in_direction
data_fe = data_fe.dropna(subset=['acceleration', 'change_in_direction'])

# Round the values for better readability
data_fe['acceleration'] = data_fe['acceleration'].round(4)
data_fe['change_in_direction'] = data_fe['change_in_direction'].round(4)

# Display the first few rows of the updated dataframe
data_fe[['acceleration', 'change_in_direction']].head()

Unnamed: 0_level_0,Unnamed: 1_level_0,acceleration,change_in_direction
tag_local_identifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
35957,131873,-0.0019,1.4025
35957,131874,0.0014,-1.6508
35957,131875,-0.0008,1.9655
35957,131876,0.0139,-1.0905
35957,131877,-0.0147,1.369


## Encoding Categorical Variables

In [14]:
#Encode Categorical Variables
# data_encoded = pd.get_dummies(data_fe, columns=['individual_taxon_canonical_name', 'study_name', 'country', 'movement'])
# data_encoded.head()

## Now we want to get temporal features from Paraguay, Brazil, Costa Rica, Argentina and Mexico between 1/1/1999 and 31/12/2016

In [33]:
# Get World  Climate data
import kagglehub

path = kagglehub.dataset_download("christopherlemke/monthly-climat-reports-from-stations-worldwide")

reports_data = pd.read_csv(path + '/dwd-cdc_CLIMAT_reports_stations_ww.csv')
stations_data = pd.read_csv(path + '/dwd-cdc_station_data_ww.csv')

# Save the data to a csv file
reports_data.to_csv('DataS1/monthly-climat-reports-from-stations-worldwide.csv', index=False)
stations_data.to_csv('DataS1/dwd-cdc_station_data_ww.csv', index=False)

# print(reports_data.head())
# print(stations_data.head())

# print("Path to dataset files:", path)



### Preprocessing Reports data

In [50]:
# Remove columns
reports_data_dropped = reports_data.drop(['Po', 'Po.1', 'P', 'P.1', 'T.1', 'st', 'st.1', 
                                          'Tx.1', 'Tn.1', 
                                          'e', 'e.1', 
                                          'R1.1', 'nr.1', 'S1.1', 'Rd', 'mp', 'mT', 'mTx', 'mTn', 'me', 'mR', 'mS', 
                                            #   'sn', 'sn.1', 'sn.2', 'sn.3', 'sn.4', 'sn.5', 'sn.6', 'sn.7', 'sn.8', 'sn.9',
                                            # 'G1', 'G1.1', 'G1.2', 'G1.3', 'G1.4', 'G1.5', 'G1.6', 'G1.7', 'G1.8',
                                            # 'G2', 'G2.1', 'G2.2', 'G2.3', 'G2.4', 'G2.5', 'G2.6', 'G2.7', 'G2.8', 'G2.9',
                                            # 'G3', 'G3.1', 'G3.2', 'G3.3', 'G3.4', 'G3.5', 'G3.6', 'G3.7', 'G3.8', 'G3.9',
                                            # 'G4', 'G4.1', 'G4.2', 'G4.3', 'G4.4', 'G4.5', 'G4.6', 'G4.7',
                                            'Yb', 'Yc', 'P', 'YP', 'YR', 'YS', 'YT', 'YTx',
                                            'Ye', 'G4', 'Txd', 'yx', 'Tnd', 'Tax', 'Tan' ], axis=1)

# Drop columns with no data
reports_data_dropped = reports_data_dropped.dropna(axis=1, how='all')

# Rename columns
reports_data_renamed = reports_data_dropped.rename(columns={'IIiii':'station_id', 'T':'monthly_mean_air_temperature', 
                                            'Tx':'mean_daily_maximum_air_temperature', 
                                            'Tn':'mean_daily_minimum_air_temperature', 
                                            'R1':'total_precipitation_month', 'S1':'total_sunshine_month', 
                                            'ps':'percentage_total_sunshine_duration_relative_normal', 
                                            'P0':'monthly_mean_pressure_station_level', 'e':'mean_vapor_pressure_month', 
                                            'nr':'number_days_month_precipitation', 
                                            'yP':'missing_years_air_pressure', 'yR':'missing_years_precipitation', 'yS':'missing_years_sunshine_duration', 
                                            'yT':'missing_years_mean_air_temperature', 'yTx':'missing_years_mean_extreme_air_temperature', 'ye':'missing_years_vapor_pressure', 
                                            'T25':'days_month_maximum_air_temperature_25', 'T30':'days_month_maximum_air_temperature_30', 
                                            'T35':'days_month_maximum_air_temperature_35', 'T40':'days_month_maximum_air_temperature_40', 
                                            'Tn0':'days_month_minimum_air_temperature_0', 'Tx0':'days_month_maximum_air_temperature_0', 'R01':'days_month_precipitation_1', 
                                            'R05':'days_month_precipitation_5', 'R10':'days_month_precipitation_10', 'R50':'days_month_precipitation_50', 
                                            'R100':'days_month_precipitation_100', 'R150':'days_month_precipitation_150', 's00':'days_month_snow_depth_0', 
                                            's01':'days_month_snow_depth_1', 's10':'days_month_snow_depth_10', 's50':'days_month_snow_depth_50', 'f10':'days_month_wind_speed_10', 
                                            'f20':'days_month_wind_speed_20', 'f30':'days_month_wind_speed_30', 'V1':'days_month_visibility_50', 'V2':'days_month_visibility_100', 
                                            'V3':'days_month_visibility_1000', 'yn': 'day_lowest_daily_mean_air_temperature_month', 'yax': 'day_highest_daily_mean_air_temperature_month', 'yan':'day_lowest_air_tempreature_month',
                                            'Rx':'highest_daily_amount_precipitation_month_tenths_mm',  'yr':'day_highest_daily_amount_precipitation_month'})

# Remove rows with missing values in [year]
reports_data_renamed = reports_data_renamed.dropna(subset=['year'])

# Enconde Year, Month, and Station ID to integer
reports_data_renamed['year'] = reports_data_renamed['year'].astype(int)
reports_data_renamed['month'] = reports_data_renamed['month'].astype(int)
reports_data_renamed['station_id'] = reports_data_renamed['station_id'].astype(int)

# # Show null values
# print(reports_data_renamed.isnull().sum())

reports_data_renamed.to_csv('DataS1/monthly-climat-reports-from-stations-worldwide-cleaned.csv', index=False)

reports_data_renamed.head()


year               0
month              0
station_id         0
G1             47554
G1.1           38840
               ...  
Dgr           281442
G4.7          365352
iy            366752
Gx            366044
Gn            366069
Length: 91, dtype: int64


Unnamed: 0,year,month,station_id,G1,G1.1,G1.2,sn,monthly_mean_air_temperature,G1.3,sn.1,...,iw,fx,yfx,G4.6,Dts,Dgr,G4.7,iy,Gx,Gn
0,2013,4,1001,1.0,2.0,3.0,1.0,42.0,4.0,1.0,...,,,,,,,,,,
1,2013,4,1007,1.0,2.0,3.0,1.0,92.0,4.0,1.0,...,,,,,,,,,,
2,2013,4,1008,1.0,2.0,3.0,1.0,93.0,4.0,1.0,...,,,,,,,,,,
3,2013,4,1025,1.0,2.0,3.0,0.0,10.0,4.0,0.0,...,,,,,,,,,,
4,2013,4,1026,1.0,2.0,3.0,0.0,12.0,4.0,0.0,...,,,,,,,,,,


In [56]:
stations_data.dtypes

# Rename columns
# 0: Station ID, 1: Station Name, 2: Latitude, 3: Longitude, 4:Height, 5: Country
stations_data = stations_data.rename(columns={'0':'station_id', '1':'station_name', '2':'latitude', '3':'longitude', '4':'height', '5':'country'})

stations_data.head()

Unnamed: 0,station_id,station_name,latitude,longitude,height,country
0,1001,Jan Mayen,70.94,-8.67,9,Norway ...
1,1005,Isfjord Radio,78.06,13.63,9,Norway ...
2,1007,Ny-Alesund,78.92,11.93,8,Norway ...
3,1008,Svalbard,78.25,15.5,27,Norway ...
4,1025,Tromso/Langnes,69.68,18.91,9,Norway ...


In [16]:
# avg_prec = pd.read_csv('DataS1/average-precipitation-per-year.csv')
# avg_prec.drop(['Code'], axis=1, inplace=True)
# avg_prec.rename(columns={'Entity':'country', 'Year':'year'}, inplace=True)
# avg_prec.head()

# # Merge the two datasets by Country
# data_merged = pd.merge(data_fe, avg_prec, on=['country', 'year'], how='left')
# data_merged.head()

# # Save
# data_merged.to_csv('DataS1/jaguar_movement_data_cleaned.csv', index=False)

2. Feature Engineering
 Environmental Features:
 Integrate meteorological data (e.g., temperature, rainfall) if available.
 Integrate terrain data (e.g., elevation, land cover type) if available.
 Temporal Features:
 Extract time of day, day of week, and season from the timestamp.
 Identify specific behavioral periods like night vs. day.<br><br>

3. Labeling the Data
 Behavior Labeling:
 Manually label a subset of data with different behaviors (e.g., movement, hunting, resting).
 Consider cross-referencing domain knowledge or complementary data to infer labels.

4. Model Selection
 Choose Algorithms:
 Evaluate models like Random Forest, Gradient Boosting Machines (e.g., XGBoost), or neural networks (RNN/LSTM for sequential data).
 Decide on the best model based on data type and problem complexity.
 Split Data:
 Split the data into training and testing sets (e.g., 80/20 or 70/30 split).

5. Model Training and Evaluation
 Train Models:
 Train selected models using the preprocessed data.
 Use cross-validation to tune model parameters.
 Evaluate Model:
 Evaluate performance using metrics like accuracy, precision, recall, and F1-score.
 If the dataset is imbalanced, apply techniques like SMOTE to balance it.

6. Cross-Referencing with Complementary Data
 Integrate Meteorological Data:
 Merge weather data (e.g., temperature, humidity) with animal tracking data.
 Integrate Terrain Data:
 Merge terrain data (e.g., elevation, land use) with animal tracking data.
 Behavioral Biology Data:
 Incorporate knowledge from related behavioral biology studies (if available).

7. Model Deployment
 Model Evaluation: Confirm that the model generalizes well to new or unseen data.
 Deploy Model:
 Prepare the model for deployment in a real-time or batch setting for wildlife tracking.
 Implement a user interface or tool to apply the model to new data.