In [88]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from geopy.distance import geodesic
import sqlite3


In [89]:
SAMPLE_SIZE = 5000

# Load dataset
data = pd.read_csv('DataS1/jaguar_movement_data.csv')
conn = sqlite3.connect('jaguar_data.db')

data = data.sample(SAMPLE_SIZE) 

data.head()

  data = pd.read_csv('DataS1/jaguar_movement_data.csv')


Unnamed: 0,Event_ID,timestamp,location.long,location.lat,individual.taxon.canonical.name,tag.local.identifier,individual.local.identifier (ID),study.name,country
15584,15585.0,11/20/13 11:00,-56.319152,-19.932593,Panthera onca,33299,15,jaguar_Oncafari Project,Brazil
98275,98276.0,3/30/15 4:01,-57.380348,-16.998639,Panthera onca,34770,81,Jaguar_Taiama,Brazil
21923,21924.0,2/28/15 19:00,-57.418636,-16.958635,Panthera onca,36317,18,Jaguar_Taiama,Brazil
123637,123638.0,10/14/08 23:00,-56.895966,-19.462106,Panthera onca,152200,111,Sao Bento,Brazil
36469,36470.0,7/28/15 13:00,-56.274572,-19.904187,Panthera onca,36112,25,jaguar_Oncafari Project,Brazil


In [90]:
data_cp = data.copy()
data_cp.isnull().sum()

Event_ID                            0
timestamp                           0
location.long                       0
location.lat                        0
individual.taxon.canonical.name     0
tag.local.identifier                0
individual.local.identifier (ID)    0
study.name                          0
country                             0
dtype: int64

## Fixing features names

In [91]:
# Fix features names Event_ID,timestamp,location.long,location.lat,individual.taxon.canonical.name,tag.local.identifier,individual.local.identifier (ID),study.name,country
data_cp = data_cp.rename(columns={'Event_ID':'event_id','timestamp':'timestamp','location.long':'location_long','location.lat':'location_lat','individual.taxon.canonical.name':'individual_taxon_canonical_name','tag.local.identifier':'tag_local_identifier','individual.local.identifier (ID)':'individual_local_identifier_ID','study.name':'study_name','country':'country'})

# Print features names
data_cp.columns


Index(['event_id', 'timestamp', 'location_long', 'location_lat',
       'individual_taxon_canonical_name', 'tag_local_identifier',
       'individual_local_identifier_ID', 'study_name', 'country'],
      dtype='object')

# Preprocess timestamp

In [92]:
# Parsing the timestamp column into a datetime format
data_cp['timestamp'] = pd.to_datetime(data_cp['timestamp'])

# Extract time of day, day of the week, month, etc., for temporal features.
data_cp['hour'] = data_cp['timestamp'].dt.hour
data_cp['day'] = data_cp['timestamp'].dt.day
data_cp['month'] = data_cp['timestamp'].dt.month
data_cp['year'] = data_cp['timestamp'].dt.year
data_cp['dayofweek'] = data_cp['timestamp'].dt.dayofweek
data_cp['date'] = data_cp['timestamp'].dt.date

# Show unique values for day, month and year
# print(data_cp['day'].unique())
# print(data_cp['month'].unique())
# print(data_cp['year'].unique())
# print(data_cp['country'].unique())

  data_cp['timestamp'] = pd.to_datetime(data_cp['timestamp'])


## Identify Outliers

In [93]:
# data_cp = data_cp.drop(['event_id', 'individual_local_identifier_ID', 'tag_local_identifier'], axis=1)

data_cp.columns

Index(['event_id', 'timestamp', 'location_long', 'location_lat',
       'individual_taxon_canonical_name', 'tag_local_identifier',
       'individual_local_identifier_ID', 'study_name', 'country', 'hour',
       'day', 'month', 'year', 'dayofweek', 'date'],
      dtype='object')

# Geospatial Data Processing

## Calculating distances between consecutive events.
### Calculating the distance between points using the Haversine Formula and group the data by tag_local_identifier

In [94]:
data_fe = data_cp.copy()

data_fe['timestamp'] = pd.to_datetime(data_fe['timestamp'])

# Define Haversine formula for distance calculation
def haversine(lat1, lon1, lat2, lon2):
    R = 6371000
    phi1 = np.radians(lat1)
    phi2 = np.radians(lat2)
    delta_phi = np.radians(lat2 - lat1)
    delta_lambda = np.radians(lon2 - lon1)

    a = np.sin(delta_phi / 2)**2 + np.cos(phi1) * np.cos(phi2) * np.sin(delta_lambda / 2)**2
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))

    return R * c

# Function to calculate distance only for the same animal
def calculate_distances_for_animal(group):
    group = group.sort_values('timestamp')
    group['distance'] = haversine(
        group['location_lat'].shift(),
        group['location_long'].shift(),
        group['location_lat'],
        group['location_long']
    )
    return group

data_fe = data_fe.groupby('tag_local_identifier').apply(calculate_distances_for_animal)

# Convert distance to kilometers
data_fe['distance'] = data_fe['distance'] / 1000
data_fe['distance'] = data_fe['distance'].round(3)

data_fe.head()
# data_fe.to_csv('DataS1/jaguar_movement_data_cleaned.csv', index=False)

  data_fe = data_fe.groupby('tag_local_identifier').apply(calculate_distances_for_animal)


Unnamed: 0_level_0,Unnamed: 1_level_0,event_id,timestamp,location_long,location_lat,individual_taxon_canonical_name,tag_local_identifier,individual_local_identifier_ID,study_name,country,hour,day,month,year,dayofweek,date,distance
tag_local_identifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
35957,131891,131892.0,2015-10-12 21:00:00,-57.500308,-16.881923,Panthera onca,35957,117,Jaguar_Taiama,Brazil,21,12,10,2015,0,2015-10-12,
35957,131939,131940.0,2015-10-14 23:00:00,-57.490634,-16.888566,Panthera onca,35957,117,Jaguar_Taiama,Brazil,23,14,10,2015,2,2015-10-14,1.267
35957,131945,131946.0,2015-10-15 05:00:00,-57.484631,-16.888427,Panthera onca,35957,117,Jaguar_Taiama,Brazil,5,15,10,2015,3,2015-10-15,0.639
35957,131984,131985.0,2015-10-16 20:00:00,-57.504718,-16.881299,Panthera onca,35957,117,Jaguar_Taiama,Brazil,20,16,10,2015,4,2015-10-16,2.279
35957,131989,131990.0,2015-10-17 02:01:00,-57.505744,-16.880184,Panthera onca,35957,117,Jaguar_Taiama,Brazil,2,17,10,2015,5,2015-10-17,0.165


# Creating velocity, direction, and movement features based on location data (location.long, location.lat).

3. Labeling the Data
 Behavior Labeling:
 Manually label a subset of data with different behaviors (e.g., movement, hunting, resting).
 Consider cross-referencing domain knowledge or complementary data to infer labels.

movement > 0.5: 'hunting' (highest priority)

0.1 < velocity ≤ 0.5: 'movement'

0.0 < velocity ≤ 0.1: 'slowing down'

≤ 0.0: 'resting'

In [95]:
# Creating velocity, direction, and movement features based on location data (location.long, location.lat).
# time_diff = (data_cp['timestamp'] - data_cp['timestamp'].shift()).dt.total_seconds()

# Ensuring the distance is in meters and the time differece is in seconds
time_diff = data_fe['time_diff'] = data_fe['timestamp'].diff().dt.total_seconds() / 3600

data_fe['distance'] = data_fe['distance'] * 1000
time_diff_seconds = data_fe['time_diff'] * 3600

# print(data_fe['time_diff'].head())

data_fe['velocity'] = data_fe['distance'] / time_diff_seconds
data_fe['velocity'] = data_fe['velocity'].round(4)
data_fe = data_fe.dropna(subset=[ 'distance', 'velocity'])

data_fe['direction'] = np.arctan2(data_fe['location_long'] - data_fe['location_long'].shift(), data_fe['location_lat'] - data_fe['location_lat'].shift())

data_fe['movement'] = np.where(
    data_fe['velocity'] > 0.5, 'hunting',
    np.where(
        data_fe['velocity'] > 0.1, 'movement',
        np.where(
            data_fe['velocity'] > 0.0, 'slowing down',
            'resting'  # Velocity <= 0.0
        )
    )
)
# data_cp = data_cp.dropna()
print(data_fe[['velocity', 'movement']])
# print(data_fe.head())

                             velocity      movement
tag_local_identifier                               
35957                131939    0.0070  slowing down
                     131945    0.0296  slowing down
                     131984    0.0162  slowing down
                     131989    0.0076  slowing down
                     132018    0.0146  slowing down
...                               ...           ...
TIAGO                106041    0.0252  slowing down
                     106044    0.0001  slowing down
                     106045    0.0063  slowing down
                     106116    0.0046  slowing down
                     106157    0.0088  slowing down

[4890 rows x 2 columns]


## Derive acceleration or changes in movement direction.

In [96]:
# Calculate acceleration as the change in velocity over time
data_fe['acceleration'] = data_fe['velocity'].diff() / data_fe['time_diff']

# Calculate change in direction
data_fe['change_in_direction'] = data_fe['direction'].diff()

# Drop rows with NaN values in acceleration and change_in_direction
data_fe = data_fe.dropna(subset=['acceleration', 'change_in_direction'])

# Round the values for better readability
data_fe['acceleration'] = data_fe['acceleration'].round(4)
data_fe['change_in_direction'] = data_fe['change_in_direction'].round(4)

# Display the first few rows of the updated dataframe
data_fe[['acceleration', 'change_in_direction']].head()

Unnamed: 0_level_0,Unnamed: 1_level_0,acceleration,change_in_direction
tag_local_identifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
35957,131984,-0.0003,-2.7774
35957,131989,-0.0014,0.486
35957,132018,0.0002,-0.3062
35957,132026,0.0005,0.1463
35957,132056,0.0003,3.0119


## Encoding Categorical Variables

In [97]:
# Encode tag_local_identifier to numerical values

data_fe['tag_local_identifier'] = pd.Categorical(data_fe['tag_local_identifier'])
data_fe['tag_local_identifier'] = data_fe['tag_local_identifier'].cat.codes

print(data_fe['tag_local_identifier'].unique())
# data_encoded = pd.get_dummies(data_fe, columns=['tag_local_identifier'])
# data_encoded.columns
# data_encoded['tag_local_identifier'].unique()

[  0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17
  18  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35
  36  37  38  39  40  41  42  43  44  45  46  47  48  49  50  51  52  53
  54  55  56  57  58  59  60  61  62  63  64  65  66  67  68  69  70  71
  72  73  74  75  76  77  78  79  80  81  82  83  84  85  86  87  88  89
  90  91  92  93  94  95  96  97  98  99 100 101 102 103]


## Now we want to get temporal features from Paraguay, Brazil, Costa Rica, Argentina and Mexico between 1/1/1999 and 31/12/2016

In [98]:
# Get World  Climate data
import kagglehub

path = kagglehub.dataset_download("christopherlemke/monthly-climat-reports-from-stations-worldwide")

reports_data = pd.read_csv(path + '/dwd-cdc_CLIMAT_reports_stations_ww.csv')
stations_data = pd.read_csv(path + '/dwd-cdc_station_data_ww.csv')

# Save the data to a csv file
reports_data.to_csv('DataS1/monthly-climat-reports-from-stations-worldwide.csv', index=False)
stations_data.to_csv('DataS1/dwd-cdc_station_data_ww.csv', index=False)

print(reports_data.shape)
print(stations_data.shape)

reports_data = reports_data.sample(SAMPLE_SIZE)
# stations_data = stations_data.sample(SAMPLE_SIZE)

# print(reports_data.head())
# print(stations_data.head())

# print("Path to dataset files:", path)

(553652, 127)
(4169, 6)


### Preprocessing Reports data

In [99]:
def farenheit_to_celsius(farenheit):
    return (farenheit - 32) * 5.0/9.0

2. Feature Engineering
 Environmental Features:
 Integrate meteorological data (e.g., temperature, rainfall) if available.
 Integrate terrain data (e.g., elevation, land cover type) if available.
 Temporal Features:
 Extract time of day, day of week, and season from the timestamp.
 Identify specific behavioral periods like night vs. day.<br><br>

In [100]:
# Remove columns
reports_data_dropped = reports_data.drop(['Po', 'Po.1', 'P', 'P.1', 'T.1', 'st', 'st.1', 
                                          'Tx.1', 'Tn.1', 
                                          'e', 'e.1', 
                                          'R1.1', 'nr.1', 'S1.1', 'Rd', 'mp', 'mT', 'mTx', 'mTn', 'me', 'mR', 'mS', 
                                            #   'sn', 'sn.1', 'sn.2', 'sn.3', 'sn.4', 'sn.5', 'sn.6', 'sn.7', 'sn.8', 'sn.9',
                                            # 'G1', 'G1.1', 'G1.2', 'G1.3', 'G1.4', 'G1.5', 'G1.6', 'G1.7', 'G1.8',
                                            # 'G2', 'G2.1', 'G2.2', 'G2.3', 'G2.4', 'G2.5', 'G2.6', 'G2.7', 'G2.8', 'G2.9',
                                            # 'G3', 'G3.1', 'G3.2', 'G3.3', 'G3.4', 'G3.5', 'G3.6', 'G3.7', 'G3.8', 'G3.9',
                                            # 'G4', 'G4.1', 'G4.2', 'G4.3', 'G4.4', 'G4.5', 'G4.6', 'G4.7',
                                            'Yb', 'Yc', 'P', 'YP', 'YR', 'YS', 'YT', 'YTx',
                                            'Ye', 'G4', 'Txd', 'yx', 'Tnd', 'Tax', 'Tan' ], axis=1)

# Drop columns with no data
reports_data_dropped = reports_data_dropped.dropna(axis=1, how='all')

# Rename columns
reports_data_renamed = reports_data_dropped.rename(columns={'IIiii':'station_id', 'T':'monthly_mean_air_temperature', 
                                            'Tx':'mean_daily_maximum_air_temperature', 
                                            'Tn':'mean_daily_minimum_air_temperature', 
                                            'R1':'total_precipitation_month', 'S1':'total_sunshine_month', 
                                            'ps':'percentage_total_sunshine_duration_relative_normal', 
                                            'P0':'monthly_mean_pressure_station_level', 'e':'mean_vapor_pressure_month', 
                                            'nr':'number_days_month_precipitation', 
                                            'yP':'missing_years_air_pressure', 'yR':'missing_years_precipitation', 'yS':'missing_years_sunshine_duration', 
                                            'yT':'missing_years_mean_air_temperature', 'yTx':'missing_years_mean_extreme_air_temperature', 'ye':'missing_years_vapor_pressure', 
                                            'T25':'days_month_maximum_air_temperature_25', 'T30':'days_month_maximum_air_temperature_30', 
                                            'T35':'days_month_maximum_air_temperature_35', 'T40':'days_month_maximum_air_temperature_40', 
                                            'Tn0':'days_month_minimum_air_temperature_0', 'Tx0':'days_month_maximum_air_temperature_0', 'R01':'days_month_precipitation_1', 
                                            'R05':'days_month_precipitation_5', 'R10':'days_month_precipitation_10', 'R50':'days_month_precipitation_50', 
                                            'R100':'days_month_precipitation_100', 'R150':'days_month_precipitation_150', 's00':'days_month_snow_depth_0', 
                                            's01':'days_month_snow_depth_1', 's10':'days_month_snow_depth_10', 's50':'days_month_snow_depth_50', 'f10':'days_month_wind_speed_10', 
                                            'f20':'days_month_wind_speed_20', 'f30':'days_month_wind_speed_30', 'V1':'days_month_visibility_50', 'V2':'days_month_visibility_100', 
                                            'V3':'days_month_visibility_1000', 'yn': 'day_lowest_daily_mean_air_temperature_month', 'yax': 'day_highest_daily_mean_air_temperature_month', 'yan':'day_lowest_air_tempreature_month',
                                            'Rx':'highest_daily_amount_precipitation_month_tenths_mm',  
                                            # 'yr':'day_highest_daily_amount_precipitation_month'
                                            })

# Remove rows with missing values in [year]
reports_data_renamed = reports_data_renamed.dropna(subset=['year'])

# Enconde Year, Month, and Station ID to integer
reports_data_renamed['year'] = reports_data_renamed['year'].apply(lambda x: int(x) if pd.notnull(x) else x)
reports_data_renamed['month'] = reports_data_renamed['month'].apply(lambda x: int(x) if pd.notnull(x) else x)
reports_data_renamed['station_id'] = reports_data_renamed['station_id'].apply(lambda x: int(x) if pd.notnull(x) else x)
reports_data_renamed['sn'] = reports_data_renamed['sn'].apply(lambda x: int(x) if pd.notnull(x) else x)

# Convert temperature features to Celsius
# °C = (°F - 32) × 5/9
reports_data_renamed['monthly_mean_air_temperature'] = farenheit_to_celsius(reports_data_renamed['monthly_mean_air_temperature']).round(2)
reports_data_renamed['mean_daily_maximum_air_temperature'] =  farenheit_to_celsius(reports_data_renamed['mean_daily_maximum_air_temperature']).round(2)
reports_data_renamed['mean_daily_minimum_air_temperature'] = farenheit_to_celsius(reports_data_renamed['mean_daily_minimum_air_temperature']).round(2)

# Check for missing values in temperature features
# print(reports_data_renamed[['monthly_mean_air_temperature', 'mean_daily_maximum_air_temperature', 'mean_daily_minimum_air_temperature']].isnull().sum())

# Fill missing values with the mean
reports_data_renamed['monthly_mean_air_temperature'] = reports_data_renamed['monthly_mean_air_temperature'].fillna(reports_data_renamed['monthly_mean_air_temperature'].mean())
reports_data_renamed['mean_daily_maximum_air_temperature'] = reports_data_renamed['mean_daily_maximum_air_temperature'].fillna(reports_data_renamed['mean_daily_maximum_air_temperature'].mean())
reports_data_renamed['mean_daily_minimum_air_temperature'] = reports_data_renamed['mean_daily_minimum_air_temperature'].fillna(reports_data_renamed['mean_daily_minimum_air_temperature'].mean())

print(reports_data_renamed[['monthly_mean_air_temperature', 'mean_daily_maximum_air_temperature', 'mean_daily_minimum_air_temperature']].isnull().sum())

# # Show null values
# print(reports_data_renamed.isnull().sum())
reports_data_renamed.to_csv('DataS1/monthly-climat-reports-from-stations-worldwide-cleaned.csv', index=False)

reports_data_renamed.head()


monthly_mean_air_temperature          0
mean_daily_maximum_air_temperature    0
mean_daily_minimum_air_temperature    0
dtype: int64


Unnamed: 0,year,month,station_id,G1,G1.1,G1.2,sn,monthly_mean_air_temperature,G1.3,sn.1,...,iw,fx,yfx,G4.6,Dts,Dgr,G4.7,iy,Gx,Gn
530511,2004,6,72421,,2.0,3.0,0.0,102.22,4.0,0.0,...,4.0,,,6.0,7.0,0.0,,,,
466209,2019,11,89865,1.0,,3.0,1.0,24.44,4.0,1.0,...,,,,,,,,,,
22175,2006,9,20069,1.0,2.0,3.0,1.0,-2.78,4.0,1.0,...,,,,,,,,,,
402833,2009,8,6590,1.0,2.0,3.0,0.0,90.56,4.0,0.0,...,1.0,140.0,78.0,6.0,2.0,0.0,7.0,2.0,24.0,24.0
128774,2011,3,44225,1.0,2.0,3.0,1.0,95.56,4.0,1.0,...,1.0,180.0,21.0,,,,,,,


In [101]:
# stations_data.dtypes

# Rename columns
# 0: Station ID, 1: Station Name, 2: Latitude, 3: Longitude, 4:Height, 5: Country
stations_data_renamed = stations_data.rename(columns={'0':'station_id', '1':'station_name', '2':'latitude', '3':'longitude', '4':'height', '5':'country'})
stations_data_renamed['station_id'] = stations_data_renamed['station_id'].str.strip()
stations_data_renamed = stations_data_renamed[:-1]

# print(stations_data_renamed['station_id'].unique())


stations_data_renamed['station_id'] = (
    stations_data_renamed['station_id']
    .str.replace(r'\D', '', regex=True)  # Remove all non-digit characters
    .pipe(pd.to_numeric, errors='coerce')  # Convert to numeric, invalid become NaN
    .astype('Int64')  # Convert to pandas' nullable Int64 type
)


# stations_data_renamed['station_id'] = stations_data_renamed['station_id'].apply(lambda x: int(x) if pd.notnull(x) else x)
# stations_data_renamed.head()
# stations_data_renamed.info()

In [None]:
# Merge the reports and stations data by station_id
import sqlite3

data_merged = pd.merge(reports_data_renamed, stations_data_renamed, on='station_id')

# data_fe['country'].unique()

data_merged['country'] = data_merged['country'].str.strip()

# print(data_merged['country'].unique())

# data_merged['country'].isnull().sum()

# print(data_merged.columns)

month_daily_data = data_merged[['year', 'month', 'station_id', 'country', 'mean_daily_maximum_air_temperature', 'mean_daily_minimum_air_temperature', 'total_precipitation_month', 
'number_days_month_precipitation', 'total_sunshine_month', 'percentage_total_sunshine_duration_relative_normal', 'days_month_maximum_air_temperature_25', 
'days_month_maximum_air_temperature_30', 'days_month_maximum_air_temperature_35', 'days_month_maximum_air_temperature_40', 'days_month_minimum_air_temperature_0', 
'days_month_maximum_air_temperature_0', 'days_month_precipitation_1', 'days_month_precipitation_5', 'days_month_precipitation_10', 'days_month_precipitation_50', 
'days_month_precipitation_100', 'days_month_precipitation_150', 'days_month_snow_depth_0', 'days_month_snow_depth_1', 'days_month_snow_depth_10', 'days_month_snow_depth_50', 
'days_month_visibility_50', 'days_month_visibility_100', 'days_month_visibility_1000', 'day_lowest_air_tempreature_month', 'day_highest_daily_mean_air_temperature_month', 'day_lowest_air_tempreature_month', 'highest_daily_amount_precipitation_month_tenths_mm']]

# print(data_fe.shape)

# print(month_daily_data.head())

# data_merged.head()

conn = sqlite3.connect('jaguar_data.db')

# Save data to SQLite database
reports_data_renamed.to_sql('reports', conn, if_exists='replace', index=False)
stations_data_renamed.to_sql('stations', conn, if_exists='replace', index=False)
# data_merged.to_sql('reports_country', conn, if_exists='replace', index=False)
data_fe.to_sql('data_fe', conn, if_exists='replace', index=False)

print("Data successfully loaded into SQLite database!")

conn.close()

# Merge data_merged with data_fe only by the countries in data_fe
data_merged_countries = pd.merge(data_fe, month_daily_data, on='country')

# data_merged_countries = pd.merge(data_merged, data_fe, on='country', how='inner')
# data_merged_countries = data_merged[data_merged['country'].isin(data_fe['country'].unique())]
# data_merged_countries.head()

print("Database connection closed.")

# Save
data_merged_countries.to_csv('DataS1/jaguar_movement_with_countries_climate_data.csv', index=False)

Index(['year', 'month', 'station_id', 'G1', 'G1.1', 'G1.2', 'sn',
       'monthly_mean_air_temperature', 'G1.3', 'sn.1',
       'mean_daily_maximum_air_temperature', 'sn.2',
       'mean_daily_minimum_air_temperature', 'G1.4', 'G1.5',
       'total_precipitation_month', 'number_days_month_precipitation', 'G1.6',
       'total_sunshine_month',
       'percentage_total_sunshine_duration_relative_normal', 'G1.7', 'G1.8',
       'G2', 'G2.1', 'G2.2', 'G2.3', 'sn.3', 'G2.4', 'sn.4', 'sn.5', 'G2.5',
       'G2.6', 'G2.7', 'G2.8', 'G2.9', 'G3',
       'days_month_maximum_air_temperature_25',
       'days_month_maximum_air_temperature_30', 'G3.1',
       'days_month_maximum_air_temperature_35',
       'days_month_maximum_air_temperature_40', 'G3.2',
       'days_month_minimum_air_temperature_0',
       'days_month_maximum_air_temperature_0', 'G3.3',
       'days_month_precipitation_1', 'days_month_precipitation_5', 'G3.4',
       'days_month_precipitation_10', 'days_month_precipitation_50', 'G

4. Model Selection
 Choose Algorithms:
 Evaluate models like Random Forest, Gradient Boosting Machines (e.g., XGBoost), or neural networks (RNN/LSTM for sequential data).
 Decide on the best model based on data type and problem complexity.
 Split Data:
 Split the data into training and testing sets (e.g., 80/20 or 70/30 split).

In [113]:
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
import shap
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier

# Data Preparation with Country Considerations

# Defining target and features 

target = 'movement'
features = [col for col in data_fe.columns if col not in [target, 'event_id', 'timestamp', 'date']]
categorical_features = ['country', 'study_name']
numeric_features = [col for col in features if col not in categorical_features + [target]]
data_fe['stratify_col'] = data_fe['country'] + '_' + data_fe[target]

# Filter out classes with fewer than two samples
class_counts = data_fe['stratify_col'].value_counts()
valid_classes = class_counts[class_counts >= 2].index
data_fe_filtered = data_fe[data_fe['stratify_col'].isin(valid_classes)]

# Splitting the data with stratification by both country and movement class
X_train, X_test, y_train, y_test = train_test_split(
    data_fe_filtered[features], 
    data_fe_filtered[target],
    test_size=0.2,
    stratify=data_fe_filtered['stratify_col'],
    random_state=42
)

# Model Selection
models = {
    # Handles categorical features natively, good for country-wise patterns
    'CatBoost': CatBoostClassifier(
        cat_features=categorical_features,
        auto_class_weights='Balanced',
        verbose=0,
        random_seed=42
    ),
    
    # Baseline for comparison
    'Random Forest': RandomForestClassifier(
        class_weight='balanced',
        n_estimators=100,
        random_state=42
    ),
    
    # Gradient boosting alternative
    'XGBoost': XGBClassifier(
        scale_pos_weight='balanced',
        enable_categorical=False,  # Requires pre-encoded categories
        random_state=42
    )
}

5. Model Training and Evaluation
 Train Models:
 Train selected models using the preprocessed data.
 Use cross-validation to tune model parameters.
 Evaluate Model:
 Evaluate performance using metrics like accuracy, precision, recall, and F1-score.
 If the dataset is imbalanced, apply techniques like SMOTE to balance it.

In [114]:
for name, model in models.items():
    print(f"\n=== {name} ===")
    
    # Special handling for CatBoost
    if name == 'CatBoost':
        # Convert categorical columns to string type
        X_train[categorical_features] = X_train[categorical_features].astype(str)
        X_test[categorical_features] = X_test[categorical_features].astype(str)
        
        model.fit(X_train, y_train)
    else:
        # One-hot encode categorical features for other models
        X_train_encoded = pd.get_dummies(X_train, columns=categorical_features)
        X_test_encoded = pd.get_dummies(X_test, columns=categorical_features)
        
        # Align columns between train and test
        X_test_encoded = X_test_encoded.reindex(columns=X_train_encoded.columns, fill_value=0)
        
        model.fit(X_train_encoded, y_train)
    
    # Cross-validate with regional stratification
    cv_scores = cross_val_score(
        model, X_train, y_train if name == 'CatBoost' else X_train_encoded,
        cv=5, scoring='f1_weighted',
        groups=data.loc[X_train.index, 'country']  # Keep countries together in folds
    )
    print(f"CV F1-weighted: {np.mean(cv_scores):.3f} ± {np.std(cv_scores):.3f}")
    
    # Evaluate on test set
    if name == 'CatBoost':
        y_pred = model.predict(X_test)
    else:
        y_pred = model.predict(X_test_encoded)
    
    print("Test Set Performance:")
    print(classification_report(y_test, y_pred))


=== CatBoost ===


CatBoostError: Bad value for num_feature[non_default_doc_idx=0,feature_idx=2]="Panthera onca": Cannot convert 'b'Panthera onca'' to float

6. Cross-Referencing with Complementary Data
 Integrate Meteorological Data:
 Merge weather data (e.g., temperature, humidity) with animal tracking data.
 Integrate Terrain Data:
 Merge terrain data (e.g., elevation, land use) with animal tracking data.
 Behavioral Biology Data:
 Incorporate knowledge from related behavioral biology studies (if available).

7. Model Deployment
 Model Evaluation: Confirm that the model generalizes well to new or unseen data.
 Deploy Model:
 Prepare the model for deployment in a real-time or batch setting for wildlife tracking.
 Implement a user interface or tool to apply the model to new data.