# This project involves developing a machine learning model to predict the number of turtles that will be rescued at each rescue site as part of the Local Ocean Conservation's by-catch release program. The following are the starter codes for the project.

# Step 1: Importing the necessary libraries and data
In this step, we import the necessary libraries for the project, including Pandas for data manipulation, NumPy for numerical computations, scikit-learn for machine learning, and Matplotlib for data visualization. We also import the rescue data in CSV format into a Pandas dataframe for analysis.

In [26]:
# Importing the necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Importing the data
df = pd.read_csv('train.csv')


# Step 2: Exploratory Data Analysis
In this step, we explore the data to gain insights into its features and structure. We use the head(), describe(), and info() methods to display the first few rows, statistical summary, and information about the dataframe, respectively.

In [19]:
df.head()

Unnamed: 0,Rescue_ID,Date_TimeCaught,Researcher,CaptureSite,ForagingGround,CaptureMethod,Fisher,LandingSite,Species,Tag_1,...,Lost_Tags,T_Number,CCL_cm,CCW_cm,Weight_Kg,Sex,TurtleCharacteristics,Status,ReleaseSite,Date_TimeRelease
0,2000_RE_0060,2000-12-22,Researcher_25,CaptureSite_0,Ocean,Net,Fisher_1072,LandingSite_CaptureSiteCategory_2,Species_6,CC00147,...,,,64.7,62.6,,Unknown,algae at rear of shell,Released,ReleaseSite_50,22/12/00
1,2001_RE_0187,2001-10-28,Researcher_6,CaptureSite_0,Ocean,Net,Fisher_520,LandingSite_CaptureSiteCategory_2,Species_6,W442,...,,,35.85,31.35,,Unknown,multiple b's on front flippers& a lot of alga...,Released,ReleaseSite_62,28/10/01
2,2001_RE_0197,2001-11-01,Researcher_6,CaptureSite_0,Ocean,Net,Fisher_1669,LandingSite_CaptureSiteCategory_2,Species_5,KE0376,...,,,51.8,49.2,,Unknown,clean,Released,ReleaseSite_50,01/11/01
3,2002_RE_0031,2002-03-11,Researcher_32,CaptureSite_0,Ocean,Net,Fisher_1798,LandingSite_CaptureSiteCategory_2,Species_6,CC00302,...,,,60.5,59.0,,Unknown,1 b 3 CS+ calcerous algae at rear end of shell...,Released,ReleaseSite_50,11/03/02
4,2002_RE_0118,2002-08-08,Researcher_25,CaptureSite_0,Ocean,Beached,Fisher_1918,LandingSite_CaptureSiteCategory_2,Species_5,NotTagged_0113,...,,,34.7,33.0,,Unknown,very lively+ right eye is hanging out + swolle...,Released,ReleaseSite_62,08/08/02


In [20]:
df.tail()

Unnamed: 0,Rescue_ID,Date_TimeCaught,Researcher,CaptureSite,ForagingGround,CaptureMethod,Fisher,LandingSite,Species,Tag_1,...,Lost_Tags,T_Number,CCL_cm,CCW_cm,Weight_Kg,Sex,TurtleCharacteristics,Status,ReleaseSite,Date_TimeRelease
18057,2018_RE_1511,2018-12-18,Researcher_30,CaptureSite_9,Ocean,Net,Fisher_569,LandingSite_CaptureSiteCategory_1,Species_5,KES1828,...,,,57.13,50.57,21.09,Unknown,White calcareous algae on carapace,Released,ReleaseSite_68,18/12/18
18058,2018_RE_1514,2018-12-18,Researcher_30,CaptureSite_9,Ocean,Net,Fisher_125,LandingSite_CaptureSiteCategory_1,Species_6,KES0563,...,KES0416,,42.07,38.37,9.02,Unknown,Calcareous + green algae on carapace\nBarnacle...,Released,ReleaseSite_68,18/12/18
18059,2018_RE_1532,2018-12-24,Researcher_30,CaptureSite_9,Ocean,Net,Fisher_1343,LandingSite_CaptureSiteCategory_1,Species_5,KES1833,...,,,57.2,52.3,,Unknown,Clean turtle,Released,ReleaseSite_68,24/12/18
18060,2018_RE_1533,2018-12-24,Researcher_30,CaptureSite_9,Ocean,Net,Fisher_1551,LandingSite_CaptureSiteCategory_1,Species_5,KES1831,...,,,51.9,48.5,,Unknown,Green algae on carapace\ntip of left supra mis...,Released,ReleaseSite_68,24/12/18
18061,2018_RE_1550,2018-12-28,Researcher_30,CaptureSite_9,Ocean,Net,Fisher_1551,LandingSite_CaptureSiteCategory_1,Species_6,KES1432,...,,,34.6,31.2,4.29,Unknown,Thick pink patches of calcareous algae on cara...,Released,ReleaseSite_37,28/12/18


In [21]:
df.describe()

Unnamed: 0,CCL_cm,CCW_cm,Weight_Kg
count,18038.0,18035.0,12653.0
mean,43.09039,40.253904,9.850731
std,11.004251,9.933058,9.737378
min,2.0,2.0,0.02
25%,36.33,34.0,5.0
50%,41.3,39.3,7.5
75%,47.0,44.1,10.8
max,122.75,106.0,140.0


In [22]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18062 entries, 0 to 18061
Data columns (total 21 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Rescue_ID              18062 non-null  object 
 1   Date_TimeCaught        18062 non-null  object 
 2   Researcher             18062 non-null  object 
 3   CaptureSite            18062 non-null  object 
 4   ForagingGround         18062 non-null  object 
 5   CaptureMethod          18062 non-null  object 
 6   Fisher                 18062 non-null  object 
 7   LandingSite            18062 non-null  object 
 8   Species                18062 non-null  object 
 9   Tag_1                  18062 non-null  object 
 10  Tag_2                  18062 non-null  object 
 11  Lost_Tags              925 non-null    object 
 12  T_Number               38 non-null     object 
 13  CCL_cm                 18038 non-null  float64
 14  CCW_cm                 18035 non-null  float64
 15  We

In [23]:
df.shape

(18062, 21)

In [25]:
df.dtypes

Rescue_ID                 object
Date_TimeCaught           object
Researcher                object
CaptureSite               object
ForagingGround            object
CaptureMethod             object
Fisher                    object
LandingSite               object
Species                   object
Tag_1                     object
Tag_2                     object
Lost_Tags                 object
T_Number                  object
CCL_cm                   float64
CCW_cm                   float64
Weight_Kg                float64
Sex                       object
TurtleCharacteristics     object
Status                    object
ReleaseSite               object
Date_TimeRelease          object
dtype: object

In [27]:
df.columns

Index(['Rescue_ID', 'Date_TimeCaught', 'Researcher', 'CaptureSite',
       'ForagingGround', 'CaptureMethod', 'Fisher', 'LandingSite', 'Species',
       'Tag_1', 'Tag_2', 'Lost_Tags', 'T_Number', 'CCL_cm', 'CCW_cm',
       'Weight_Kg', 'Sex', 'TurtleCharacteristics', 'Status', 'ReleaseSite',
       'Date_TimeRelease'],
      dtype='object')

# Step 3: Data Preprocessing and Feature Engineering
In this step, we preprocess the data to prepare it for machine learning modeling. We first drop the Date column since it is not relevant to the prediction task. We then encode the categorical variables using one-hot encoding to convert them into numerical features. Finally, we split the data into training and testing sets using the train_test_split() method from scikit-learn.

In [28]:
# Dropping irrelevant columns
data = data.drop(['Date_TimeCaught'], axis=1)

# Encoding categorical variables
data = pd.get_dummies(data, columns=['CaptureSite', 'CaptureMethod', 'Status', 'Species'])

# Splitting the data into training and testing sets
X = data.drop(['Captured_Number'], axis=1)
y = data['Captured_Number']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


KeyError: "['Date_TimeCaught'] not found in axis"

# Step 4: Model Training and Evaluation
In this step, we train a machine learning model on the preprocessed data and evaluate its performance using the root mean squared error (RMSE) metric. We use the linear regression algorithm as our model since it is a simple yet effective algorithm for regression tasks. We then use the predict() method to make predictions on the test set and calculate the RMSE using the mean_squared_error() and sqrt() functions from NumPy.

In [None]:
# Training a linear regression model
model = LinearRegression()
model.fit(X_train, y_train)

# Making predictions on the test set
y_pred = model.predict(X_test)

# Evaluating the model
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print('RMSE:', rmse)


In [None]:
#Step 4: Model Selection
#In this step, we will select the appropriate machine learning algorithm(s) to build the model and evaluate its performance. We will perform the following tasks:

#Split the data into training and testing sets
#Train different machine learning algorithms such as Linear Regression, Random Forest Regression, and XGBoost Regression on the training data
#Evaluate the performance of the models using the root mean squared error (RMSE) and R-squared (R2) values on the testing data
#Select the best-performing model(s) based on the evaluation metrics

import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
from sklearn.metrics import mean_squared_error, r2_score

read in the pre-processed data
df = pd.read_csv('preprocessed_data.csv')

split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df.drop('Captured_Number', axis=1),
df['Captured_Number'],
test_size=0.2,
random_state=42)

train a linear regression model
lr = LinearRegression()
lr.fit(X_train, y_train)

train a random forest regression model
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

train an XGBoost regression model
xgbr = xgb.XGBRegressor(objective='reg:squarederror', random_state=42)
xgbr.fit(X_train, y_train)

evaluate the performance of the models on the test set
lr_rmse = mean_squared_error(y_test, lr.predict(X_test), squared=False)
lr_r2 = r2_score(y_test, lr.predict(X_test))

rf_rmse = mean_squared_error(y_test, rf.predict(X_test), squared=False)
rf_r2 = r2_score(y_test, rf.predict(X_test))

xgbr_rmse = mean_squared_error(y_test, xgbr.predict(X_test), squared=False)
xgbr_r2 = r2_score(y_test, xgbr.predict(X_test))

print(f'Linear Regression RMSE: {lr_rmse}, R2: {lr_r2}')
print(f'Random Forest RMSE: {rf_rmse}, R2: {rf_r2}')
print(f'XGBoost RMSE: {xgbr_rmse}, R2: {xgbr_r2}')



# Step 5: Making Predictions and Creating Submission File
In this step, we use the trained model to make predictions

In [None]:
#Step 5: Model Optimization
#In this step, we will optimize the parameters of the selected model(s) to improve their performance. We will perform the following tasks:

#Tune the hyperparameters of the selected models using techniques such as grid search and random search
#Evaluate the performance of the optimized models on the test set
#Select the best-performing model(s) based on the evaluation metrics

import necessary libraries
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

tune the hyperparameters of the random forest model using grid search
param_grid = {
'n_estimators': [100, 200, 300],
'max_depth': [10, 20, 30],
'min_samples_split': [2, 4, 6],
'min_samples_leaf': [1, 2, 4]
}

rf_grid = GridSearchCV(RandomForestRegressor(random_state=42),
param_grid=param_grid,
cv=5,
n_jobs=-1,
verbose=2)
rf_grid.fit(X_train, y_train)

# Making predictions on the new data
new_data = pd.read_csv('new_data.csv')
new_data = pd.get_dummies(new_data, columns=['CaptureSite', 'RescueMethod', 'Status', 'TagType'])
new_X = new_data.drop(['Captured_Number'], axis=1)
new_y_pred = model.predict(new_X)

# Creating submission file
submission = pd.DataFrame({'ID': new_data['ID'], 'Captured_Number': new_y_pred})
submission.to_csv('submission.csv', index=False)
