# This project develops a machine learning model to predict the number of turtles that will be rescued at each rescue site as part of Local Ocean Conservation's by-catch release program. The following is the starter code for the project.

# Step 1: Importing the necessary libraries and data
In this step, we import the libraries the project needs: Pandas for data manipulation, NumPy for numerical computations, and scikit-learn for machine learning. Matplotlib can be used for data visualization, but it is not imported in this cell. We then load the rescue data from a CSV file into a Pandas dataframe for analysis.

In [1]:
# Importing the necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Load the rescue records from disk and preview the first five rows.
data = pd.read_csv(filepath_or_buffer='train.csv')
data.head(n=5)

ModuleNotFoundError: No module named 'sklearn'

# Step 2: Exploratory Data Analysis
In this step, we explore the data to gain insights into its features and structure. We use the head(), describe(), and info() methods to display the first few rows, statistical summary, and information about the dataframe, respectively.

In [None]:
# Exploring the data: first rows, summary statistics, then column dtypes
# and non-null counts.
print(data.head())
print(data.describe())
# DataFrame.info() writes its report straight to stdout and returns None,
# so wrapping it in print() would append a stray "None" line.
data.info()

      Rescue_ID Date_TimeCaught     Researcher    CaptureSite ForagingGround  \
0  2000_RE_0060      2000-12-22  Researcher_25  CaptureSite_0          Ocean   
1  2001_RE_0187      2001-10-28   Researcher_6  CaptureSite_0          Ocean   
2  2001_RE_0197      2001-11-01   Researcher_6  CaptureSite_0          Ocean   
3  2002_RE_0031      2002-03-11  Researcher_32  CaptureSite_0          Ocean   
4  2002_RE_0118      2002-08-08  Researcher_25  CaptureSite_0          Ocean   

  CaptureMethod       Fisher                        LandingSite    Species  \
0           Net  Fisher_1072  LandingSite_CaptureSiteCategory_2  Species_6   
1           Net   Fisher_520  LandingSite_CaptureSiteCategory_2  Species_6   
2           Net  Fisher_1669  LandingSite_CaptureSiteCategory_2  Species_5   
3           Net  Fisher_1798  LandingSite_CaptureSiteCategory_2  Species_6   
4       Beached  Fisher_1918  LandingSite_CaptureSiteCategory_2  Species_5   

            Tag_1  ... Lost_Tags T_Number CCL_cm  

# Step 3: Data Preprocessing and Feature Engineering
In this step, we preprocess the data to prepare it for machine learning modeling. We first drop the Date_TimeCaught column since it is not relevant to the prediction task. We then encode the categorical variables using one-hot encoding to convert them into numerical features. Finally, we split the data into training and testing sets using the train_test_split() method from scikit-learn.

In [None]:
# Name of the regression target and the columns that are one-hot encoded.
TARGET = 'Captured_Number'
CATEGORICAL_COLUMNS = ['CaptureSite', 'CaptureMethod', 'Status', 'Species']

# Fail fast, before `data` is modified, so the cell can simply be re-run
# once the target has been added.
# NOTE(review): the recorded output of this cell shows the target column
# missing from the loaded data; presumably it has to be derived (e.g. by
# counting rescues per capture site) before this step -- confirm.
if TARGET not in data.columns:
    raise KeyError(
        f"Target column {TARGET!r} is missing from the data; "
        "derive or merge it in before preprocessing."
    )

# Dropping irrelevant columns
data = data.drop(['Date_TimeCaught'], axis=1)

# Encoding categorical variables
data = pd.get_dummies(data, columns=CATEGORICAL_COLUMNS)

# Splitting the data into training and testing sets.
# Only numeric/boolean columns are kept as features: the remaining text
# columns shown in the preview (Rescue_ID, Researcher, Fisher, ...) cannot
# be passed to the regressors.
# NOTE(review): missing values are not handled here -- confirm whether the
# numeric columns contain NaNs before fitting.
X = data.drop([TARGET], axis=1).select_dtypes(include=['number', 'bool'])
y = data[TARGET]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


KeyError: "['Captured_Number'] not found in axis"

# Step 4: Model Training and Evaluation
In this step, we train a machine learning model on the preprocessed data and evaluate its performance using the root mean squared error (RMSE) metric. We use the linear regression algorithm as our model since it is a simple yet effective algorithm for regression tasks. We then use the predict() method to make predictions on the test set and calculate the RMSE using the mean_squared_error() function from scikit-learn and the sqrt() function from NumPy.

In [None]:
# Fit the baseline linear regression (fit() returns the estimator itself).
model = LinearRegression().fit(X_train, y_train)

# Score the held-out split.
y_pred = model.predict(X_test)

# RMSE is the square root of the mean squared error.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
print('RMSE:', rmse)


In [None]:
#Step 4: Model Selection
#In this step, we will select the appropriate machine learning algorithm(s) to build the model and evaluate its performance. We will perform the following tasks:

#Split the data into training and testing sets
#Train different machine learning algorithms such as Linear Regression, Random Forest Regression, and XGBoost Regression on the training data
#Evaluate the performance of the models using the root mean squared error (RMSE) and R-squared (R2) values on the testing data
#Select the best-performing model(s) based on the evaluation metrics

# Import the necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
from sklearn.metrics import mean_squared_error, r2_score

# Read in the pre-processed data.
# NOTE(review): no visible cell writes 'preprocessed_data.csv'; confirm the
# file is produced elsewhere before running this cell.
df = pd.read_csv('preprocessed_data.csv')

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    df.drop('Captured_Number', axis=1),
    df['Captured_Number'],
    test_size=0.2,
    random_state=42,
)

# Train a linear regression model
lr = LinearRegression()
lr.fit(X_train, y_train)

# Train a random forest regression model
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# Train an XGBoost regression model
xgbr = xgb.XGBRegressor(objective='reg:squarederror', random_state=42)
xgbr.fit(X_train, y_train)

# Evaluate the performance of the models on the test set.
# Each model predicts once and the predictions are reused for both metrics.
# RMSE is computed as sqrt(MSE) because the `squared=False` argument of
# mean_squared_error is deprecated and removed in recent scikit-learn.
lr_pred = lr.predict(X_test)
lr_rmse = np.sqrt(mean_squared_error(y_test, lr_pred))
lr_r2 = r2_score(y_test, lr_pred)

rf_pred = rf.predict(X_test)
rf_rmse = np.sqrt(mean_squared_error(y_test, rf_pred))
rf_r2 = r2_score(y_test, rf_pred)

xgbr_pred = xgbr.predict(X_test)
xgbr_rmse = np.sqrt(mean_squared_error(y_test, xgbr_pred))
xgbr_r2 = r2_score(y_test, xgbr_pred)

print(f'Linear Regression RMSE: {lr_rmse}, R2: {lr_r2}')
print(f'Random Forest RMSE: {rf_rmse}, R2: {rf_r2}')
print(f'XGBoost RMSE: {xgbr_rmse}, R2: {xgbr_r2}')



# Step 5: Making Predictions and Creating Submission File
In this step, we use the trained model to make predictions on the new data and write them to a submission file.

In [None]:
#Step 5: Model Optimization
#In this step, we will optimize the parameters of the selected model(s) to improve their performance. We will perform the following tasks:

#Tune the hyperparameters of the selected models using techniques such as grid search and random search
#Evaluate the performance of the optimized models on the test set
#Select the best-performing model(s) based on the evaluation metrics

# Import the necessary libraries
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

# Tune the hyperparameters of the random forest model using grid search.
# The grid has 3 * 3 * 3 * 3 = 81 combinations; with cv=5 that is 405 fits
# before the final refit, so this cell can take a while.
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, 30],
    'min_samples_split': [2, 4, 6],
    'min_samples_leaf': [1, 2, 4],
}

# No `scoring` is given, so candidates are ranked by the estimator's own
# score method (R^2 for a regressor).
rf_grid = GridSearchCV(
    RandomForestRegressor(random_state=42),
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
    verbose=2,
)
rf_grid.fit(X_train, y_train)

# Evaluate the tuned model on the test set. With the default refit=True,
# predict() delegates to the best estimator refitted on the full training
# split.
print('Best parameters:', rf_grid.best_params_)
rf_best_pred = rf_grid.predict(X_test)
rf_best_rmse = np.sqrt(mean_squared_error(y_test, rf_best_pred))
print('Tuned Random Forest RMSE:', rf_best_rmse)

# Making predictions on the new data
new_data = pd.read_csv('new_data.csv')

# One-hot encode the same categorical columns that were encoded for
# training; encoding a different set would produce features the model has
# never seen. Columns absent from the new file are skipped rather than
# raising inside get_dummies.
categorical_columns = ['CaptureSite', 'CaptureMethod', 'Status', 'Species']
present_columns = [col for col in categorical_columns if col in new_data.columns]
new_data = pd.get_dummies(new_data, columns=present_columns)

# Align the feature matrix with the columns the model was fitted on:
# dummy columns missing from the new data are added as 0, and anything the
# model was not trained on (identifiers, a target column if present,
# unseen categories) is dropped.
# NOTE(review): `model` is the baseline linear regression; the tuned
# `rf_grid` estimator is not used here -- confirm which model should
# produce the submission.
new_X = new_data.reindex(columns=model.feature_names_in_, fill_value=0)
new_y_pred = model.predict(new_X)

# Creating submission file
# NOTE(review): assumes the new data has an 'ID' column -- confirm against
# the file's actual schema.
submission = pd.DataFrame({'ID': new_data['ID'], 'Captured_Number': new_y_pred})
submission.to_csv('submission.csv', index=False)
