# <u>T-test and predictive logistic regression</u> 

In [None]:
# Importing Python packages and modules
!pip install imgkit
import numpy as np
import pandas as pd
import geopandas as gpd
import os
import rasterio
import matplotlib.pyplot as plt
import folium
import seaborn as sns
import warnings
import IPython.display as disp
import imgkit
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import RobustScaler
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_validate, GridSearchCV
from scipy.stats import ttest_ind
from IPython.display import FileLink
from shapely.geometry import Point
from sklearn.model_selection import cross_val_score
from pandas.api.types import CategoricalDtype

#### Importing the data

In [None]:
# Importing the buffer count (see Buffer_count.ipynb)
# NOTE(review): reading a .csv through gpd.read_file appears to load the columns as text,
# which would explain the explicit pd.to_numeric conversions further down — confirm.
buffer_metrics = gpd.read_file("buffer_metrics_lat_long.csv")

----------------------------------------------------

# Checking the data

In [None]:
# Displaying five randomly sampled rows as a quick sanity check
buffer_metrics.sample(5)

In [None]:
# Listing dtypes and non-null counts to confirm all records are present
buffer_metrics.info()

In [None]:
# Converting the ID and metric columns to numeric in one pass;
# values that cannot be parsed become NaN
numeric_columns = ['Point ID', 'Building count', 'Road length', 'Bathymetry mean', 'Other points']
buffer_metrics[numeric_columns] = buffer_metrics[numeric_columns].apply(
    pd.to_numeric, errors='coerce')

# Check again
print(buffer_metrics.dtypes)

In [None]:
# Checking the data lines up for a single Point ID: 5 strandings rows (same lat/long,
# different buffer sizes and counts) and the same for the corresponding random point
filtered_df = buffer_metrics[buffer_metrics['Point ID'] == 4451]
# Showing up to 10 rows for that Point ID
filtered_df.head(10)

------------------------------------------------------------------------------------

# Scaling and transforming the data

#### Pair Plot

In [None]:
# Removing the seaborn FutureWarning output for aesthetics
warnings.filterwarnings("ignore", category=FutureWarning, module="seaborn")

# Replacing inf with NaN in the DataFrame
buffer_metrics = buffer_metrics.replace([np.inf, -np.inf], np.nan)

# Sort so largest buffers are first, smallest last (drawn last = on top)
order = ['5000m', '3000m', '1500m', '1000m', '500m']
buffer_metrics_sorted = buffer_metrics.set_index('Buffer size').loc[order].reset_index()

# Setting the colours to make it easier to see
custom_palette = {
    '500m':   '#E41A1C',
    '1000m':  '#377EB8',
    '1500m':  '#4DAF4A',
    '3000m':  '#984EA3',
    '5000m':  '#FF7F00'}

# Creating the pairplot from the SORTED frame: the row order is what puts the
# small buffers on top, so plotting the unsorted frame would ignore the sort above
sns.pairplot(
    data=buffer_metrics_sorted,
    vars=['Building count', 'Road length', 'Bathymetry mean', 'Other points'],
    hue='Buffer size',
    palette=custom_palette)

# Saving and displaying the plot (creating the output folder first so savefig cannot
# fail on a fresh checkout)
os.makedirs("Method_results_images", exist_ok=True)
plt.savefig("Method_results_images/T_test_Pairplot_Before_Log_Transform.png", dpi=150, bbox_inches="tight")
plt.show()


#### The pairplot shows a strong positive linear relationship between building count and road length across all buffer sizes. Human presence (buildings and roads) is inversely related to bathymetry, with higher human presence associated with shallower coastal zones. Larger buffers (5000 m) dominate with higher values and greater bathymetric variation, likely from covering both land and sea, while smaller buffers cluster at lower values but follow the same trends. The “other points” metric is highly skewed, with most buffers containing few additional points.

#### Log-transform

In [None]:
# Making sure the four metric columns are numeric before computing correlations and logs
# NOTE(review): these columns are already coerced in the "Checking the data" section when
# the cells run in order, so this looks like a defensive repeat — confirm.
cols = ['Building count', 'Road length', 'Bathymetry mean', 'Other points']
buffer_metrics[cols] = buffer_metrics[cols].apply(pd.to_numeric, errors='coerce')

In [None]:
# Computing the pairwise correlation matrix of the four raw metrics
# (DataFrame.corr defaults to Pearson correlation)
buffer_metrics[['Building count', 'Road length', 'Bathymetry mean', 'Other points']].corr()

In [None]:
# Creating a copy of buffer_metrics so the transformed columns are added to df_log
# and the original frame is left untouched
df_log = buffer_metrics.copy()

In [None]:
# Log-transforming all four metrics with log1p (log(1 + x)) to reduce skewness.
# NOTE(review): np.log1p returns NaN for inputs below -1 and -inf at exactly -1;
# if 'Bathymetry mean' holds negative depths this would create missing values — confirm.
log_columns = {
    'Building count': 'Building_trans',
    'Road length': 'Road_trans',
    'Bathymetry mean': 'Bathymetry_trans',
    'Other points': 'Other_points_trans'}

for raw_column, transformed_column in log_columns.items():
    df_log[transformed_column] = np.log1p(df_log[raw_column])

#### Dealing with bathymetric null records

In [None]:
# Listing dtypes and non-null counts to spot columns with missing values
df_log.info()

In [None]:
# Checking the missing 480 Bathymetry_trans records (the same as the ones in the Buffer_count notebook)
# by displaying every row where the transformed bathymetry is NaN
df_log[df_log['Bathymetry_trans'].isna()]

In [None]:
# Removing the seaborn FutureWarning output for aesthetics
warnings.filterwarnings("ignore", category=FutureWarning, module="seaborn")

# Replacing inf with NaN in the source DataFrame (kept from the original cell)
buffer_metrics = buffer_metrics.replace([np.inf, -np.inf], np.nan)

# Sort so largest buffers are first, smallest last (drawn last = on top).
# The sort is applied to a plotting copy of df_log — the frame that is actually plotted —
# and df_log itself is left in its original row order because the later merge with
# buffer_metrics aligns rows by position.
order = ['5000m', '3000m', '1500m', '1000m', '500m']
df_log_sorted = (
    df_log.replace([np.inf, -np.inf], np.nan)
    .set_index('Buffer size')
    .loc[order]
    .reset_index())

# Setting the colours to make it easier to see the different clusters
custom_palette = {
    '500m':   '#E41A1C',
    '1000m':  '#377EB8',
    '1500m':  '#4DAF4A',
    '3000m':  '#984EA3',
    '5000m':  '#FF7F00'}

# Creating the pairplot of the log-transformed metrics
sns.pairplot(
    data=df_log_sorted,
    vars=['Building_trans', 'Road_trans', 'Bathymetry_trans', 'Other_points_trans'],
    hue='Buffer size',
    palette=custom_palette)

# Saving and displaying the plot (creating the output folder first so savefig cannot
# fail on a fresh checkout)
os.makedirs("Method_results_images", exist_ok=True)
plt.savefig("Method_results_images/T_test_Pairplot_After_Log_Transform.png", dpi=150, bbox_inches="tight")
plt.show()



#### Pairplot shows reduction in skewness, improved comparability across buffer sizes, more linear relationships, equalised influence of predictors and clearer density patterns.

-----------------------------------

# Scale Features

In [None]:
# The four log-transformed predictors that will be scaled
features_to_scale = ['Building_trans', 'Road_trans', 'Bathymetry_trans', 'Other_points_trans']

# Rebuilding df_log as a plain DataFrame holding only those four columns
# (the remaining columns are re-attached from buffer_metrics in the merge cell)
df_log = pd.DataFrame(df_log, columns=features_to_scale)

# Scaling the features with RobustScaler, fitted on all rows at once
scaler = RobustScaler()
scaled_values = scaler.fit_transform(df_log[features_to_scale])
df_log[features_to_scale] = scaled_values

In [None]:
# Previewing the first rows of the scaled features
df_log.head()

In [None]:
# Re-attaching the identifying and raw metric columns to the scaled features.
# Both frames get a fresh 0..n-1 index, so the column-wise concat pairs rows by position.
cols_to_add = ['Data', 'Buffer size', 'Buffer geometry', 'Point ID', 'Other points', 'Road length', 'Building count', 'Bathymetry mean', 'latitude','longitude']

original_columns = buffer_metrics[cols_to_add].reset_index(drop=True)
scaled_columns = df_log.reset_index(drop=True)

df_log = pd.concat([original_columns, scaled_columns], axis=1)

In [None]:
# Checking the new df with ten randomly sampled rows
df_log.sample(10)

In [None]:
# Checking all rows are intact (the bathymetry columns are expected to show 480 fewer
# non-null values). The shape is printed explicitly because a bare expression that is
# not the last line of a cell produces no output.
print(df_log.shape)
df_log.info()

In [None]:
# Summary statistics for the numeric columns of the merged frame
df_log.describe()

In [None]:
# Rebinding df_log to a copy of itself (contents are unchanged)
df_log = df_log.copy()

# Saving to CSV without the index column
df_log.to_csv('df_log.csv', index=False)

# Display download link
FileLink('df_log.csv')

-------------------

In [None]:
# I added the latitude and longitude for the points back into the DataFrame, and removed the larger 3000 and 5000 sized buffers
# NOTE(review): later cells still loop over the 3000 and 5000 buffer sizes, so the removal
# mentioned above may no longer apply to this file — confirm.
df = gpd.read_file("df_log.csv")

#### Creating a GeoDataFrame

In [None]:
# Making sure the latitude and longitude are numeric (unparseable values become NaN)
df["latitude"] = pd.to_numeric(df["latitude"], errors="coerce")
df["longitude"] = pd.to_numeric(df["longitude"], errors="coerce")

# Creating the Point geometry column in one vectorised call (x = longitude, y = latitude)
# instead of constructing a shapely Point row by row
df["Point geometry"] = gpd.points_from_xy(df["longitude"], df["latitude"])

# Making the GeoDataFrame, declaring the coordinates as WGS84 lon/lat
gdf = gpd.GeoDataFrame(df, geometry="Point geometry", crs="EPSG:4326")

# Preview
gdf.head()

In [None]:
# Listing dtypes and non-null counts of the GeoDataFrame
gdf.info()

In [None]:
# Confirming the coordinate reference system assigned to the GeoDataFrame
print("CRS:", gdf.crs)

#### Descriptive statistics and info for each buffer size, for both random and strandings, to get a feel for the breakdown of each.

In [None]:
# Turning the 'Buffer size' labels (e.g. '500m') into integers and storing them as an
# ordered categorical so the buffers always appear smallest to largest
buffer_order = [500, 1000, 1500, 3000, 5000]
buffer_as_int = gdf['Buffer size'].str.replace('m', '').astype(int)
gdf['Buffer size'] = pd.Categorical(buffer_as_int, categories=buffer_order, ordered=True)

# Metrics to describe: coerce to numeric, then drop rows missing any of them
metrics = ['Other points', 'Road length', 'Building count', 'Bathymetry mean']
gdf[metrics] = gdf[metrics].apply(pd.to_numeric, errors='coerce')
gdf = gdf.dropna(subset=metrics)

# Printing .describe() for every buffer size / point type combination
divider = '=' * 60
for buffer_size in buffer_order:
    in_buffer = gdf['Buffer size'] == buffer_size
    for data_type in ('Strandings points', 'Random points'):
        subset = gdf[in_buffer & (gdf['Data'] == data_type)]
        print(f"\n{divider}")
        print(f"Descriptive Stats for {data_type} at {buffer_size}m")
        print(subset[metrics].describe())

In [None]:
# Only strip the 'm' suffix when the column still holds raw strings;
# if it has already been converted this step is skipped
if gdf['Buffer size'].dtype == 'object':
    gdf['Buffer size'] = gdf['Buffer size'].str.replace('m', '').astype(int)

# Defining the buffer order and enforcing it as an ordered categorical
buffer_order = [500, 1000, 1500, 3000, 5000]
gdf['Buffer size'] = pd.Categorical(gdf['Buffer size'], categories=buffer_order, ordered=True)

# Printing .info() for every buffer size / point type combination
rule = "=" * 60
for buffer_size in buffer_order:
    in_buffer = gdf['Buffer size'] == buffer_size
    for data_type in ('Strandings points', 'Random points'):
        subset = gdf[in_buffer & (gdf['Data'] == data_type)]
        print("\n" + rule)
        print(f".info() for {data_type} at {buffer_size}m")
        print(rule)
        subset.info()

# Strandings points vs random points

In [None]:
# Silencing two known deprecation messages so the cell output stays readable
for noisy_message in (".*observed=False is deprecated.*",
                      ".*use_inf_as_na option is deprecated.*"):
    warnings.filterwarnings("ignore", message=noisy_message)

# Setting the buffer order and a string label ('500m', ...) used on the x-axis
buffer_order = [500, 1000, 1500, 3000, 5000]
gdf['Buffer size'] = pd.Categorical(gdf['Buffer size'], categories=buffer_order, ordered=True)
gdf['Buffer size (m)'] = gdf['Buffer size'].astype(str) + 'm'

# Ensuring metric columns are numeric, then dropping rows missing any of them
metrics = ['Other points', 'Road length', 'Building count', 'Bathymetry mean']
gdf[metrics] = gdf[metrics].apply(pd.to_numeric, errors='coerce')
gdf.dropna(subset=metrics, inplace=True)

# One title per metric, in the same order as `metrics`
titles = [
    'Other Points: Strandings vs Random',
    'Road Length: Strandings vs Random',
    'Building Count: Strandings vs Random',
    'Bathymetry Mean: Strandings vs Random'
]

# Plotting the barcharts on a 2x2 grid
sns.set(style="whitegrid")
palette = {'Strandings points': 'steelblue', 'Random points': 'darkorange'}
fig, axes = plt.subplots(2, 2, figsize=(16, 10))
axes = axes.flatten()
buffer_labels = [f"{b}m" for b in buffer_order]

for ax, metric, title in zip(axes, metrics, titles):
    # Each bar is the SUM of the metric over all points in that buffer size / point type
    grouped = gdf.groupby(['Buffer size (m)', 'Data'])[metric].sum().reset_index()
    # Ordered categorical so the bars follow buffer size rather than alphabetical order
    grouped['Buffer size (m)'] = pd.Categorical(
        grouped['Buffer size (m)'],
        categories=buffer_labels,
        ordered=True
    )
    sns.barplot(
        data=grouped,
        x='Buffer size (m)', y=metric, hue='Data',
        palette=palette, ax=ax
    )
    ax.set_title(title)
    ax.set_xlabel("Buffer Size")
    ax.set_ylabel(metric)
    ax.legend(title='Data Type')

plt.tight_layout()
plt.savefig("Method_results_images/T_test_Barcharts_Random_vs_Strandings.png", dpi=150, bbox_inches="tight")
# Visualising
plt.show()

#### Across all buffer sizes, strandings points consistently show higher counts of other points, road length, and building count compared to random points, with differences becoming more pronounced as buffer size increases. For bathymetry, random points are more often located in deeper water, which fits with our understanding that strandings occur in shallower waters. 

# Welch's t-tests
#### To see if strandings and random points differ significantly

In [None]:
# Listing the df_log columns available for the tests below
print(df_log.columns)

#### For each buffer size and metric, I compared strandings and random locations with Welch’s two-sample t-test (unequal variances), which is robust to differences in variance and sample size between groups.

In [None]:
# Defining the relevant (log-transformed, scaled) columns to test
metrics = ['Building_trans', 'Road_trans', 'Bathymetry_trans', 'Other_points_trans']
buffer_sizes = df_log['Buffer size'].unique()

# Storing the results
results = []

# Looping over the buffers for strandings and random, skipping any missing data
for buffer in buffer_sizes:
    # The two groups depend only on the buffer size, so they are built once per buffer
    # rather than once per metric
    in_buffer = df_log[df_log['Buffer size'] == buffer]
    strandings_df = in_buffer[in_buffer['Data'] == 'Strandings points']
    randoms_df = in_buffer[in_buffer['Data'] == 'Random points']

    for metric in metrics:
        if strandings_df.empty or randoms_df.empty:
            print(f"Skipping buffer {buffer}, metric {metric} (no data)")
            continue

        strandings = strandings_df[metric].dropna()
        randoms = randoms_df[metric].dropna()

        if strandings.empty or randoms.empty:
            print(f"Skipping buffer {buffer}, metric {metric} (no non-NA data)")
            continue

        # Running Welch's test (equal_var=False), and collecting the results
        t_stat, p_val = ttest_ind(strandings, randoms, equal_var=False)

        results.append({
            'Buffer size': buffer,
            'Metric': metric,
            't-statistic': round(t_stat, 3),
            'p-value': round(p_val, 3)})

# Creating a results DataFrame
t_test_df = pd.DataFrame(results)
print(t_test_df)

# Saving the table for the write up.
# 'Buffer size' holds strings such as '500m'; sorting them as text would place '500m'
# after '3000m', so the rows are ordered by the numeric buffer distance instead.
os.makedirs("Method_results_images", exist_ok=True)
buffer_metres = t_test_df['Buffer size'].str.rstrip('m').astype(int)
t_test_df_sorted = (
    t_test_df.assign(_buffer_metres=buffer_metres)
    .sort_values(['Metric', '_buffer_metres'])
    .drop(columns='_buffer_metres')
    .reset_index(drop=True))
t_test_df_sorted.to_html(
    "Method_results_images/T_tests_Results_By_Buffer.html",
    index=False)

#### Key points: Strandings locations appear more “human-present” (more nearby points, roads, buildings) and in shallower water than random locations, with the anthropogenic contrasts strongest at smaller spatial scales.

--------------------------------

In [None]:
# Rebinding t_test_df to a copy of itself (contents are unchanged)
t_test_df = t_test_df.copy()

# Saving to CSV without the index column
t_test_df.to_csv('t_test_df.csv', index=False)

# Display download link
FileLink('t_test_df.csv')

---------

# Cohen's d 
#### Carrying out Cohen's d effect-size analysis to quantify the size of the difference between strandings and random points.

In [None]:
# Defining the relevant (raw, untransformed) columns to test
metrics = ['Other points', 'Road length', 'Building count', 'Bathymetry mean']
buffer_sizes = buffer_metrics['Buffer size'].unique()

# Storing the results
results = []

# Looping over the buffers for strandings and random, skipping any missing data
for buffer in buffer_sizes:
    # The two groups depend only on the buffer size, so they are built once per buffer
    in_buffer = buffer_metrics[buffer_metrics['Buffer size'] == buffer]
    strandings_rows = in_buffer[in_buffer['Data'] == 'Strandings points']
    randoms_rows = in_buffer[in_buffer['Data'] == 'Random points']

    for metric in metrics:
        strandings = strandings_rows[metric].dropna()
        randoms = randoms_rows[metric].dropna()

        # Same guard as the Welch t-test cell: nothing to compare if a group is empty
        if strandings.empty or randoms.empty:
            print(f"Skipping buffer {buffer}, metric {metric} (no non-NA data)")
            continue

        # Running Welch's t-test (equal_var=False)
        t_stat, p_val = ttest_ind(strandings, randoms, equal_var=False)

        # Group means, sample standard deviations and sizes for the effect size
        mean1 = strandings.mean()
        mean2 = randoms.mean()
        std1 = strandings.std()
        std2 = randoms.std()
        n1 = len(strandings)
        n2 = len(randoms)

        # Pooling standard deviation
        pooled_std = np.sqrt(((n1 - 1)*std1**2 + (n2 - 1)*std2**2) / (n1 + n2 - 2))

        # Cohen's d = difference in means / pooled SD; NaN when the pooled SD is
        # zero or undefined (the comparison is False for NaN)
        if pooled_std > 0:
            cohens_d = (mean1 - mean2) / pooled_std
        else:
            cohens_d = np.nan

        # Collecting results
        results.append({
            'Buffer size': buffer,
            'Metric': metric,
            'Strandings mean': round(mean1, 3),
            'Randoms mean': round(mean2, 3),
            't-statistic': round(t_stat, 3),
            'p-value': round(p_val, 3),
            'Cohen d': round(cohens_d, 3)})

# Creating a DataFrame of results
cohens_d_df = pd.DataFrame(results)

# Showing results
print(cohens_d_df)

# Saving the results for write up.
# 'Buffer size' holds strings such as '500m'; sorting them as text would place '500m'
# after '3000m', so the rows are ordered by the numeric buffer distance instead.
os.makedirs("Method_results_images", exist_ok=True)
buffer_metres = cohens_d_df['Buffer size'].str.rstrip('m').astype(int)
cohens_d_df_sorted = (
    cohens_d_df.assign(_buffer_metres=buffer_metres)
    .sort_values(['Metric', '_buffer_metres'])
    .drop(columns='_buffer_metres')
    .reset_index(drop=True))
cohens_d_df_sorted.to_html(
    "Method_results_images/T_tests_Cohens_D_Results_By_Buffer.html",
    index=False)


#### Across all buffer sizes, strandings occur in environments with more nearby points, longer road length, and higher building counts than random locations, while being associated with shallower bathymetry.

------------------

# Visualising the results

In [None]:
# Making sure seaborn style is set
sns.set(style="whitegrid")

# Listing the metrics to plot
metrics = ['Building_trans', 'Road_trans', 'Bathymetry_trans', 'Other_points_trans']

# Creating the output folder up front so savefig cannot fail on a fresh checkout
os.makedirs("Method_results_images", exist_ok=True)

# Looping over those metrics: one grid of box plots (one panel per buffer size) each
for metric in metrics:
    g = sns.catplot(
        x="Data",
        y=metric,
        col="Buffer size",
        data=df_log,
        kind="box",
        col_wrap=3,
        height=4,
        palette="pastel",
        sharey=False)

    # Formatting the boxplots: leave room at the top for the overall title
    g.fig.subplots_adjust(top=0.85)
    g.fig.suptitle(f"{metric} by Data Type and Buffer Size", fontsize=16)
    g.set_axis_labels("Point Type", metric)

    # Saving and displaying the boxplots. The metric name is part of the filename so
    # each figure gets its own file; a single fixed name would be overwritten on every
    # iteration, leaving only the last metric on disk.
    plt.savefig(f"Method_results_images/T_test_Boxplots_Best_Buffer_{metric}.png", dpi=150, bbox_inches="tight")
    plt.show()

------------------

# Logistic Regression Classification

#### Predicting if a point is a random point or a stranding point to assess predictive performance 

In [None]:
# Preparing the data: binary target where 1 = stranding and 0 = random point.
# NOTE(review): df_log holds one row per point per buffer size, so rows belonging to the
# same point may fall on both sides of the random train/test split — confirm this is intended.
df_model = df_log.copy()
df_model['is_stranding'] = (df_model['Data'] == 'Strandings points').map({True: 1, False: 0})

# Defining the features and dropping rows with any of them missing
features = ['Building_trans', 'Road_trans', 'Bathymetry_trans', 'Other_points_trans']
df_model = df_model.dropna(subset=features + ['is_stranding'])

# Defining inputs and target
X = df_model[features]
y = df_model['is_stranding']

# 70/30 train/test split, stratified so both parts keep the same class balance
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, stratify=y, random_state=42)

# 5-fold stratified cross-validation of a default logistic regression on the training set
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
base_lr = LogisticRegression(max_iter=1000)
cv_scores = cross_validate(
    base_lr, X_train, y_train,
    cv=cv,
    scoring=['accuracy', 'roc_auc'],
    return_train_score=False)

# Reporting mean ± std across folds to show average performance and variability
for label, key in (('Accuracy', 'test_accuracy'), ('ROC AUC', 'test_roc_auc')):
    fold_scores = cv_scores[key]
    print(f"CV {label}: {fold_scores.mean():.3f} ± {fold_scores.std():.3f}")

# Small hyperparameter search for the regularisation strength C, scored by ROC AUC
param_grid = {'C': np.logspace(-3, 3, 7)}  # 0.001 ... 1000
grid = GridSearchCV(
    LogisticRegression(max_iter=1000),
    param_grid=param_grid,
    scoring='roc_auc',
    cv=cv,
    n_jobs=-1)
grid.fit(X_train, y_train)
best_model = grid.best_estimator_
print(f"Best C from CV: {grid.best_params_['C']}  |  Best CV ROC AUC: {grid.best_score_:.3f}")

# Fitting the best model on the full training set, then evaluating on the held-out test set
best_model.fit(X_train, y_train)
y_pred = best_model.predict(X_test)              # hard class labels
y_prob = best_model.predict_proba(X_test)[:, 1]  # probability of the positive class

# Displaying the results
print("Test Accuracy:", round(accuracy_score(y_test, y_pred), 3))
print("Test ROC AUC:", round(roc_auc_score(y_test, y_prob), 3))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

# Creating a confusion matrix heatmap for the write up
class_names = ["Random", "Stranding"]
cm = confusion_matrix(y_test, y_pred)
sns.heatmap(
    cm, annot=True, fmt='d', cmap='Blues',
    xticklabels=class_names, yticklabels=class_names)
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.title("Confusion Matrix (Logistic Regression)")
os.makedirs("Method_results_images", exist_ok=True)
plt.savefig("Method_results_images/T_test_Confusion_Matrix_LogReg_CV.png", dpi=150, bbox_inches="tight")
plt.show()


#### Results suggest logistic regression is moderately good at separating the classes but isn't highly predictive (Accuracy ~65%, ROC AUC ~0.71). It distinguishes between random and stranding points better than chance, but there’s substantial overlap between the classes, leading to balanced but modest precision/recall for both.

## Random Forest

#### To compare a linear and a non-linear approach, we trained a Random Forest classifier on the same feature set and evaluated it both on the hold-out test set and with 5-fold stratified cross-validation.

In [None]:
# Preparing the data: binary target where 1 = stranding and 0 = random point.
# NOTE(review): df_log holds one row per point per buffer size, so rows belonging to the
# same point may fall on both sides of the random train/test split — confirm this is intended.
df_model = df_log.copy()
df_model['is_stranding'] = (df_model['Data'] == 'Strandings points').map({True: 1, False: 0})

# Defining the features and dropping rows with any of them missing
features = ['Building_trans', 'Road_trans', 'Bathymetry_trans', 'Other_points_trans']
df_model = df_model.dropna(subset=features + ['is_stranding'])

# Defining inputs and target
X = df_model[features]
y = df_model['is_stranding']

# 70/30 train/test split, stratified so both parts keep the same class balance
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, stratify=y, random_state=42)

# 5-fold stratified cross-validation of a 300-tree forest on the training set
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
base_rf = RandomForestClassifier(
    n_estimators=300, random_state=42, n_jobs=-1)
cv_scores = cross_validate(
    base_rf, X_train, y_train,
    cv=cv,
    scoring=['accuracy', 'roc_auc'],
    return_train_score=False,
    n_jobs=-1)

# Reporting mean ± std across folds to show average performance and variability
for label, key in (('Accuracy', 'test_accuracy'), ('ROC AUC', 'test_roc_auc')):
    fold_scores = cv_scores[key]
    print(f"RF CV {label}: {fold_scores.mean():.3f} ± {fold_scores.std():.3f}")

# Small hyperparameter search scored by ROC AUC
param_grid = {
    'n_estimators': [300, 600],
    'max_depth': [None, 10, 20],
    'min_samples_leaf': [1, 3, 5],
    'max_features': ['sqrt', 0.5]}  # 'sqrt' or 50% of features

grid = GridSearchCV(
    RandomForestClassifier(random_state=42, n_jobs=-1),
    param_grid=param_grid,
    scoring='roc_auc',
    cv=cv,
    n_jobs=-1)
grid.fit(X_train, y_train)
best_rf = grid.best_estimator_
print(f"Best RF params: {grid.best_params_}")
print(f"Best CV ROC AUC: {grid.best_score_:.3f}")

# Fitting the best Random Forest on the training set; evaluating on the test set
best_rf.fit(X_train, y_train)
y_pred = best_rf.predict(X_test)              # hard class labels
y_prob = best_rf.predict_proba(X_test)[:, 1]  # probability of the positive class

# Displaying the results
print("Test Accuracy:", round(accuracy_score(y_test, y_pred), 3))
print("Test ROC AUC:", round(roc_auc_score(y_test, y_prob), 3))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

# Creating a confusion matrix heatmap for the write up
class_names = ["Random", "Stranding"]
cm = confusion_matrix(y_test, y_pred)
sns.heatmap(
    cm, annot=True, fmt='d', cmap='Blues',
    xticklabels=class_names, yticklabels=class_names)
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.title("Confusion Matrix (Random Forest)")
os.makedirs("Method_results_images", exist_ok=True)
plt.savefig("Method_results_images/T_test_Confusion_Matrix_Random_Forest_CV.png", dpi=150, bbox_inches="tight")
plt.show()


#### The Random Forest model significantly outperforms Logistic Regression (Accuracy: 80% vs. 65%, ROC AUC: 0.885 vs. 0.707). It achieves balanced precision/recall for both classes and greatly reduces misclassifications, showing it captures the relationship between predictors and strandings far better than the linear model.

In [None]:
# Ensure output folder exists
os.makedirs("Method_results_images", exist_ok=True)

features = ['Building_trans', 'Road_trans', 'Bathymetry_trans', 'Other_points_trans']
buffer_sizes = ['500m', '1000m', '1500m', '3000m', '5000m']

# One row per (buffer size, model) with hold-out and cross-validated scores
results_list = []

for buffer in buffer_sizes:
    # Subset to this buffer size, drop incomplete rows and build the binary target
    subset = df_log[df_log['Buffer size'] == buffer].dropna(subset=features + ['Data']).copy()
    subset['is_stranding'] = (subset['Data'] == 'Strandings points').astype(int)

    X = subset[features]
    y = subset['is_stranding']

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, stratify=y, random_state=42
    )

    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    # Both models go through exactly the same procedure so their scores are comparable
    models = {
        "Logistic Regression": LogisticRegression(max_iter=1000),
        "Random Forest": RandomForestClassifier(random_state=42),
    }

    for model_name, model in models.items():
        # Cross-validation uses the TRAINING split only for both models, so the hold-out
        # rows never contribute to the CV scores. A single cross_validate call scores
        # accuracy and ROC AUC on the same folds.
        cv_scores = cross_validate(
            model, X_train, y_train,
            cv=cv,
            scoring=['accuracy', 'roc_auc'],
            n_jobs=-1)
        cv_acc = cv_scores['test_accuracy']
        cv_auc = cv_scores['test_roc_auc']

        # Hold-out evaluation: fit on the training split, score on the test split
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        y_prob = model.predict_proba(X_test)[:, 1]

        results_list.append({
            "Buffer size (m)": buffer.replace("m",""),
            "Model": model_name,
            "Accuracy": round(accuracy_score(y_test, y_pred), 3),
            "ROC AUC": round(roc_auc_score(y_test, y_prob), 3),
            "CV Accuracy": f"{cv_acc.mean():.3f} ± {cv_acc.std():.3f}",
            "CV ROC AUC": f"{cv_auc.mean():.3f} ± {cv_auc.std():.3f}"
        })

# Convert to DataFrame
results_df = pd.DataFrame(results_list)

# Save as an HTML table
html_path = "Method_results_images/T_test_LogReg_vs_RanFor_By_Buffer.html"
results_df.to_html(html_path, index=False, justify="center")

# Show nice table (disp is IPython.display, imported at the top of the notebook)
disp.display(results_df)



In [None]:
# Create performance data for plotting, taken directly from results_df (built in the
# per-buffer modelling cell) so the figures always reflect the latest model run instead
# of numbers copied by hand from an earlier output.
buffer_sizes = ['500m', '1000m', '1500m', '3000m', '5000m']


def _cv_mean(cv_column):
    """Return the numeric mean from 'mean ± std' strings such as '0.753 ± 0.004'."""
    return cv_column.astype(str).str.split('±').str[0].str.strip().astype(float)


def _performance_frame(model_name):
    """Build one row per buffer size with the four scores of `model_name`."""
    rows = results_df[results_df['Model'] == model_name]
    frame = pd.DataFrame({
        # results_df stores the buffer size without the trailing 'm'
        'Buffer Size': rows['Buffer size (m)'].astype(str) + 'm',
        'Accuracy': rows['Accuracy'].astype(float),
        'ROC AUC': rows['ROC AUC'].astype(float),
        'CV Accuracy': _cv_mean(rows['CV Accuracy']),
        'CV ROC AUC': _cv_mean(rows['CV ROC AUC'])})
    # A left merge on the ordered buffer list fixes the row order used on the x-axis
    return pd.DataFrame({'Buffer Size': buffer_sizes}).merge(
        frame, on='Buffer Size', how='left')


# Assemble into DataFrames for plotting
df_logreg = _performance_frame('Logistic Regression')
df_rf = _performance_frame('Random Forest')


In [None]:
# Creating the visualisation: one panel per score, each comparing hold-out (solid) and
# cross-validated (dashed) results for Logistic Regression and Random Forest
plt.figure(figsize=(12, 6))

for panel, score in enumerate(['Accuracy', 'ROC AUC'], start=1):
    cv_score = f'CV {score}'
    plt.subplot(1, 2, panel)
    plt.plot(df_logreg['Buffer Size'], df_logreg[score], marker='o', label=f'LogReg {score}')
    plt.plot(df_logreg['Buffer Size'], df_logreg[cv_score], marker='o', linestyle='--', label=f'logreg {cv_score}')
    plt.plot(df_rf['Buffer Size'], df_rf[score], marker='o', label=f'RF {score}')
    plt.plot(df_rf['Buffer Size'], df_rf[cv_score], marker='o', linestyle='--', label=f'RF {cv_score}')
    plt.title(f"Model {score} vs Buffer Size")
    plt.xlabel("Buffer Size")
    plt.ylabel(score)
    # Same y-range on both panels so the two scores can be compared by eye
    plt.ylim(0.6, 1.0)
    plt.grid(True)
    plt.legend()

plt.tight_layout()
plt.savefig("Method_results_images/T_test_linegraph_Model_Accuracy_ROCAUC.png", dpi=150, bbox_inches="tight")
plt.show()

#### These plots show how prediction performance changes with buffer size. Logistic Regression performs worse as buffers get larger, with both accuracy and ROC AUC dropping, while Random Forest improves steadily, reaching very high accuracy and discrimination (ROC AUC) at larger buffers. This suggests Random Forest captures complex, non-linear relationships that Logistic Regression misses.

## Mapping which points were predicted correctly for the larger 5000m buffer with Random Forest

In [None]:
# Setting the buffer size and features to model
buffer = "5000m"
features = ['Building_trans','Road_trans','Bathymetry_trans','Other_points_trans']

# Subset to the chosen buffer, drop incomplete rows and build the binary target
subset = (df_log[df_log['Buffer size'] == buffer]
          .dropna(subset=features + ['Data'])
          .copy())
subset['is_stranding'] = (subset['Data'] == 'Strandings points').astype(int)

X = subset[features]
y = subset['is_stranding']

# Running split + fit for Random Forest
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, stratify=y, random_state=42)

rf = RandomForestClassifier(
    n_estimators=600,
    max_features='sqrt',
    min_samples_leaf=1,
    random_state=42,
    n_jobs=-1)
rf.fit(X_train, y_train)

# Predict on test
y_pred = rf.predict(X_test)

# Creating a results DataFrame: the test rows plus true label, prediction and a hit flag.
# subset.loc[X_test.index] keeps the X_test row order, so y_pred lines up positionally.
test_idx = X_test.index
results = subset.loc[test_idx].copy()
results['y_true'] = y_test
results['y_pred'] = y_pred
results['correct'] = (results['y_true'] == results['y_pred'])

# Coordinates: df_log already carries 'latitude' and 'longitude' (re-attached from
# buffer_metrics in the merge cell), so they are used directly. The previous fallback
# merged gdf geometry on ['Point ID', 'Buffer size'], but gdf stores the buffer size as
# integers while df_log stores strings like '5000m', and gdf holds both a stranding and
# a random row per key, so that merge could only fail or duplicate test rows.
for col in ['latitude','longitude']:
    cleaned = results[col].astype(str).str.strip().str.replace(',', '', regex=False)
    results[col] = pd.to_numeric(cleaned, errors='coerce')

# Rows without usable coordinates cannot be placed on the map
results = results.dropna(subset=['latitude','longitude'])

# Map centred on the mean position of the test points
map_center = [results['latitude'].mean(), results['longitude'].mean()]
m = folium.Map(location=map_center, zoom_start=5, tiles="CartoDB positron")

# Building the outcomes
def outcome(r):
    """Label a row as 'True class→Predicted class' from its y_true / y_pred values."""
    if r['y_true']==1 and r['y_pred']==1: return "Stranding→Stranding"
    if r['y_true']==0 and r['y_pred']==1: return "Random→Stranding"
    if r['y_true']==1 and r['y_pred']==0: return "Stranding→Random"
    return "Random→Random"

results['outcome'] = results.apply(outcome, axis=1)

# Outcome name -> (marker colour, whether the layer is shown when the map opens)
layers = {
    "Stranding→Stranding" : ("#1b9e77", True),
    "Random→Stranding"    : ("#d95f02", True),
    "Stranding→Random"    : ("#7570b3", True),
    "Random→Random"       : ("#66a61e", False)}

# One toggleable feature group per outcome
fgs = {name: folium.FeatureGroup(name=name, show=show) for name, (_, show) in layers.items()}

for _, r in results.iterrows():
    color = layers[r['outcome']][0]
    folium.CircleMarker(
        [float(r['latitude']), float(r['longitude'])],
        radius=4, color=color, fill=True, fill_opacity=0.7,
        popup=f"{r['outcome']} | True: {int(r['y_true'])}, Pred: {int(r['y_pred'])}"
    ).add_to(fgs[r['outcome']])

for fg in fgs.values():
    fg.add_to(m)

# Adding clickable layer control
folium.LayerControl(collapsed=False).add_to(m)

# Adding legend: one colour swatch per outcome with the number of test points in it
counts = results['outcome'].value_counts()
legend_rows = ''.join(
    f'<div style="display:flex; align-items:center; margin:3px 0;">'
    f'<span style="display:inline-block; width:14px; height:14px; '
    f'background:{layers[name][0]}; margin-right:6px; border:1px solid #888;"></span>'
    f'{name} (n={int(counts.get(name,0))})</div>'
    for name in layers.keys()
)
legend_html = f"""
<div style="
  position: fixed; bottom: 20px; left: 20px; z-index: 9999;
  background: white; padding: 10px 12px; border: 1px solid #ccc;
  border-radius: 6px; box-shadow: 0 2px 6px rgba(0,0,0,0.15);
  font-size: 13px;">
  <div style="font-weight:600; margin-bottom:6px;">Prediction outcome (Random Forest)</div>
  {legend_rows}
</div>
"""
m.get_root().html.add_child(folium.Element(legend_html))

# Displaying the map
m


In [None]:
# Saving the map as a standalone HTML file in the results folder
m.save("Method_results_images/T_test_Random_Forest_Predictions_Outcome.html")

#### Saved as a link with Netlify [Random Forest predictions outcome link](https://random-forest-predictions.netlify.app/)

# Continues in 3.Kmeans notebook