In [32]:
import pandas as pd
import numpy as np
from datetime import datetime
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import cross_val_score, KFold
import joblib

In [3]:
# Set random seed for reproducibility
seed = 0
np.random.seed(seed)

In [4]:
def average_time_str(t1, t2):
    """
    Averages two time strings in the format '%H:%M:%S.%f' by computing the midpoint.
    """
    dt1 = datetime.strptime(t1.strip(), "%H:%M:%S.%f")
    dt2 = datetime.strptime(t2.strip(), "%H:%M:%S.%f")
    
    # Convert times to total seconds since midnight
    seconds1 = dt1.hour * 3600 + dt1.minute * 60 + dt1.second + dt1.microsecond / 1e6
    seconds2 = dt2.hour * 3600 + dt2.minute * 60 + dt2.second + dt2.microsecond / 1e6
    
    # Compute the average seconds
    avg_seconds = (seconds1 + seconds2) / 2.0
    
    # Convert average seconds back into hours, minutes, seconds, microseconds
    hours = int(avg_seconds // 3600)
    remainder = avg_seconds - hours * 3600
    minutes = int(remainder // 60)
    seconds = remainder - minutes * 60
    sec_int = int(seconds)
    microsec = int(round((seconds - sec_int) * 1e6))
    
    new_dt = datetime(1900, 1, 1, hour=hours, minute=minutes, second=sec_int, microsecond=microsec)
    return new_dt.strftime("%H:%M:%S.%f")

In [5]:
def generate_synthetic_sample_for_group(group, numeric_cols, noise_std = 1):
    """
    Generates one synthetic sample from the provided group:
      - Randomly selects two rows from the group.
      - Averages numeric columns and adds Gaussian noise.
      - Averages the 'Ping_time' column (computes midpoint).
      - Retains other non-numeric columns from the first sampled row.
    Returns the synthetic sample as a dictionary.
    """
    # Randomly select 2 rows (the global seed ensures reproducibility)
    sampled = group.sample(n=2, random_state=seed)
    
    # Average numeric columns and add Gaussian noise
    avg_numeric = sampled[numeric_cols].mean()
    noise = np.random.normal(loc=0.0, scale=noise_std, size=avg_numeric.shape)
    avg_numeric_noisy = avg_numeric + noise
    
    synthetic_sample = {}
    for col in group.columns:
        if col in numeric_cols:
            synthetic_sample[col] = avg_numeric_noisy[col]
        elif col == 'Ping_time':
            t1 = sampled.iloc[0]['Ping_time']
            t2 = sampled.iloc[1]['Ping_time']
            synthetic_sample[col] = average_time_str(t1, t2)
        else:
            # Retain non-numeric columns from the first sampled row
            synthetic_sample[col] = sampled.iloc[0][col]
            
    # synthetic_sample['isSynthetic'] = True
    return synthetic_sample

def generate_synthetic_samples(df, target_class, num_samples, noise_std = 1):
    """
    Generates a specified number of synthetic samples for the target class.
      - Filters the dataframe for the target class.
      - Groups data by 'fishNum' (only groups with at least 2 records are used).
      - Randomly picks a group (with replacement) and generates a synthetic sample.
    Returns a DataFrame with the synthetic samples.
    """
    df_class = df[df['Spe'] == target_class]
    # Keep groups that have at least 2 records
    groups = [group for _, group in df_class.groupby('fishNum') if len(group) >= 2]
    
    if not groups:
        print(f"No groups with at least 2 records found for class {target_class}.")
        return pd.DataFrame()
    
    # Identify numeric columns (used for averaging)
    numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    
    synthetic_samples = []
    for _ in range(num_samples):
        # Randomly select one group (with replacement)
        group = groups[np.random.randint(len(groups))]
        synthetic_sample = generate_synthetic_sample_for_group(group, numeric_cols, noise_std=noise_std)
        synthetic_samples.append(synthetic_sample)
    
    return pd.DataFrame(synthetic_samples)


In [6]:
# Load the dataset
data_path = "ProcessedData/AllFishCombined_filtered.csv"
df = pd.read_csv(data_path, low_memory=False)

# Reorder the columns for a better overview

new_order = ['fishNum', 'Spe', 'Index'] + [col for col in df.columns if col not in ['fishNum', 'Spe', 'Index']]
df = df[new_order]

# Filter to keep only the classes "LT" (major) and "SMB" (minor)
df = df[df['Spe'].isin(['LT', 'SMB'])]

# Exclude fishNum LT008 and LT016 due to missing values
df = df[~df['fishNum'].isin(['LT008', 'LT016'])]

df.head()

Unnamed: 0,fishNum,Spe,Index,totalLength,weight,sex,airbladderTotalLength,Ping_time,F45,F45.5,...,F255.5,F256,F256.5,F257,F257.5,F258,F258.5,F259,F259.5,F260
288,LT001,LT,1,454,930,2,75,20:57:29.4800,-36.179106,-36.156327,...,-22.723915,-22.488092,-22.628031,-22.622146,-22.040287,-20.768744,-19.238734,-17.748017,-17.375191,-17.471192
289,LT001,LT,1,454,930,2,75,20:57:29.6040,-36.436682,-36.318622,...,-14.608002,-13.226696,-12.105007,-11.121532,-10.518247,-10.105918,-9.615784,-8.77049,-8.107607,-7.145183
290,LT001,LT,1,454,930,2,75,20:57:29.7290,-36.143429,-35.675405,...,-23.780087,-21.908773,-19.930888,-18.070377,-16.854067,-16.172496,-15.871727,-15.763491,-15.611684,-13.941423
291,LT001,LT,1,454,930,2,75,20:57:29.8560,-34.455303,-34.397209,...,-36.139492,-34.982741,-34.258071,-34.247907,-35.577816,-38.674431,-45.200419,-45.788053,-37.204571,-32.093674
292,LT001,LT,1,454,930,2,75,20:57:29.9910,-40.787544,-41.273102,...,-26.510093,-27.092194,-27.751359,-27.619687,-26.436497,-24.837284,-23.678143,-22.888074,-22.87188,-22.493051


In [7]:
# Total count for each Spe
spe_counts = df['Spe'].value_counts().reset_index()
spe_counts.columns = ['Spe', 'total_count']
print("Total count for each Spe:")
print(spe_counts)

Total count for each Spe:
   Spe  total_count
0   LT        29321
1  SMB        16148


In [8]:
# Total count the number of rows per group
overview_df = df.groupby('fishNum').size().reset_index(name='row_count')
print(overview_df)

   fishNum  row_count
0    LT001        485
1    LT002       1502
2    LT003       3983
3    LT004       1066
4    LT005        585
5    LT006        320
6    LT007        729
7    LT009       2977
8    LT010       1932
9    LT011       1878
10   LT012       1534
11   LT013       2540
12   LT014       3886
13   LT015       3182
14   LT017       1040
15   LT018        124
16   LT019        609
17   LT020        585
18   LT021        364
19  SMB001       4097
20  SMB002       2004
21  SMB003       1034
22  SMB004       1804
23  SMB005       1014
24  SMB006       3912
25  SMB007        636
26  SMB008        739
27  SMB009         31
28  SMB010        159
29  SMB011         95
30  SMB012        183
31  SMB013        440


In [9]:
samples_per_class = {
    'LT': 10679,   # synthetic samples for class 'LT'
    'SMB': 23852   # synthetic samples for class 'SMB'
}
# After resample, there are 40000 rows per class

synthetic_samples_all = []
for target_class, num_samples in samples_per_class.items():
    synthetic_df = generate_synthetic_samples(df, target_class, num_samples, noise_std=0.01)
    synthetic_samples_all.append(synthetic_df)

# Combine synthetic samples from all classes
all_synthetic_df = pd.concat(synthetic_samples_all, ignore_index=True)

# Combine synthetic samples with the original data to create an augmented dataset
augmented_df = pd.concat([df, all_synthetic_df], ignore_index=True)

### Data Preparation for ML-Classification

In [10]:
# Define columns to drop from predictors
drop_cols = ['fishNum', 'Spe', 'Index', 'Ping_time']
# Create the feature matrix (drop non-predictive columns)
X = augmented_df.drop(columns=drop_cols, errors='ignore')
# The target variable is the Spe column
y = augmented_df['Spe']

In [11]:
# Define 5-Fold Cross-Validation with shuffling
kf = KFold(n_splits=5, shuffle=True, random_state=seed)

### Random Forest Approach

In [12]:
# Initialize the RandomForestClassifier with a fixed random state
rf = RandomForestClassifier(n_estimators=100, random_state=seed)

In [13]:
# Compute the Cross-Validation Accuracy Scores:
cv_scores = cross_val_score(rf, X, y, cv=kf, scoring='accuracy')

print("Cross-Validation Accuracy Scores:", cv_scores)
print("Mean Cross-Validation Accuracy:", np.mean(cv_scores))

Cross-Validation Accuracy Scores: [0.9984375 0.9980625 0.9978125 0.9983125 0.9981875]
Mean Cross-Validation Accuracy: 0.9981625


In [14]:
# Fit the model on the entire dataset
rf.fit(X, y)

In [15]:
# Retrieve the feature importances from the model
feature_importances = rf.feature_importances_
features = X.columns
# Create a DataFrame for a more readable view of feature importances
importance_df = pd.DataFrame({'Feature': features, 'Importance': feature_importances})
importance_df = importance_df.sort_values(by='Importance', ascending=False)

print("\nTop 20 Features based on Random Forest Decision Importances:")
print(importance_df.head(20))


Top 20 Features based on Random Forest Decision Importances:
                   Feature  Importance
3    airbladderTotalLength    0.241597
0              totalLength    0.159579
1                   weight    0.053623
2                      sex    0.035260
6                      F46    0.017738
5                    F45.5    0.011919
4                      F45    0.010455
235                 F243.5    0.008742
187                 F219.5    0.008533
243                 F247.5    0.008021
188                   F220    0.007453
227                 F239.5    0.007332
189                 F220.5    0.007125
64                     F75    0.006668
267                 F259.5    0.006177
190                   F221    0.005702
93                   F89.5    0.005693
228                   F240    0.005417
234                   F243    0.005295
42                     F64    0.005180


### Multilayer Perceptron (MLP) Approach

In [16]:
# Scale features; MLPs typically perform better with standardized data.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [17]:
# Define a 3-Layer MLP Model, with sizes 64, 32, and 16 respectively.
mlp = MLPClassifier(hidden_layer_sizes=(64, 32, 16), random_state = seed, max_iter=200)

In [18]:
# 5-Fold Cross-Validation for Accuracy
cv_scores = cross_val_score(mlp, X_scaled, y, cv=kf, scoring='accuracy')

print("5-Fold Cross-Validation Accuracy Scores (MLP):")
print(cv_scores)
print("Mean Accuracy:", np.mean(cv_scores))

5-Fold Cross-Validation Accuracy Scores (MLP):
[0.9995    0.9994375 0.9995    0.9989375 0.9989375]
Mean Accuracy: 0.9992625


In [19]:
# Train the Final Model on the Entire Dataset
mlp.fit(X_scaled, y)

In [20]:
# Estimate Feature Importance from the MLP
input_weights = mlp.coefs_[0]  # Shape: (n_features, number of neurons in first hidden layer)
feature_importance = np.abs(input_weights).sum(axis=1)  # Feature importances from MLP is provided via the absolute sum of the weights from the input layer

# Create a DataFrame for easier viewing
importance_df = pd.DataFrame({
    'Feature': X.columns,
    'Importance': feature_importance
}).sort_values(by='Importance', ascending=False)

print("\nTop 20 Features based on MLP Input Weights:")
print(importance_df.head(20))


Top 20 Features based on MLP Input Weights:
                   Feature  Importance
0              totalLength   59.217752
1                   weight   46.497941
3    airbladderTotalLength   38.232832
2                      sex   18.062693
20                     F53    5.747837
263                 F257.5    5.709038
264                   F258    5.633609
231                 F241.5    5.617083
254                   F253    5.500850
176                   F214    5.474938
268                   F260    5.459174
260                   F256    5.447944
4                      F45    5.377706
123                 F187.5    5.340965
235                 F243.5    5.339279
95                  F173.5    5.306186
266                   F259    5.305166
201                 F226.5    5.295300
259                 F255.5    5.276271
5                    F45.5    5.270753


In [None]:
joblib.dump(rf, 'rf_full.pkl')
joblib.dump(mlp, 'mlp_full.pkl')

### Alternate Models with no Body Measures

In [21]:
# Define columns to drop from predictors
drop_cols = ['fishNum', 'Spe', 'Index', 'Ping_time', 'totalLength', 'weight', 'airbladderTotalLength', 'sex']
# Create the feature matrix (drop non-predictive columns)
X = augmented_df.drop(columns=drop_cols, errors='ignore')
# The target variable is the Spe column
y = augmented_df['Spe']

In [22]:
# Define 5-Fold Cross-Validation with shuffling
kf = KFold(n_splits=5, shuffle=True, random_state=seed)

### Random Forest

In [23]:
# Initialize the RandomForestClassifier with a fixed random state
rf = RandomForestClassifier(n_estimators=100, random_state=seed)

In [24]:
# Compute the Cross-Validation Accuracy Scores:
cv_scores = cross_val_score(rf, X, y, cv=kf, scoring='accuracy')

print("Cross-Validation Accuracy Scores:", cv_scores)
print("Mean Cross-Validation Accuracy:", np.mean(cv_scores))

Cross-Validation Accuracy Scores: [0.8803125 0.8775    0.881875  0.8844375 0.8803125]
Mean Cross-Validation Accuracy: 0.8808875


In [25]:
# Fit the model on the entire dataset
rf.fit(X, y)

In [34]:
# Retrieve the feature importances from the model
feature_importances = rf.feature_importances_
features = X.columns
# Create a DataFrame for a more readable view of feature importances
importance_df = pd.DataFrame({'Feature': features, 'Importance': feature_importances})
importance_df = importance_df.sort_values(by='Importance', ascending=False)

print("\nTop 20 Features based on Random Forest Decision Importances:")
print(importance_df.head(20))


Top 20 Features based on Random Forest Decision Importances:
    Feature  Importance
1     F45.5    0.029995
2       F46    0.019482
231  F243.5    0.017584
60      F75    0.014937
184    F220    0.014637
183  F219.5    0.012848
0       F45    0.012819
61    F75.5    0.012587
197  F226.5    0.010409
62      F76    0.010200
224    F240    0.009384
181  F218.5    0.009341
167  F211.5    0.009222
89    F89.5    0.008661
182    F219    0.008424
3     F46.5    0.008135
263  F259.5    0.007991
239  F247.5    0.007754
31    F60.5    0.007721
32      F61    0.007597


### MLP

In [27]:
# Scale features; MLPs typically perform better with standardized data.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [28]:
# Define a 3-Layer MLP Model, with sizes 64, 32, and 16 respectively.
mlp = MLPClassifier(hidden_layer_sizes=(64, 32, 16), random_state = seed, max_iter=200)

In [29]:
# 5-Fold Cross-Validation for Accuracy
cv_scores = cross_val_score(mlp, X_scaled, y, cv=kf, scoring='accuracy')

print("5-Fold Cross-Validation Accuracy Scores (MLP):")
print(cv_scores)
print("Mean Accuracy:", np.mean(cv_scores))



5-Fold Cross-Validation Accuracy Scores (MLP):
[0.8628125 0.8613125 0.862375  0.859625  0.8590625]
Mean Accuracy: 0.8610374999999999


In [30]:
# Train the Final Model on the Entire Dataset
mlp.fit(X_scaled, y)



In [31]:
# Estimate Feature Importance from the MLP
input_weights = mlp.coefs_[0]  # Shape: (n_features, number of neurons in first hidden layer)
feature_importance = np.abs(input_weights).sum(axis=1)  # Feature importances from MLP is provided via the absolute sum of the weights from the input layer

# Create a DataFrame for easier viewing
importance_df = pd.DataFrame({
    'Feature': X.columns,
    'Importance': feature_importance
}).sort_values(by='Importance', ascending=False)

print("\nTop 20 Features based on MLP Input Weights:")
print(importance_df.head(20))


Top 20 Features based on MLP Input Weights:
   Feature  Importance
0      F45   26.355109
4      F47   21.694309
89   F89.5   21.453992
10     F50   19.650862
57   F73.5   19.583011
90    F173   19.492092
5    F47.5   19.386095
30     F60   19.302300
1    F45.5   19.300325
87   F88.5   19.260069
18     F54   19.079521
52     F71   19.079511
12     F51   19.065036
60     F75   18.920941
2      F46   18.912251
21   F55.5   18.892512
3    F46.5   18.675359
11   F50.5   18.584550
45   F67.5   18.492279
39   F64.5   18.352145


In [33]:
joblib.dump(rf, 'rf_reduced.pkl')
joblib.dump(mlp, 'mlp_reduced.pkl')

['mlp_reduced.pkl']